From 36aab0f4b2a05a1ae4c8603e57b4c3c684fe8ea8 Mon Sep 17 00:00:00 2001
From: Tim 'mithro' Ansell <me@mith.ro>
Date: Sat, 3 Mar 2018 15:00:55 -0800
Subject: [PATCH] Complete python escape sequences aware strings

Fixes #3169
---
 docs/markdown/Syntax.md                       | 20 +++++++++--
 mesonbuild/mparser.py                         | 34 ++++++++++++++++---
 .../common/190 escape and unicode/file.c.in   |  5 +++
 .../common/190 escape and unicode/file.py     | 10 ++++++
 .../common/190 escape and unicode/find.py     |  9 +++++
 .../common/190 escape and unicode/fun.c       |  3 ++
 .../common/190 escape and unicode/main.c      | 12 +++++++
 .../common/190 escape and unicode/meson.build | 25 ++++++++++++++
 test cases/common/33 try compile/meson.build  |  4 +--
 test cases/common/39 tryrun/meson.build       |  4 +--
 .../common/42 string operations/meson.build   | 18 +++++-----
 11 files changed, 125 insertions(+), 19 deletions(-)
 create mode 100644 test cases/common/190 escape and unicode/file.c.in
 create mode 100644 test cases/common/190 escape and unicode/file.py
 create mode 100644 test cases/common/190 escape and unicode/find.py
 create mode 100644 test cases/common/190 escape and unicode/fun.c
 create mode 100644 test cases/common/190 escape and unicode/main.c
 create mode 100644 test cases/common/190 escape and unicode/meson.build

diff --git a/docs/markdown/Syntax.md b/docs/markdown/Syntax.md
index 1005100fe..01c8c6e95 100644
--- a/docs/markdown/Syntax.md
+++ b/docs/markdown/Syntax.md
@@ -90,8 +90,24 @@ single quote do it like this:
 single quote = 'contains a \' character'
 ```
 
-Similarly `\n` gets converted to a newline and `\\` to a single
-backslash.
+The full list of escape sequences is:
+
+* `\\` Backslash
+* `\'` Single quote
+* `\a` Bell
+* `\b` Backspace
+* `\f` Formfeed
+* `\n` Newline
+* `\r` Carriage Return
+* `\t` Horizontal Tab
+* `\v` Vertical Tab
+* `\ooo` Character with octal value ooo
+* `\xhh` Character with hex value hh
+* `\uxxxx` Character with 16-bit hex value xxxx
+* `\Uxxxxxxxx` Character with 32-bit hex value xxxxxxxx
+* `\N{name}` Character named name in Unicode database
+
+As in Python and C, up to three octal digits are accepted in `\ooo`.
 
 #### String concatenation
 
diff --git a/mesonbuild/mparser.py b/mesonbuild/mparser.py
index 0e7524c3c..bf7c271d0 100644
--- a/mesonbuild/mparser.py
+++ b/mesonbuild/mparser.py
@@ -13,9 +13,48 @@
 # limitations under the License.
 
 import re
+import codecs
 from .mesonlib import MesonException
 from . import mlog
 
+# This is the regex for the supported escape sequences of a regular string
+# literal, like 'abc\x00'
+ESCAPE_SEQUENCE_SINGLE_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\'abfnrtv]   # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE)
+
+# This is the regex for the supported escape sequences of a multiline string
+# literal, like '''abc\x00'''. The only difference is that single quote (')
+# doesn't require escaping.
+ESCAPE_SEQUENCE_MULTI_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\abfnrtv]    # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE)
+
+def decode_match(match):
+    """Decode the single escape sequence captured by *match*.
+
+    Used as the replacement callback for the ESCAPE_SEQUENCE_*_RE patterns.
+    Those patterns accept any characters after the hex and Unicode prefixes
+    and between the braces of a named escape, so the matched text can still
+    be undecodable; that is reported as a MesonException instead of letting
+    a bare UnicodeDecodeError escape from the lexer.
+    """
+    sequence = match.group(0)
+    try:
+        return codecs.decode(sequence, 'unicode_escape')
+    except UnicodeDecodeError as e:
+        raise MesonException("Invalid escape sequence '{}': {}".format(sequence, e.reason)) from e
+
 class ParseException(MesonException):
     def __init__(self, text, line, lineno, colno):
         # Format as error message, followed by the line with the error, followed by a caret to show the error column.
@@ -112,7 +151,6 @@ class Lexer:
         par_count = 0
         bracket_count = 0
         col = 0
-        newline_rx = re.compile(r'(?<!\\)((?:\\\\)*)\\n')
         while loc < len(self.code):
             matched = False
             value = None
@@ -145,12 +183,12 @@ class Lexer:
                         if match_text.find("\n") != -1:
                             mlog.warning("""Newline character in a string detected, use ''' (three single quotes) for multiline strings instead.
 This will become a hard error in a future Meson release.""", self.getline(line_start), lineno, col)
-                        value = match_text[1:-1].replace(r"\'", "'")
-                        value = newline_rx.sub(r'\1\n', value)
-                        value = value.replace(r" \\ ".strip(), r" \ ".strip())
+                        value = match_text[1:-1]
+                        value = ESCAPE_SEQUENCE_SINGLE_RE.sub(decode_match, value)
                     elif tid == 'multiline_string':
                         tid = 'string'
                         value = match_text[3:-3]
+                        value = ESCAPE_SEQUENCE_MULTI_RE.sub(decode_match, value)
                         lines = match_text.split('\n')
                         if len(lines) > 1:
                             lineno += len(lines) - 1
diff --git a/test cases/common/190 escape and unicode/file.c.in b/test cases/common/190 escape and unicode/file.c.in
new file mode 100644
index 000000000..413ed4297
--- /dev/null
+++ b/test cases/common/190 escape and unicode/file.c.in	
@@ -0,0 +1,6 @@
+#include<stdio.h>
+/* Prints the name substituted in by file.py, then returns the string main.c compares against. */
+const char* does_it_work(void) {
+    printf("{NAME}\n");
+    return "yes it does";
+}
diff --git a/test cases/common/190 escape and unicode/file.py b/test cases/common/190 escape and unicode/file.py
new file mode 100644
index 000000000..af67a0950
--- /dev/null
+++ b/test cases/common/190 escape and unicode/file.py	
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+"""Copy argv[1] to argv[3], replacing each {NAME} placeholder with argv[2]."""
+
+import sys
+
+# The replacement text can be non-ASCII, so pin the encoding instead of
+# relying on the locale default, which may be unable to represent it.
+with open(sys.argv[1], encoding='utf-8') as fh:
+    content = fh.read().replace("{NAME}", sys.argv[2])
+
+with open(sys.argv[3], 'w', encoding='utf-8') as fh:
+    fh.write(content)
diff --git a/test cases/common/190 escape and unicode/find.py b/test cases/common/190 escape and unicode/find.py
new file mode 100644
index 000000000..34a3eb835
--- /dev/null
+++ b/test cases/common/190 escape and unicode/find.py	
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+"""Write the name of every file in the current directory ending in .c, each followed by a NUL."""
+
+import os
+import sys
+
+for entry in os.listdir('.'):
+    if entry.endswith('.c') and os.path.isfile(entry):
+        # NUL after each name gives the reader an unambiguous separator
+        sys.stdout.write(entry + '\0')
diff --git a/test cases/common/190 escape and unicode/fun.c b/test cases/common/190 escape and unicode/fun.c
new file mode 100644
index 000000000..8eeb8ea00
--- /dev/null
+++ b/test cases/common/190 escape and unicode/fun.c	
@@ -0,0 +1,4 @@
+/* Always returns 1. */
+int a_fun(void) {
+    return 1;
+}
diff --git a/test cases/common/190 escape and unicode/main.c b/test cases/common/190 escape and unicode/main.c
new file mode 100644
index 000000000..0bcde169c
--- /dev/null
+++ b/test cases/common/190 escape and unicode/main.c	
@@ -0,0 +1,14 @@
+#include <string.h>
+
+const char* does_it_work(void);
+
+int a_fun(void);
+
+/* Returns 0 when does_it_work() yields the expected string, otherwise the
+ * negated result of a_fun(). */
+int main(void) {
+    if(strcmp(does_it_work(), "yes it does") != 0) {
+        return -a_fun();
+    }
+    return 0;
+}
diff --git a/test cases/common/190 escape and unicode/meson.build b/test cases/common/190 escape and unicode/meson.build
new file mode 100644
index 000000000..be1107326
--- /dev/null
+++ b/test cases/common/190 escape and unicode/meson.build	
@@ -0,0 +1,26 @@
+project('190 escape and unicode', 'c')
+
+# 'erd\u0151' exercises the \uxxxx escape; file.py substitutes it into file.c.in
+gen = generator(find_program('file.py'), arguments:['@INPUT@', 'erd\u0151', '@OUTPUT@'], output: '@BASENAME@')
+
+gen_file = gen.process('file.c.in')
+
+find_file_list = run_command(find_program('find.py'))
+assert(find_file_list.returncode() == 0, 'Didn\'t find any files.')
+
+# Strings should support both octal \ooo and hex \xhh encodings
+
+found_files_oct = []
+foreach l : find_file_list.stdout().strip('\0').split('\000')
+  found_files_oct += [files(l)]
+endforeach
+
+test('first', executable('first', found_files_oct + [gen_file]))
+
+found_files_hex = []
+foreach l : find_file_list.stdout().strip('\x00').split('\x00')
+  found_files_hex += [files(l)]
+endforeach
+
+test('second', executable('second', found_files_hex + [gen_file]))
+
diff --git a/test cases/common/33 try compile/meson.build b/test cases/common/33 try compile/meson.build
index 09ca395be..cb1037d30 100644
--- a/test cases/common/33 try compile/meson.build	
+++ b/test cases/common/33 try compile/meson.build	
@@ -1,11 +1,11 @@
 project('try compile', 'c', 'cpp')
 
 code = '''#include<stdio.h>
-void func() { printf("Something.\n"); }
+void func() { printf("Something.\\n"); }
 '''
 
 breakcode = '''#include<nonexisting.h>
-void func() { printf("This won't work.\n"); }
+void func() { printf("This won't work.\\n"); }
 '''
 
 foreach compiler : [meson.get_compiler('c'), meson.get_compiler('cpp')]
diff --git a/test cases/common/39 tryrun/meson.build b/test cases/common/39 tryrun/meson.build
index c64446ff0..daf5be798 100644
--- a/test cases/common/39 tryrun/meson.build	
+++ b/test cases/common/39 tryrun/meson.build	
@@ -13,8 +13,8 @@ endif
 
 ok_code = '''#include<stdio.h>
 int main(int argc, char **argv) {
-  printf("%s\n", "stdout");
-  fprintf(stderr, "%s\n", "stderr");
+  printf("%s\\n", "stdout");
+  fprintf(stderr, "%s\\n", "stderr");
   return 0;
 }
 '''
diff --git a/test cases/common/42 string operations/meson.build b/test cases/common/42 string operations/meson.build
index a43de707e..1c289eb1b 100644
--- a/test cases/common/42 string operations/meson.build	
+++ b/test cases/common/42 string operations/meson.build	
@@ -77,21 +77,21 @@ assert('"1.1.20"'.strip('"') == '1.1.20', '" badly stripped')
 assert('"1.1.20"'.strip('".') == '1.1.20', '". badly stripped')
 assert('"1.1.20"   '.strip('" ') == '1.1.20', '". badly stripped')
 
-bs_b = '''\b'''
-bs_bs_b = '''\\b'''
+bs_c = '''\c'''
+bs_bs_c = '''\\\c'''
 nl = '''
 '''
-bs_n = '''\n'''
+bs_n = '''\\n'''
 bs_nl = '''\
 '''
-bs_bs_n = '''\\n'''
-bs_bs_nl = '''\\
+bs_bs_n = '''\\\\n'''
+bs_bs_nl = '''\\\\
 '''
 
-assert('\b' == bs_b, 'Single backslash broken')
-assert('\\b' == bs_b, 'Double backslash broken')
-assert('\\\b' == bs_bs_b, 'Three backslash broken')
-assert('\\\\b' == bs_bs_b, 'Four backslash broken')
+assert('\c' == bs_c, 'Single backslash broken')
+assert('\\c' == bs_c, 'Double backslash broken')
+assert('\\\c' == bs_bs_c, 'Three backslash broken')
+assert('\\\\c' == bs_bs_c, 'Four backslash broken')
 assert('\n' == nl, 'Newline escape broken')
 assert('\\n' == bs_n, 'Double backslash broken before n')
 assert('\\\n' == bs_nl, 'Three backslash broken before n')