Make strings aware of the complete set of Python escape sequences

Fixes #3169
7 years ago · 36aab0f4b2
parent 6089631a1b
commit 36aab0f4b2
11 changed files with 125 additions and 19 deletions
--- a/docs/markdown/Syntax.md
+++ b/docs/markdown/Syntax.md
@ -90,8 +90,24 @@ single quote do it like this:
 single quote = 'contains a \' character'
 ```

-Similarly `\n` gets converted to a newline and `\\` to a single
-backslash.
+The full list of escape sequences is:
+
+* `\\` Backslash
+* `\'` Single quote
+* `\a` Bell
+* `\b` Backspace
+* `\f` Formfeed
+* `\n` Newline
+* `\r` Carriage Return
+* `\t` Horizontal Tab
+* `\v` Vertical Tab
+* `\ooo` Character with octal value ooo
+* `\xhh` Character with hex value hh
+* `\uxxxx` Character with 16-bit hex value xxxx
+* `\Uxxxxxxxx` Character with 32-bit hex value xxxxxxxx
+* `\N{name}` Character named name in Unicode database
+
+As in Python and C, up to three octal digits are accepted in `\ooo`.

 #### String concatenation

--- a/mesonbuild/mparser.py
+++ b/mesonbuild/mparser.py
@ -13,9 +13,36 @@
 # limitations under the License.

 import re
+import codecs
 from .mesonlib import MesonException
 from . import mlog

+# This is the regex for the supported escape sequences of a regular string
+# literal, like 'abc\x00'. Each alternative matches exactly one escape
+# sequence, starting at its backslash. The hex alternatives use `.` for the
+# digit positions, so they also match non-hex characters; this pattern does
+# not validate them, which leaves that to whatever decodes the match.
+ESCAPE_SEQUENCE_SINGLE_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\'abfnrtv]   # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE)
+
+# This is the regex for the supported escape sequences of a multiline string
+# literal, like '''abc\x00'''. The only difference is that single quote (')
+# doesn't require escaping, so \' is not one of the single-character escapes
+# matched here. As with the single-line pattern, the hex alternatives use `.`
+# and therefore do not check that the digit positions hold hex digits.
+ESCAPE_SEQUENCE_MULTI_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\abfnrtv]    # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE)
+
+def decode_match(match):
+    return codecs.decode(match.group(0), 'unicode_escape')
+
 class ParseException(MesonException):
    def __init__(self, text, line, lineno, colno):
        # Format as error message, followed by the line with the error, followed by a caret to show the error column.
@ -112,7 +139,6 @@ class Lexer:
        par_count = 0
        bracket_count = 0
        col = 0
-        newline_rx = re.compile(r'(?<!\\)((?:\\\\)*)\\n')
        while loc < len(self.code):
            matched = False
            value = None
@ -145,12 +171,12 @@ class Lexer:
                        if match_text.find("\n") != -1:
                            mlog.warning("""Newline character in a string detected, use ''' (three single quotes) for multiline strings instead.
 This will become a hard error in a future Meson release.""", self.getline(line_start), lineno, col)
-                        value = match_text[1:-1].replace(r"\'", "'")
-                        value = newline_rx.sub(r'\1\n', value)
-                        value = value.replace(r" \\ ".strip(), r" \ ".strip())
+                        value = match_text[1:-1]
+                        value = ESCAPE_SEQUENCE_SINGLE_RE.sub(decode_match, value)
                    elif tid == 'multiline_string':
                        tid = 'string'
                        value = match_text[3:-3]
+                        value = ESCAPE_SEQUENCE_MULTI_RE.sub(decode_match, value)
                        lines = match_text.split('\n')
                        if len(lines) > 1:
                            lineno += len(lines) - 1
--- a/unicode/file.c.in
+++ b/unicode/file.c.in
@ -0,0 +1,5 @@
+#include<stdio.h>
+/* Prints one line and returns the fixed string "yes it does".
+ * This file is a ".in" template: "{NAME}" is a placeholder, presumably
+ * substituted by the generator script before the file is compiled. */
+const char* does_it_work() {
+    printf("{NAME}\n");
+    return "yes it does";
+}
--- a/cases/common/190
+++ b/cases/common/190
@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+
+with open(sys.argv[1]) as fh:
+    content = fh.read().replace("{NAME}", sys.argv[2])
+
+with open(os.path.join(sys.argv[3]), 'w') as fh:
+    fh.write(content)
--- a/cases/common/190
+++ b/cases/common/190
@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+
+for fh in os.listdir('.'):
+    if os.path.isfile(fh):
+        if fh.endswith('.c'):
+            sys.stdout.write(fh + '\0')
--- a/cases/common/190
+++ b/cases/common/190
@ -0,0 +1,3 @@
+/* Returns the constant 1. */
+int a_fun() {
+    return 1;
+}
--- a/cases/common/190
+++ b/cases/common/190
@ -0,0 +1,12 @@
+#include <string.h>
+
+const char* does_it_work();
+
+int a_fun();
+
+/* Exits with 0 when does_it_work() returns exactly "yes it does";
+ * otherwise exits with -a_fun(). */
+int main() {
+    if(strcmp(does_it_work(), "yes it does") != 0) {
+        return -a_fun();
+    }
+    return 0;
+}
--- a/unicode/meson.build
+++ b/unicode/meson.build
@ -0,0 +1,25 @@
+project('180 escape', 'c')
+
+# The second generator argument contains a \u escape, so the script receives
+# the decoded character rather than the literal backslash sequence.
+gen = generator(find_program('file.py'), arguments:['@INPUT@', 'erd\u0151', '@OUTPUT@'], output: '@BASENAME@')
+
+gen_file = gen.process('file.c.in')
+
+# The output of find.py is split on NUL bytes below to recover the file list.
+find_file_list = run_command(find_program('find.py'))
+assert(find_file_list.returncode() == 0, 'Didn\'t find any files.')
+
+# Strings should support both octal \ooo and hex \xhh encodings
+
+# Octal spelling of NUL: both the one-digit (\0) and three-digit (\000) forms.
+found_files_oct = []
+foreach l : find_file_list.stdout().strip('\0').split('\000')
+  found_files_oct += [files(l)]
+endforeach
+
+test('first', executable('first', found_files_oct + [gen_file]))
+
+# Hex spelling of NUL (\x00); builds a second target from the same output.
+found_files_hex = []
+foreach l : find_file_list.stdout().strip('\x00').split('\x00')
+  found_files_hex += [files(l)]
+endforeach
+
+test('second', executable('second', found_files_hex + [gen_file]))
+
--- a/compile/meson.build
+++ b/compile/meson.build
@ -1,11 +1,11 @@
 project('try compile', 'c', 'cpp')

 code = '''#include<stdio.h>
-void func() { printf("Something.\n"); }
+void func() { printf("Something.\\n"); }
 '''

 breakcode = '''#include<nonexisting.h>
-void func() { printf("This won't work.\n"); }
+void func() { printf("This won't work.\\n"); }
 '''

 foreach compiler : [meson.get_compiler('c'), meson.get_compiler('cpp')]
--- a/tryrun/meson.build
+++ b/tryrun/meson.build
@ -13,8 +13,8 @@ endif

 ok_code = '''#include<stdio.h>
 int main(int argc, char **argv) {
-  printf("%s\n", "stdout");
-  fprintf(stderr, "%s\n", "stderr");
+  printf("%s\\n", "stdout");
+  fprintf(stderr, "%s\\n", "stderr");
  return 0;
 }
 '''
--- a/operations/meson.build
+++ b/operations/meson.build
@ -77,21 +77,21 @@ assert('"1.1.20"'.strip('"') == '1.1.20', '" badly stripped')
 assert('"1.1.20"'.strip('".') == '1.1.20', '". badly stripped')
 assert('"1.1.20"   '.strip('" ') == '1.1.20', '". badly stripped')

-bs_b = '''\b'''
-bs_bs_b = '''\\b'''
+bs_c = '''\c'''
+bs_bs_c = '''\\\c'''
 nl = '''
 '''
-bs_n = '''\n'''
+bs_n = '''\\n'''
 bs_nl = '''\
 '''
-bs_bs_n = '''\\n'''
-bs_bs_nl = '''\\
+bs_bs_n = '''\\\\n'''
+bs_bs_nl = '''\\\\
 '''

-assert('\b' == bs_b, 'Single backslash broken')
-assert('\\b' == bs_b, 'Double backslash broken')
-assert('\\\b' == bs_bs_b, 'Three backslash broken')
-assert('\\\\b' == bs_bs_b, 'Four backslash broken')
+assert('\c' == bs_c, 'Single backslash broken')
+assert('\\c' == bs_c, 'Double backslash broken')
+assert('\\\c' == bs_bs_c, 'Three backslash broken')
+assert('\\\\c' == bs_bs_c, 'Four backslash broken')
 assert('\n' == nl, 'Newline escape broken')
 assert('\\n' == bs_n, 'Double backslash broken before n')
 assert('\\\n' == bs_nl, 'Three backslash broken before n')