Rename YARP filepaths to prism filepaths

author: Kevin Newton <kddnewton@gmail.com> 2023-09-27 12:22:36 -0400
committer: Kevin Newton <kddnewton@gmail.com> 2023-09-27 13:57:38 -0400
commit: 8ab56869a64fdccc094f4a83c6367fb23b72d38b (patch)
tree: 46ef2bd5c51d5b7f923eda6a60edefc7a08200db /prism/unescape.c
parent: 7e0971eb5d679bb6219abb0ec238139aa6502c5a (diff)
download: ruby-8ab56869a64fdccc094f4a83c6367fb23b72d38b.tar.gz
1 files changed, 637 insertions, 0 deletions
diff --git a/prism/unescape.c b/prism/unescape.c
new file mode 100644
index 0000000000..6ecb8f49c4
--- /dev/null
+++ b/prism/unescape.c
@@ -0,0 +1,637 @@
+#include "yarp.h"
+
+/******************************************************************************/
+/* Character checks                                                           */
+/******************************************************************************/
+
+static inline bool
+yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
+    for (size_t index = 0; index < length; index++) {
+        if (!yp_char_is_hexadecimal_digit(string[index])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// We don't call the char_width function unless we have to because it's
+// expensive to go through the indirection of the function pointer. Instead we
+// provide a fast path that will check if we can just return 1.
+static inline size_t
+yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
+    if (parser->encoding_changed || (*start >= 0x80)) {
+        return parser->encoding.char_width(start, end - start);
+    } else {
+        return 1;
+    }
+}
+
+/******************************************************************************/
+/* Lookup tables for characters                                               */
+/******************************************************************************/
+
+// This is a lookup table for unescapes that only take up a single character.
+static const uint8_t unescape_chars[] = {
+    ['\''] = '\'',
+    ['\\'] = '\\',
+    ['a'] = '\a',
+    ['b'] = '\b',
+    ['e'] = '\033',
+    ['f'] = '\f',
+    ['n'] = '\n',
+    ['r'] = '\r',
+    ['s'] = ' ',
+    ['t'] = '\t',
+    ['v'] = '\v'
+};
+
+// This is a lookup table for whether or not an ASCII character is printable.
+static const bool ascii_printable_chars[] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
+};
+
+static inline bool
+char_is_ascii_printable(const uint8_t b) {
+    return (b < 0x80) && ascii_printable_chars[b];
+}
+
+/******************************************************************************/
+/* Unescaping for segments                                                    */
+/******************************************************************************/
+
+// Scan the 1-3 digits of octal into the value. Returns the number of digits
+// scanned.
+static inline size_t
+unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
+    *value = (uint8_t) (backslash[1] - '0');
+    if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
+        return 2;
+    }
+    *value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
+    if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
+        return 3;
+    }
+    *value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
+    return 4;
+}
+
+// Convert a hexadecimal digit into its equivalent value.
+static inline uint8_t
+unescape_hexadecimal_digit(const uint8_t value) {
+    return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
+}
+
+// Scan the 1-2 digits of hexadecimal into the value. Returns the number of
+// digits scanned.
+static inline size_t
+unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
+    *value = 0;
+    if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
+        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_HEXADECIMAL);
+        return 2;
+    }
+    *value = unescape_hexadecimal_digit(backslash[2]);
+    if (backslash + 3 >=  end || !yp_char_is_hexadecimal_digit(backslash[3])) {
+        return 3;
+    }
+    *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
+    return 4;
+}
+
+// Scan the 4 digits of a Unicode escape into the value. Returns the number of
+// digits scanned. This function assumes that the characters have already been
+// validated.
+static inline void
+unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
+    *value = 0;
+    for (size_t index = 0; index < length; index++) {
+        if (index != 0) *value <<= 4;
+        *value |= unescape_hexadecimal_digit(string[index]);
+    }
+}
+
+// Accepts the pointer to the string to write the unicode value along with the
+// 32-bit value to write. Writes the UTF-8 representation of the value to the
+// string and returns the number of bytes written.
+static inline size_t
+unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
+    if (value <= 0x7F) {
+        // 0xxxxxxx
+        dest[0] = (uint8_t) value;
+        return 1;
+    }
+
+    if (value <= 0x7FF) {
+        // 110xxxxx 10xxxxxx
+        dest[0] = (uint8_t) (0xC0 | (value >> 6));
+        dest[1] = (uint8_t) (0x80 | (value & 0x3F));
+        return 2;
+    }
+
+    if (value <= 0xFFFF) {
+        // 1110xxxx 10xxxxxx 10xxxxxx
+        dest[0] = (uint8_t) (0xE0 | (value >> 12));
+        dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
+        dest[2] = (uint8_t) (0x80 | (value & 0x3F));
+        return 3;
+    }
+
+    // At this point it must be a 4 digit UTF-8 representation. If it's not, then
+    // the input is invalid.
+    if (value <= 0x10FFFF) {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        dest[0] = (uint8_t) (0xF0 | (value >> 18));
+        dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
+        dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
+        dest[3] = (uint8_t) (0x80 | (value & 0x3F));
+        return 4;
+    }
+
+    // If we get here, then the value is too big. This is an error, but we don't
+    // want to just crash, so instead we'll add an error to the error list and put
+    // in a replacement character instead.
+    if (error_list) yp_diagnostic_list_append(error_list, start, end, YP_ERR_ESCAPE_INVALID_UNICODE);
+    dest[0] = 0xEF;
+    dest[1] = 0xBF;
+    dest[2] = 0xBD;
+    return 3;
+}
+
+typedef enum {
+    YP_UNESCAPE_FLAG_NONE = 0,
+    YP_UNESCAPE_FLAG_CONTROL = 1,
+    YP_UNESCAPE_FLAG_META = 2,
+    YP_UNESCAPE_FLAG_EXPECT_SINGLE = 4
+} yp_unescape_flag_t;
+
+// Unescape a single character value based on the given flags.
+static inline uint8_t
+unescape_char(uint8_t value, const uint8_t flags) {
+    if (flags & YP_UNESCAPE_FLAG_CONTROL) {
+        value &= 0x1f;
+    }
+
+    if (flags & YP_UNESCAPE_FLAG_META) {
+        value |= 0x80;
+    }
+
+    return value;
+}
+
+// Read a specific escape sequence into the given destination.
+static const uint8_t *
+unescape(
+    yp_parser_t *parser,
+    uint8_t *dest,
+    size_t *dest_length,
+    const uint8_t *backslash,
+    const uint8_t *end,
+    const uint8_t flags,
+    yp_list_t *error_list
+) {
+    switch (backslash[1]) {
+        case 'a':
+        case 'b':
+        case 'e':
+        case 'f':
+        case 'n':
+        case 'r':
+        case 's':
+        case 't':
+        case 'v':
+            if (dest) {
+                dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
+            }
+            return backslash + 2;
+        // \nnn         octal bit pattern, where nnn is 1-3 octal digits ([0-7])
+        case '0': case '1': case '2': case '3': case '4':
+        case '5': case '6': case '7': case '8': case '9': {
+            uint8_t value;
+            const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
+
+            if (dest) {
+                dest[(*dest_length)++] = unescape_char(value, flags);
+            }
+            return cursor;
+        }
+        // \xnn         hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
+        case 'x': {
+            uint8_t value;
+            const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);
+
+            if (dest) {
+                dest[(*dest_length)++] = unescape_char(value, flags);
+            }
+            return cursor;
+        }
+        // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
+        // \unnnn       Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
+        case 'u': {
+            if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS);
+                return backslash + 2;
+            }
+
+            if ((backslash + 3) < end && backslash[2] == '{') {
+                const uint8_t *unicode_cursor = backslash + 3;
+                const uint8_t *extra_codepoints_start = NULL;
+                int codepoints_count = 0;
+
+                unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
+
+                while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
+                    const uint8_t *unicode_start = unicode_cursor;
+                    size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
+
+                    // \u{nnnn} character literal allows only 1-6 hexadecimal digits
+                    if (hexadecimal_length > 6) {
+                        if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, YP_ERR_ESCAPE_INVALID_UNICODE_LONG);
+                    }
+                    // there are not hexadecimal characters
+                    else if (hexadecimal_length == 0) {
+                        if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, YP_ERR_ESCAPE_INVALID_UNICODE);
+                        return unicode_cursor;
+                    }
+
+                    unicode_cursor += hexadecimal_length;
+
+                    codepoints_count++;
+                    if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count == 2)
+                        extra_codepoints_start = unicode_start;
+
+                    uint32_t value;
+                    unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
+                    if (dest) {
+                        *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
+                    }
+
+                    unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
+                }
+
+                // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
+                if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
+                    if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, YP_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
+                }
+
+                if (unicode_cursor < end && *unicode_cursor == '}') {
+                    unicode_cursor++;
+                } else {
+                    if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, YP_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                }
+
+                return unicode_cursor;
+            }
+            else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
+                uint32_t value;
+                unescape_unicode(backslash + 2, 4, &value);
+
+                if (dest) {
+                    *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
+                }
+                return backslash + 6;
+            }
+
+            if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_UNICODE);
+            return backslash + 2;
+        }
+        // \c\M-x       meta control character, where x is an ASCII printable character
+        // \c?          delete, ASCII 7Fh (DEL)
+        // \cx          control character, where x is an ASCII printable character
+        case 'c':
+            if (backslash + 2 >= end) {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
+                return end;
+            }
+
+            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
+                return backslash + 2;
+            }
+
+            switch (backslash[2]) {
+                case '\\':
+                    return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
+                case '?':
+                    if (dest) {
+                        dest[(*dest_length)++] = unescape_char(0x7f, flags);
+                    }
+                    return backslash + 3;
+                default: {
+                    if (!char_is_ascii_printable(backslash[2])) {
+                        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
+                        return backslash + 2;
+                    }
+
+                    if (dest) {
+                        dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
+                    }
+                    return backslash + 3;
+                }
+            }
+        // \C-x         control character, where x is an ASCII printable character
+        // \C-?         delete, ASCII 7Fh (DEL)
+        case 'C':
+            if (backslash + 3 >= end) {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
+                return end;
+            }
+
+            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
+                return backslash + 2;
+            }
+
+            if (backslash[2] != '-') {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
+                return backslash + 2;
+            }
+
+            switch (backslash[3]) {
+                case '\\':
+                    return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
+                case '?':
+                    if (dest) {
+                        dest[(*dest_length)++] = unescape_char(0x7f, flags);
+                    }
+                    return backslash + 4;
+                default:
+                    if (!char_is_ascii_printable(backslash[3])) {
+                        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_CONTROL);
+                        return backslash + 2;
+                    }
+
+                    if (dest) {
+                        dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
+                    }
+                    return backslash + 4;
+            }
+        // \M-\C-x      meta control character, where x is an ASCII printable character
+        // \M-\cx       meta control character, where x is an ASCII printable character
+        // \M-x         meta character, where x is an ASCII printable character
+        case 'M': {
+            if (backslash + 3 >= end) {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_META);
+                return end;
+            }
+
+            if (flags & YP_UNESCAPE_FLAG_META) {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_META_REPEAT);
+                return backslash + 2;
+            }
+
+            if (backslash[2] != '-') {
+                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_META);
+                return backslash + 2;
+            }
+
+            if (backslash[3] == '\\') {
+                return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
+            }
+
+            if (char_is_ascii_printable(backslash[3])) {
+                if (dest) {
+                    dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
+                }
+                return backslash + 4;
+            }
+
+            if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_META);
+            return backslash + 3;
+        }
+        // \n
+        case '\n':
+            return backslash + 2;
+        // \r
+        case '\r':
+            if (backslash + 2 < end && backslash[2] == '\n') {
+                return backslash + 3;
+            }
+        /* fallthrough */
+        // In this case we're escaping something that doesn't need escaping.
+        default: {
+            size_t width = yp_char_width(parser, backslash + 1, end);
+
+            if (dest) {
+                memcpy(dest + *dest_length, backslash + 1, width);
+                *dest_length += width;
+            }
+
+            return backslash + 1 + width;
+        }
+    }
+}
+
+/******************************************************************************/
+/* Public functions and entrypoints                                           */
+/******************************************************************************/
+
+// Unescape the contents of the given token into the given string using the
+// given unescape mode. The supported escapes are:
+//
+// \a             bell, ASCII 07h (BEL)
+// \b             backspace, ASCII 08h (BS)
+// \t             horizontal tab, ASCII 09h (TAB)
+// \n             newline (line feed), ASCII 0Ah (LF)
+// \v             vertical tab, ASCII 0Bh (VT)
+// \f             form feed, ASCII 0Ch (FF)
+// \r             carriage return, ASCII 0Dh (CR)
+// \e             escape, ASCII 1Bh (ESC)
+// \s             space, ASCII 20h (SPC)
+// \\             backslash
+// \nnn           octal bit pattern, where nnn is 1-3 octal digits ([0-7])
+// \xnn           hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
+// \unnnn         Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
+// \u{nnnn ...}   Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
+// \cx or \C-x    control character, where x is an ASCII printable character
+// \M-x           meta character, where x is an ASCII printable character
+// \M-\C-x        meta control character, where x is an ASCII printable character
+// \M-\cx         same as above
+// \c\M-x         same as above
+// \c? or \C-?    delete, ASCII 7Fh (DEL)
+//
+static void
+yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
+    if (unescape_type == YP_UNESCAPE_NONE) {
+        // If we're not unescaping then we can reference the source directly.
+        return;
+    }
+
+    const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
+
+    if (backslash == NULL) {
+        // Here there are no escapes, so we can reference the source directly.
+        return;
+    }
+
+    // Here we have found an escape character, so we need to handle all escapes
+    // within the string.
+    uint8_t *allocated = malloc(string->length);
+    if (allocated == NULL) {
+        yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, YP_ERR_MALLOC_FAILED);
+        return;
+    }
+
+    // This is the memory address where we're putting the unescaped string.
+    uint8_t *dest = allocated;
+    size_t dest_length = 0;
+
+    // This is the current position in the source string that we're looking at.
+    // It's going to move along behind the backslash so that we can copy each
+    // segment of the string that doesn't contain an escape.
+    const uint8_t *cursor = string->source;
+    const uint8_t *end = string->source + string->length;
+
+    // For each escape found in the source string, we will handle it and update
+    // the moving cursor->backslash window.
+    while (backslash != NULL && backslash + 1 < end) {
+        assert(dest_length < string->length);
+
+        // This is the size of the segment of the string from the previous escape
+        // or the start of the string to the current escape.
+        size_t segment_size = (size_t) (backslash - cursor);
+
+        // Here we're going to copy everything up until the escape into the
+        // destination buffer.
+        memcpy(dest + dest_length, cursor, segment_size);
+        dest_length += segment_size;
+
+        switch (backslash[1]) {
+            case '\\':
+            case '\'':
+                dest[dest_length++] = unescape_chars[backslash[1]];
+                cursor = backslash + 2;
+                break;
+            default:
+                if (unescape_type == YP_UNESCAPE_WHITESPACE) {
+                    if (backslash[1] == '\r' && backslash[2] == '\n') {
+                        cursor = backslash + 2;
+                        break;
+                    }
+                    if (yp_strspn_whitespace(backslash + 1, 1)) {
+                        cursor = backslash + 1;
+                        break;
+                    }
+                }
+                if (unescape_type == YP_UNESCAPE_WHITESPACE || unescape_type == YP_UNESCAPE_MINIMAL) {
+                    // In this case we're escaping something that doesn't need escaping.
+                    dest[dest_length++] = '\\';
+                    cursor = backslash + 1;
+                    break;
+                }
+
+                // This is the only type of unescaping left. In this case we need to
+                // handle all of the different unescapes.
+                assert(unescape_type == YP_UNESCAPE_ALL);
+
+                uint8_t flags = YP_UNESCAPE_FLAG_NONE;
+                if (expect_single_codepoint) {
+                    flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
+                }
+
+                cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
+                break;
+        }
+
+        if (end > cursor) {
+            backslash = yp_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding);
+        } else {
+            backslash = NULL;
+        }
+    }
+
+    // We need to copy the final segment of the string after the last escape.
+    if (end > cursor) {
+        memcpy(dest + dest_length, cursor, (size_t) (end - cursor));
+    } else {
+        cursor = end;
+    }
+
+    // If the string was already allocated, then we need to free that memory
+    // here. That's because we're about to override it with the escaped string.
+    yp_string_free(string);
+
+    // We also need to update the length at the end. This is because every escape
+    // reduces the length of the final string, and we don't want garbage at the
+    // end.
+    yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
+}
+
+YP_EXPORTED_FUNCTION void
+yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
+    yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
+}
+
+void
+yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
+    yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
+}
+
+// This function is similar to yp_unescape_manipulate_string, except it doesn't
+// actually perform any string manipulations. Instead, it calculates how long
+// the unescaped character is, and returns that value
+size_t
+yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
+    assert(unescape_type != YP_UNESCAPE_NONE);
+
+    if (backslash + 1 >= parser->end) {
+        return 0;
+    }
+
+    switch (backslash[1]) {
+        case '\\':
+        case '\'':
+            return 2;
+        default: {
+            if (unescape_type == YP_UNESCAPE_WHITESPACE) {
+                if (backslash[1] == '\r' && backslash[2] == '\n') {
+                    return 2;
+                }
+                size_t whitespace = yp_strspn_whitespace(backslash + 1, 1);
+                if (whitespace > 0) {
+                    return whitespace;
+                }
+            }
+            if (unescape_type == YP_UNESCAPE_WHITESPACE || unescape_type == YP_UNESCAPE_MINIMAL) {
+                return 1 + yp_char_width(parser, backslash + 1, parser->end);
+            }
+
+            // This is the only type of unescaping left. In this case we need to
+            // handle all of the different unescapes.
+            assert(unescape_type == YP_UNESCAPE_ALL);
+
+            uint8_t flags = YP_UNESCAPE_FLAG_NONE;
+            if (expect_single_codepoint) {
+                flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
+            }
+
+            const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
+            assert(cursor > backslash);
+
+            return (size_t) (cursor - backslash);
+        }
+    }
+}
+
+// This is one of the main entry points into the extension. It accepts a source
+// string, a type of unescaping, and a pointer to a result string. It returns a
+// boolean indicating whether or not the unescaping was successful.
+YP_EXPORTED_FUNCTION bool
+yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
+    yp_parser_t parser;
+    yp_parser_init(&parser, start, length, NULL);
+
+    yp_string_shared_init(result, start, start + length);
+    yp_unescape_manipulate_string(&parser, result, unescape_type);
+
+    bool success = yp_list_empty_p(&parser.error_list);
+    yp_parser_free(&parser);
+
+    return success;
+}
author	Kevin Newton <kddnewton@gmail.com>	2023-09-27 12:22:36 -0400
committer	Kevin Newton <kddnewton@gmail.com>	2023-09-27 13:57:38 -0400
commit	8ab56869a64fdccc094f4a83c6367fb23b72d38b (patch)
tree	46ef2bd5c51d5b7f923eda6a60edefc7a08200db /prism/unescape.c
parent	7e0971eb5d679bb6219abb0ec238139aa6502c5a (diff)
download	ruby-8ab56869a64fdccc094f4a83c6367fb23b72d38b.tar.gz