diff options
Diffstat (limited to 'vere/ext/nasm/asm/quote.c')
-rw-r--r-- | vere/ext/nasm/asm/quote.c | 563 |
1 files changed, 563 insertions, 0 deletions
diff --git a/vere/ext/nasm/asm/quote.c b/vere/ext/nasm/asm/quote.c new file mode 100644 index 0000000..074f17d --- /dev/null +++ b/vere/ext/nasm/asm/quote.c @@ -0,0 +1,563 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 1996-2020 The NASM Authors - All Rights Reserved + * See the file AUTHORS included with the NASM distribution for + * the specific copyright holders. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following + * conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * ----------------------------------------------------------------------- */ + +/* + * quote.c + */ + +#include "compiler.h" +#include "nasmlib.h" +#include "quote.h" +#include "nctype.h" +#include "error.h" + +/* + * Create a NASM quoted string in newly allocated memory. Update the + * *lenp parameter with the output length (sans final NUL). + */ + +char *nasm_quote(const char *str, size_t *lenp) +{ + const char *p, *ep; + char c, c1, *q, *nstr; + unsigned char uc; + bool sq_ok, dq_ok; + size_t qlen; + size_t len = *lenp; + + sq_ok = dq_ok = true; + ep = str+len; + qlen = 0; /* Length if we need `...` quotes */ + for (p = str; p < ep; p++) { + c = *p; + switch (c) { + case '\'': + sq_ok = false; + qlen++; + break; + case '\"': + dq_ok = false; + qlen++; + break; + case '`': + case '\\': + qlen += 2; + break; + default: + if (c < ' ' || c > '~') { + sq_ok = dq_ok = false; + switch (c) { + case '\a': + case '\b': + case '\t': + case '\n': + case '\v': + case '\f': + case '\r': + case 27: + qlen += 2; + break; + default: + c1 = (p+1 < ep) ? p[1] : 0; + if (c1 >= '0' && c1 <= '7') + uc = 0377; /* Must use the full form */ + else + uc = c; + if (uc > 077) + qlen++; + if (uc > 07) + qlen++; + qlen += 2; + break; + } + } else { + qlen++; + } + break; + } + } + + if (sq_ok || dq_ok) { + /* Use '...' or "..." */ + nstr = nasm_malloc(len+3); + nstr[0] = nstr[len+1] = sq_ok ? '\'' : '\"'; + q = &nstr[len+2]; + if (len > 0) + memcpy(nstr+1, str, len); + } else { + /* Need to use `...` quoted syntax */ + nstr = nasm_malloc(qlen+3); + q = nstr; + *q++ = '`'; + for (p = str; p < ep; p++) { + c = *p; + switch (c) { + case '`': + case '\\': + *q++ = '\\'; + *q++ = c; + break; + case 7: + *q++ = '\\'; + *q++ = 'a'; + break; + case 8: + *q++ = '\\'; + *q++ = 'b'; + break; + case 9: + *q++ = '\\'; + *q++ = 't'; + break; + case 10: + *q++ = '\\'; + *q++ = 'n'; + break; + case 11: + *q++ = '\\'; + *q++ = 'v'; + break; + case 12: + *q++ = '\\'; + *q++ = 'f'; + break; + case 13: + *q++ = '\\'; + *q++ = 'r'; + break; + case 27: + *q++ = '\\'; + *q++ = 'e'; + break; + default: + if (c < ' ' || c > '~') { + c1 = (p+1 < ep) ? p[1] : 0; + if (c1 >= '0' && c1 <= '7') + uc = 0377; /* Must use the full form */ + else + uc = c; + *q++ = '\\'; + if (uc > 077) + *q++ = ((unsigned char)c >> 6) + '0'; + if (uc > 07) + *q++ = (((unsigned char)c >> 3) & 7) + '0'; + *q++ = ((unsigned char)c & 7) + '0'; + break; + } else { + *q++ = c; + } + break; + } + } + *q++ = '`'; + nasm_assert((size_t)(q-nstr) == qlen+2); + } + *q = '\0'; + *lenp = q - nstr; + return nstr; +} + +static unsigned char *emit_utf8(unsigned char *q, uint32_t v) +{ + uint32_t vb1, vb2, vb3, vb4, vb5; + + if (v <= 0x7f) { + *q++ = v; + goto out0; + } + + vb1 = v >> 6; + if (vb1 <= 0x1f) { + *q++ = 0xc0 + vb1; + goto out1; + } + + vb2 = vb1 >> 6; + if (vb2 <= 0x0f) { + *q++ = 0xe0 + vb2; + goto out2; + } + + vb3 = vb2 >> 6; + if (vb3 <= 0x07) { + *q++ = 0xf0 + vb3; + goto out3; + } + + vb4 = vb3 >> 6; + if (vb4 <= 0x03) { + *q++ = 0xf8 + vb4; + goto out4; + } + + /* + * Note: this is invalid even for "classic" (pre-UTF16) 31-bit + * UTF-8 if the value is >= 0x8000000. This at least tries to do + * something vaguely sensible with it. Caveat programmer. + * The __utf*__ string transform functions do reject these + * as invalid input. + * + * vb5 cannot be more than 3, as a 32-bit value has been shifted + * right by 5*6 = 30 bits already. + */ + vb5 = vb4 >> 6; + *q++ = 0xfc + vb5; + goto out5; + + /* Emit extension bytes as appropriate */ +out5: *q++ = 0x80 + (vb4 & 63); +out4: *q++ = 0x80 + (vb3 & 63); +out3: *q++ = 0x80 + (vb2 & 63); +out2: *q++ = 0x80 + (vb1 & 63); +out1: *q++ = 0x80 + (v & 63); +out0: return q; +} + +static inline uint32_t ctlbit(uint32_t v) +{ + return unlikely(v < 32) ? UINT32_C(1) << v : 0; +} + +#define CTL_ERR(c) \ + (badctl & (ctlmask |= ctlbit(c))) + +#define EMIT_UTF8(c) \ + do { \ + uint32_t ec = (c); \ + if (!CTL_ERR(ec)) \ + q = emit_utf8(q, ec); \ + } while (0) + +#define EMIT(c) \ + do { \ + unsigned char ec = (c); \ + if (!CTL_ERR(ec)) \ + *q++ = ec; \ + } while (0) + +/* + * Same as nasm_quote, but take the length of a C string; + * the lenp argument is optional. + */ +char *nasm_quote_cstr(const char *str, size_t *lenp) +{ + size_t len = strlen(str); + char *qstr = nasm_quote(str, &len); + if (lenp) + *lenp = len; + return qstr; +} + +/* + * Do an *in-place* dequoting of the specified string, returning the + * resulting length (which may be containing embedded nulls.) + * + * In-place replacement is possible since the unquoted length is always + * shorter than or equal to the quoted length. + * + * *ep points to the final quote, or to the null if improperly quoted. + * + * Issue an error if the string contains control characters + * corresponding to bits set in badctl; in that case, the output + * string, but not *ep, is truncated before the first invalid + * character. + * + * badctl is a bitmask of control characters (0-31) which are forbidden + * from appearing in the final output. + * + * The qstart character can be either '`' (NASM style) or '\"' (C style), + * to indicate the lead marker of a quoted string. If it is '\"', then + * '`' is not a special character at all. + */ + +size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl, + const char qstart) +{ + unsigned char bq; + const unsigned char *p; + const unsigned char *escp = NULL; + unsigned char *q; + unsigned char c; + uint32_t ctlmask = 0; /* Mask of control characters seen */ + enum unq_state { + st_start, + st_backslash, + st_hex, + st_oct, + st_ucs, + st_done + } state; + int ndig = 0; + uint32_t nval = 0; + + p = q = (unsigned char *)str; + + bq = *p++; + if (!bq) + return 0; + + if (bq == (unsigned char)qstart) { + /* `...` string */ + state = st_start; + + while (state != st_done) { + c = *p++; + switch (state) { + case st_start: + if (c == '\\') { + state = st_backslash; + } else if ((c == '\0') | (c == bq)) { + state = st_done; + } else { + EMIT(c); + } + break; + + case st_backslash: + state = st_start; + escp = p; /* Beginning of argument sequence */ + nval = 0; + switch (c) { + case 'a': + nval = 7; + break; + case 'b': + nval = 8; + break; + case 'e': + nval = 27; + break; + case 'f': + nval = 12; + break; + case 'n': + nval = 10; + break; + case 'r': + nval = 13; + break; + case 't': + nval = 9; + break; + case 'u': + state = st_ucs; + ndig = 4; + break; + case 'U': + state = st_ucs; + ndig = 8; + break; + case 'v': + nval = 11; + break; + case 'x': + case 'X': + state = st_hex; + ndig = 2; + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + state = st_oct; + ndig = 2; /* Up to two more digits */ + nval = c - '0'; + break; + case '\0': + nval = '\\'; + p--; /* Reprocess; terminates string */ + break; + default: + nval = c; + break; + } + if (state == st_start) + EMIT(nval); + break; + + case st_oct: + if (c >= '0' && c <= '7') { + nval = (nval << 3) + (c - '0'); + if (--ndig) + break; /* Might have more digits */ + } else { + p--; /* Process this character again */ + } + EMIT(nval); + state = st_start; + break; + + case st_hex: + case st_ucs: + if (nasm_isxdigit(c)) { + nval = (nval << 4) + numvalue(c); + if (--ndig) + break; /* Might have more digits */ + } else { + p--; /* Process this character again */ + } + + if (unlikely(p <= escp)) + EMIT(escp[-1]); + else if (state == st_ucs) + EMIT_UTF8(nval); + else + EMIT(nval); + + state = st_start; + break; + + default: + panic(); + } + } + } else if (bq == '\'' || bq == '\"') { + /* + * '...' or "..." string, NASM legacy style (no escapes of + * * any kind, including collapsing double quote marks.) + * We obviously can't get here if qstart == '\"'. + */ + while ((c = *p++) && (c != bq)) + EMIT(c); + } else { + /* Not a quoted string, just return the input... */ + while ((c = *p++)) + EMIT(c); + } + + /* Zero-terminate the output */ + *q = '\0'; + + if (ctlmask & badctl) + nasm_nonfatal("control character in string not allowed here"); + + if (ep) + *ep = (char *)p - 1; + return (char *)q - str; +} +#undef EMIT + +/* + * Unquote any arbitrary string; may produce any bytes, including embedded + * control- and NUL characters. + */ +size_t nasm_unquote(char *str, char **ep) +{ + return nasm_unquote_anystr(str, ep, 0, STR_NASM); +} + +/* + * Unquote a string intended to be used as a C string; most control + * characters are rejected, including whitespace characters that + * would imply line endings and so on. + */ +size_t nasm_unquote_cstr(char *str, char **ep) +{ + return nasm_unquote_anystr(str, ep, BADCTL, STR_NASM); +} + +/* + * Find the end of a quoted string; returns the pointer to the terminating + * character (either the ending quote or the null character, if unterminated.) + * If the input is not a quoted string, return NULL. + * This applies to NASM style strings only. + */ +char *nasm_skip_string(const char *str) +{ + char bq; + const char *p; + char c; + enum unq_state { + st_start, + st_backslash, + st_done + } state; + + bq = str[0]; + p = str+1; + switch (bq) { + case '\'': + case '\"': + /* '...' or "..." string */ + while ((c = *p++) && (c != bq)) + ; + break; + + case '`': + /* `...` string */ + state = st_start; + while (state != st_done) { + c = *p++; + switch (state) { + case st_start: + switch (c) { + case '\\': + state = st_backslash; + break; + case '`': + case '\0': + state = st_done; + break; + default: + break; + } + break; + + case st_backslash: + /* + * Note: for the purpose of finding the end of the string, + * all successor states to st_backslash are functionally + * equivalent to st_start, since either a backslash or + * a backquote will force a return to the st_start state, + * and any possible multi-character state will terminate + * for any non-alphanumeric character. + */ + state = c ? st_start : st_done; + break; + + default: + panic(); + } + } + break; + + default: + /* Not a string at all... */ + return NULL; + } + return (char *)p - 1; +} |