ares_strsplit() rewrite as wrapper around ares__buf_split()

We want to limit as much as possible any hand written parsers.
ares__buf_split() uses the new memory-safe parsing routines.  This
change adds two flags to it so that duplicates are removed, matching
what the existing split code did.

Fix By: Brad House (@bradh352)
pull/673/head
Brad House 1 year ago
parent c6708cffb8
commit 88c444d495
  1. 62
      src/lib/ares__buf.c
  2. 6
      src/lib/ares__buf.h
  3. 26
      src/lib/ares__htable.c
  4. 42
      src/lib/ares_str.c
  5. 4
      src/lib/ares_str.h
  6. 101
      src/lib/ares_strsplit.c
  7. 2
      src/lib/ares_strsplit.h

@ -793,6 +793,34 @@ static void ares__buf_destroy_cb(void *arg)
ares__buf_destroy(arg); ares__buf_destroy(arg);
} }
/*! Check whether a candidate section is already present in the split list.
 *
 *  Walks every buffer stored in the list and compares the bytes returned by
 *  ares__buf_peek() for that entry against the candidate.
 *
 *  \param[in] list   List whose node values are ares__buf_t entries.
 *  \param[in] val    Candidate bytes.
 *  \param[in] len    Number of bytes in val.
 *  \param[in] flags  Split flags; only ARES_BUF_SPLIT_CASE_INSENSITIVE is
 *                    examined here.
 *  \return ARES_TRUE if an entry of identical length and matching content
 *          exists, ARES_FALSE otherwise.
 */
static ares_bool_t ares__buf_split_isduplicate(ares__llist_t       *list,
                                               const unsigned char *val,
                                               size_t               len,
                                               ares__buf_split_t    flags)
{
  ares__llist_node_t *node;

  for (node = ares__llist_node_first(list); node != NULL;
       node = ares__llist_node_next(node)) {
    ares__buf_t         *buf  = ares__llist_node_val(node);
    size_t               plen = 0;
    const unsigned char *ptr  = ares__buf_peek(buf, &plen);

    /* Can't be duplicate if lengths mismatch */
    if (plen != len) {
      continue;
    }

    /* Two zero-length sections are trivially equal, so answer before
     * touching either pointer.  memcmp() requires valid pointers even for a
     * length of 0.
     * NOTE(review): ares__buf_peek() is not visible here; presumably it can
     * return NULL for an empty buffer -- confirm. */
    if (len == 0) {
      return ARES_TRUE;
    }

    if (flags & ARES_BUF_SPLIT_CASE_INSENSITIVE) {
      if (ares__memeq_ci(ptr, val, len)) {
        return ARES_TRUE;
      }
    } else {
      if (memcmp(ptr, val, len) == 0) {
        return ARES_TRUE;
      }
    }
  }

  return ARES_FALSE;
}
ares_status_t ares__buf_split(ares__buf_t *buf, const unsigned char *delims, ares_status_t ares__buf_split(ares__buf_t *buf, const unsigned char *delims,
size_t delims_len, ares__buf_split_t flags, size_t delims_len, ares__buf_split_t flags,
ares__llist_t **list) ares__llist_t **list)
@ -826,23 +854,27 @@ ares_status_t ares__buf_split(ares__buf_t *buf, const unsigned char *delims,
const unsigned char *ptr = ares__buf_tag_fetch(buf, &len); const unsigned char *ptr = ares__buf_tag_fetch(buf, &len);
ares__buf_t *data; ares__buf_t *data;
/* Since we don't allow const buffers of 0 length, and user wants 0-length if (!(flags & ARES_BUF_SPLIT_NO_DUPLICATES) ||
* buffers, swap what we do here */ !ares__buf_split_isduplicate(*list, ptr, len, flags)) {
if (len) {
data = ares__buf_create_const(ptr, len);
} else {
data = ares__buf_create();
}
if (data == NULL) { /* Since we don't allow const buffers of 0 length, and user wants
status = ARES_ENOMEM; * 0-length buffers, swap what we do here */
goto done; if (len) {
} data = ares__buf_create_const(ptr, len);
} else {
data = ares__buf_create();
}
if (ares__llist_insert_last(*list, data) == NULL) { if (data == NULL) {
ares__buf_destroy(data); status = ARES_ENOMEM;
status = ARES_ENOMEM; goto done;
goto done; }
if (ares__llist_insert_last(*list, data) == NULL) {
ares__buf_destroy(data);
status = ARES_ENOMEM;
goto done;
}
} }
} }

@ -420,7 +420,11 @@ typedef enum {
/*! Allow blank sections, by default blank sections are not emitted. If using /*! Allow blank sections, by default blank sections are not emitted. If using
* ARES_BUF_SPLIT_DONT_CONSUME_DELIMS, the delimiter is not counted as part * ARES_BUF_SPLIT_DONT_CONSUME_DELIMS, the delimiter is not counted as part
* of the section */ * of the section */
ARES_BUF_SPLIT_ALLOW_BLANK = 1 << 1 ARES_BUF_SPLIT_ALLOW_BLANK = 1 << 1,
/*! Remove duplicate entries */
ARES_BUF_SPLIT_NO_DUPLICATES = 1 << 2,
/*! Perform case-insensitive matching when comparing values */
ARES_BUF_SPLIT_CASE_INSENSITIVE = 1 << 3
} ares__buf_split_t; } ares__buf_split_t;
/*! Split the provided buffer into multiple sub-buffers stored in the variable /*! Split the provided buffer into multiple sub-buffers stored in the variable

@ -399,30 +399,6 @@ unsigned int ares__htable_hash_FNV1a(const unsigned char *key, size_t key_len,
return hv; return hv;
} }
/* ASCII-only lowercase mapping, indexed by byte value.  The C library's
 * tolower() depends on the current locale, so a fixed table is used instead:
 * 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A) and every other byte
 * maps to itself.  Each row below holds 16 entries. */
static const unsigned char ares__tolower_lookup[] = {
  /* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
             0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  /* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
             0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
  /* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
             0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
  /* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
             0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
  /* 0x40: '@' then 'A'..'O' folded to 'a'..'o' */
             0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
             0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
  /* 0x50: 'P'..'Z' folded to 'p'..'z', then '[' .. '_' unchanged */
             0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
             0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
  /* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
             0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
  /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
             0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
  /* 0x80 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
             0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
  /* 0x90 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
             0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
  /* 0xA0 */ 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
             0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
  /* 0xB0 */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
             0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
  /* 0xC0 */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
             0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
  /* 0xD0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
             0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
  /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
             0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
  /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
             0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
};
/* Case insensitive version, meant for ASCII strings */ /* Case insensitive version, meant for ASCII strings */
unsigned int ares__htable_hash_FNV1a_casecmp(const unsigned char *key, unsigned int ares__htable_hash_FNV1a_casecmp(const unsigned char *key,
@ -433,7 +409,7 @@ unsigned int ares__htable_hash_FNV1a_casecmp(const unsigned char *key,
size_t i; size_t i;
for (i = 0; i < key_len; i++) { for (i = 0; i < key_len; i++) {
hv ^= (unsigned int)ares__tolower_lookup[key[i]]; hv ^= (unsigned int)ares__tolower(key[i]);
/* hv *= 0x01000193 */ /* hv *= 0x01000193 */
hv += (hv << 1) + (hv << 4) + (hv << 7) + (hv << 8) + (hv << 24); hv += (hv << 1) + (hv << 4) + (hv << 7) + (hv << 8) + (hv << 24);
} }

@ -109,3 +109,45 @@ ares_bool_t ares_str_isnum(const char *str)
} }
return ARES_TRUE; return ARES_TRUE;
} }
/* ASCII-only lowercase mapping, indexed by byte value.  The C library's
 * tolower() depends on the current locale, so a fixed table is used instead:
 * 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A) and every other byte
 * maps to itself.  Each row below holds 16 entries. */
static const unsigned char ares__tolower_lookup[] = {
  /* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
             0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  /* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
             0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
  /* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
             0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
  /* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
             0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
  /* 0x40: '@' then 'A'..'O' folded to 'a'..'o' */
             0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
             0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
  /* 0x50: 'P'..'Z' folded to 'p'..'z', then '[' .. '_' unchanged */
             0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
             0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
  /* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
             0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
  /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
             0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
  /* 0x80 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
             0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
  /* 0x90 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
             0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
  /* 0xA0 */ 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
             0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
  /* 0xB0 */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
             0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
  /* 0xC0 */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
             0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
  /* 0xD0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
             0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
  /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
             0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
  /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
             0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
};
/*! Locale-independent lowercase conversion of a single byte.
 *
 *  \param[in] c  Byte to convert.
 *  \return The entry of ares__tolower_lookup at index c.
 */
unsigned char ares__tolower(unsigned char c)
{
  const unsigned char lowered = ares__tolower_lookup[c];

  return lowered;
}
/*! Compare two byte ranges for equality after mapping each byte through
 *  ares__tolower_lookup.
 *
 *  \param[in] ptr  First byte range.
 *  \param[in] val  Second byte range.
 *  \param[in] len  Number of bytes to compare in each range.
 *  \return ARES_FALSE at the first position whose mapped bytes differ,
 *          ARES_TRUE if no such position exists (including when len is 0).
 */
ares_bool_t ares__memeq_ci(const unsigned char *ptr, const unsigned char *val,
                           size_t len)
{
  const unsigned char *lhs = ptr;
  const unsigned char *rhs = val;
  size_t               remaining;

  /* Walk both ranges front to back in lockstep */
  for (remaining = len; remaining > 0; remaining--, lhs++, rhs++) {
    if (ares__tolower_lookup[*lhs] != ares__tolower_lookup[*rhs]) {
      return ARES_FALSE;
    }
  }

  return ARES_TRUE;
}

@ -48,4 +48,8 @@ size_t ares_strcpy(char *dest, const char *src, size_t dest_size);
ares_bool_t ares_str_isnum(const char *str); ares_bool_t ares_str_isnum(const char *str);
/*! Lowercase a single byte using a fixed lookup table rather than the
 *  locale-specific tolower().
 *
 *  \param[in] c  Byte to convert.
 *  \return Table entry for c.
 */
unsigned char ares__tolower(unsigned char c);
/*! Case-insensitive comparison of two byte ranges of the same length; each
 *  byte is passed through the same lowercase lookup table before comparing.
 *
 *  \param[in] ptr  First byte range.
 *  \param[in] val  Second byte range.
 *  \param[in] len  Number of bytes to compare.
 *  \return ARES_TRUE if every compared position matches (also when len is 0),
 *          ARES_FALSE otherwise.
 */
ares_bool_t ares__memeq_ci(const unsigned char *ptr, const unsigned char *val,
size_t len);
#endif /* HEADER_CARES_STRDUP_H */ #endif /* HEADER_CARES_STRDUP_H */

@ -71,15 +71,16 @@ char **ares__strsplit_duplicate(char **elms, size_t num_elm)
return out; return out;
} }
char **ares__strsplit(const char *in, const char *delms, size_t *num_elm) char **ares__strsplit(const char *in, const char *delms, size_t *num_elm)
{ {
const char *p; ares_status_t status;
char **table; ares__buf_t *buf = NULL;
void *tmp; ares__llist_t *llist = NULL;
size_t i; ares__llist_node_t *node;
size_t j; char **out = NULL;
size_t k; size_t cnt = 0;
size_t count; size_t idx = 0;
if (in == NULL || delms == NULL || num_elm == NULL) { if (in == NULL || delms == NULL || num_elm == NULL) {
return NULL; return NULL;
@ -87,56 +88,56 @@ char **ares__strsplit(const char *in, const char *delms, size_t *num_elm)
*num_elm = 0; *num_elm = 0;
/* count non-empty delimited substrings */ buf = ares__buf_create_const((const unsigned char *)in, ares_strlen(in));
count = 0; if (buf == NULL) {
p = in;
do {
i = strcspn(p, delms);
if (i != 0) {
/* string is non-empty */
count++;
p += i;
}
} while (*p++ != 0);
if (count == 0) {
return NULL; return NULL;
} }
table = ares_malloc(count * sizeof(*table));
if (table == NULL) { status = ares__buf_split(buf, (const unsigned char *)delms,
return NULL; ares_strlen(delms),
ARES_BUF_SPLIT_NO_DUPLICATES|
ARES_BUF_SPLIT_CASE_INSENSITIVE,
&llist);
if (status != ARES_SUCCESS) {
goto done;
}
cnt = ares__llist_len(llist);
if (cnt == 0) {
status = ARES_EFORMERR;
goto done;
} }
j = 0; /* current table entry */
/* re-calculate indices and allocate new strings for table */ out = ares_malloc_zero(cnt * sizeof(*out));
for (p = in; j < count; p += i + 1) { if (out == NULL) {
i = strcspn(p, delms); status = ARES_ENOMEM;
if (i != 0) { goto done;
for (k = 0; k < j; k++) { }
if (strncasecmp(table[k], p, i) == 0 && table[k][i] == 0) {
break; for (node = ares__llist_node_first(llist); node != NULL;
} node = ares__llist_node_next(node)) {
} ares__buf_t *val = ares__llist_node_val(node);
if (k == j) { char *temp = NULL;
/* copy unique strings only */
table[j] = ares_malloc(i + 1); status = ares__buf_fetch_str_dup(val, ares__buf_len(val), &temp);
if (table[j] == NULL) { if (status != ARES_SUCCESS) {
ares__strsplit_free(table, j); goto done;
return NULL;
}
ares_strcpy(table[j], p, i + 1);
j++;
} else {
count--;
}
} }
out[idx++] = temp;
} }
tmp = ares_realloc(table, count * sizeof(*table)); *num_elm = cnt;
if (tmp != NULL) { status = ARES_SUCCESS;
table = tmp;
done:
ares__llist_destroy(llist);
ares__buf_destroy(buf);
if (status != ARES_SUCCESS) {
ares__strsplit_free(out, cnt);
out = NULL;
} }
*num_elm = count; return out;
return table;
} }

@ -35,7 +35,7 @@
* Each character in the string is a delimiter so * Each character in the string is a delimiter so
* there can be multiple delimiters to split on. * there can be multiple delimiters to split on.
* E.g. ", " will split on all comma's and spaces. * E.g. ", " will split on all comma's and spaces.
* Duplicate entries are removed. * Duplicate (case-insensitive) entries are removed.
* param num_elm Return parameter of the number of elements * param num_elm Return parameter of the number of elements
* in the result array. * in the result array.
* *

Loading…
Cancel
Save