/* replacebench.c * * A portable program for benchmarking several string replace functions. * Each replace function has the same prototype. It takes an original string, * a search token and a replace token. It then performs a replacement on the * string, leaving the original untouched and returning the post-replacement * string in a buffer of allocated memory that can be deallocated with free(). * * If sufficient memory can't be allocated, the function returns NULL. * * Compile the program using e.g: * * gcc -O2 -std=c99 -pedantic -W -Wall replacebench.c -o replacebench * gcc -O2 -std=c89 -pedantic -W -Wall replacebench.c -o replacebench * gcc -O2 -ansi -pedantic -W -Wall replacebench.c -o replacebench * (the last two are equivalent and use the older more supported C89 standard) * * If you define HAVE_MEMCCPY to 1 to enable replace_memccpy() and * replace_rare(), then omit the standard-compatible flags from the above * compilation commands, i.e. -std=c99, -std=c89 and -ansi. * * Run it using: * * replacebench * * where: * * is a text file to obtain the main string from (the file won't be * modified; it will be opened read-only) * is the string token to replace * is the string token to replace it with * is the number of times to call the replacement function * determines which function to use; currently valid values are: * cache, opt2, opt, smart, memchr2, jmucc, dull, fann95, haigh, * skar_haigh, skar_net, net, net_lenmod, memccpy, rare (the * last two only if HAVE_MEMCCPY is defined to 1) * * Compiled/authored by: Laird Shaw (formerly going by "Netocrat"). * Contact at : http://creativeandcritical.net/contact * * References this newsgroup thread: * Subject: how to replace a substring in a string using C? * Date of first post in thread: 31 October 2005 * Message-ID of first post in thread: * 1130687985.367948.20180@f14g2000cwb.googlegroups.com * Google-groups URL: * * * and this stackoverflow question: * * * * COPYRIGHT * * I release into the public domain all of the code in this file which I wrote; * no copying restrictions apply. You are free to attribute it to me (Laird Shaw * contactable @ http://creativeandcritical.net/contact) or not, as you see fit * - all that I ask is that you don't attribute it to yourself or anyone * else. To be clear, the code to which I'm referring is the functions * replace_cache(), replace_net_lenmod(), replace_net(), main(), the macros and * printusage(), and my contributions to replace_skar_net(), replace_opt() and * replace_opt2(). Albert Chan has explicitly stipulated replace_smart() as * public domain code too, and implicitly replace_memccpy(), replace_rare() and * replace_memchr2() too. All other code I presume was released into the public * domain too due to having been posted either as answers in a thread on a * public newsgroup or as an answer on a public Q&A website (stackoverflow), but * you'd have to contact the original authors to be sure. * * VERSIONING AND CHANGE HISTORY * * Current revision: 2.8 * * 10 Sep 2016: Fixed a potential realloc memory leak, as well as a gcc warning * when compiling under the C99 standard or higher. * 29 Apr 2015: Added replace_memchr2(). Updated replace_rare per Albert's * revision, which also fixes a bug I'd introduced when * converting the RESIZE macro to inline code. * Increased cache_sz_inc_factor to 3 for a performance boost in * scenarios with many matches (going beyond 3 seems to have * no or at least undetectable effect). Edited a few comments. * 24 Apr 2015: Found a resizing bug in replace_smart() which Albert then fixed. * Requested that Albert replace the RESIZE macro with inline code * seeing that it was only used once by replace_smart(), which he * kindly did. Added replace_memccpy() and replace_rare() and their * enabling #define HAVE_MEMCCPY, defaulting to undefined * (commented out). Incorporated Albert's bug fix and inlined * RESIZE macro into those functions. * Fixed a few compiler warnings in other functions, mostly by * adding parentheses around "assignments used as truth values" * (GCC's warning phrasing). * Fixed up a few comments. * 23 Apr 2015: Renamed replace_albert() to replace_smart() at Albert's * suggestion; added his email address. * 19 Apr 2015: Added replace_albert(), fixed compilation errors on * MS Visual C++ 2010 Express. * 13 Apr 2015: Converted C99-style comments into C90-style comments in * replace_dull(). * 02 Apr 2015: Added replace_dull() and replace_fann95(). * 03 Mar 2015: Added comments to replace_cache(), edited comments elsewhere. * 02 Mar 2015: Added replace_cache(). Fixed a Standard compatibility problem * with replace_opt2(): C90 forbids mixed declarations and code, * so moved the declaration of variable "l" to top of function. * 19 Feb 2015: Revised the copyright statement to remove assertion that * replace_opt() and replace_opt2() are public domain, given the * unknown licence of the function on which they are based. * 17 Feb 2015: Added to the comment above Mark F. Haigh's function that * replace_opt() and replace_opt2() are heavily based on it, and * indicated which optimisations they introduced to it. * 06 Oct 2014: Added "opt2" and "jmucc" to the description of * above. * 05 Oct 2014: Added replace_opt2 and replace_jmucc. * 11 May 2014: Added an attempt to determine filesize, and to allocate memory * based on the determined value, rather than on a fixed value. * Fixed a memory leak (by moving free(newstr) inside the loop). * Commented out the compile-time check for PTRDIFF_MAX < SIZE_MAX. * Added to the usage message a list of valid values for the * fifth argument, function_name_postfix. * 30 Aug 2009: Added copyright information. * 20 May 2006: Removed extra braces in the TESTSETFP test series in main() * Converted inconsistent tabs to spaces; removed trailing whitesp. * 03 Dec 2005: Removed incorrect portion of comment that stated ptrdiff_t was * non-existent in C90; actually it's PTRDIFF_MAX that was * non-existent. * 11 Nov 2005: Made portable to C89 after clarifying that the differences are: * a) stdint.h and therefore SIZE_MAX and PTRDIFF_MAX don't exist * b) the z specifier for printf doesn't exist * 9 Nov 2005: First written. * */ #include #include #include #include #include #include /* Uncomment if your compiler supports memccpy(), a POSIX but not a Standard C * library function, and if you want this program to support Albert's * replace_memccpy() and replace_rare() functions. */ /*#define HAVE_MEMCCPY 1*/ /* Maximum default buffer size */ #define MAXSTRSIZE 1000000 /* use 1 million bytes max */ #if (__STDC_VERSION__ >= 199901L) /* For C99 and above, include stdint.h and restrict max buffer size */ #include /* Don't allocate more than 1/3 of SIZE_MAX even if maximum default buffer * setting is larger */ #define DEFSTRSIZE (SIZE_MAX / 3 > MAXSTRSIZE ? MAXSTRSIZE : SIZE_MAX / 3) #define Z "z" /* Don't cast size_t when printing it since we use C99's %z specifier */ #define PRTF_SZ_CAST 0 + #else #define DEFSTRSIZE MAXSTRSIZE /* Use the largest integral type as a printf specifier for size_t and * cast size_t appropriately when doing so */ #define Z "l" #define PRTF_SZ_CAST (unsigned long) #endif typedef char *(*replacefp_t)(const char *, const char *, const char *); #define STR(str) #str #define TESTSETFP(fn, fn_token, fp) ((strcmp(fn, #fn_token) == 0) ? \ (fp = replace_##fn_token, STR(replace_##fn_token)) : NULL) void printusage(void) { fprintf(stderr, "This program takes 5 parameters in this order: \n" \ "filename old_token new_token number_of_iterations " \ "function_name_postfix\n" \ "It doesn't write to the file at all.\n" \ "Valid values for function_name_postfix are: cache, " \ "opt, opt2, haigh, smart, memchr2, dull, fann95, " \ "jmucc, skar_haigh, skar_net, net, net_lenmod"); #ifdef HAVE_MEMCCPY fprintf(stderr, ", memccpy, rare"); #endif fprintf(stderr, ".\n"); } /* An optimised string replacement function that caches string positions so that * strstr() doesn't need to be called twice for each position. In general, * either this function or (more typically) Albert Chan's replace_smart() * function will be fastest for any given set of inputs, however for small * strings with few replacements, both can be a little slower than some of the * other variants, in particular replace_opt2() below. * * The cache size is managed by increasing it by an increment each time it needs * increasing, with the increment increasing by a factor (of three by default) * each time. * * This code is entirely by me, and released into the public domain, so have no * doubts about your right to use it. */ char *replace_cache(const char *str, const char *from, const char *to) { /* Adjust each of the below values to suit your needs. */ /* Increment positions cache size initially by this number. */ size_t cache_sz_inc = 16; /* Thereafter, each time capacity needs to be increased, * multiply the increment by this factor. */ const size_t cache_sz_inc_factor = 3; /* But never increment capacity by more than this number. */ const size_t cache_sz_inc_max = 1048576; char *pret, *ret = NULL; const char *pstr2, *pstr = str; size_t i, count = 0; #if (__STDC_VERSION__ >= 199901L) uintptr_t *pos_cache_tmp, *pos_cache = NULL; #else ptrdiff_t *pos_cache_tmp, *pos_cache = NULL; #endif size_t cache_sz = 0; size_t cpylen, orglen, retlen, tolen, fromlen = strlen(from); /* Find all matches and cache their positions. */ while ((pstr2 = strstr(pstr, from)) != NULL) { count++; /* Increase the cache size when necessary. */ if (cache_sz < count) { cache_sz += cache_sz_inc; pos_cache_tmp = realloc(pos_cache, sizeof(*pos_cache) * cache_sz); if (pos_cache_tmp == NULL) { goto end_repl_str; } else pos_cache = pos_cache_tmp; cache_sz_inc *= cache_sz_inc_factor; if (cache_sz_inc > cache_sz_inc_max) { cache_sz_inc = cache_sz_inc_max; } } pos_cache[count-1] = pstr2 - str; pstr = pstr2 + fromlen; } orglen = pstr - str + strlen(pstr); /* Allocate memory for the post-replacement string. */ if (count > 0) { tolen = strlen(to); retlen = orglen + (tolen - fromlen) * count; } else retlen = orglen; ret = malloc(retlen + 1); if (ret == NULL) { goto end_repl_str; } if (count == 0) { /* If no matches, then just duplicate the string. */ strcpy(ret, str); } else { /* Otherwise, duplicate the string whilst performing * the replacements using the position cache. */ pret = ret; memcpy(pret, str, pos_cache[0]); pret += pos_cache[0]; for (i = 0; i < count; i++) { memcpy(pret, to, tolen); pret += tolen; pstr = str + pos_cache[i] + fromlen; cpylen = (i == count-1 ? orglen : pos_cache[i+1]) - pos_cache[i] - fromlen; memcpy(pret, pstr, cpylen); pret += cpylen; } ret[retlen] = '\0'; } end_repl_str: /* Free the cache and return the post-replacement string, * which will be NULL in the event of an error. */ free(pos_cache); return ret; } /* The most optimised portable version of replace without caching, other than * Albert Chan's function. * * New feature over the original replace_opt(): an optimisation to avoid * the final call to strstr (and thus avoid unnecessary iteration over the * string) when the strings "old" and "new" are different lengths. Inspired * by jmucchiello's function (see replace_jmucc() below). The speed difference * can be significant when the final match is far from the end. * * This function is based on Mark F. Haigh's function below, so given that my * contributions to it are public domain, its overall copyright is no stricter * than whatever copyright Mark intended for his function. */ char *replace_opt2(const char *str, const char *old, const char *new) { char *ret, *r; const char *p, *q; size_t oldlen = strlen(old); size_t count, retlen, newlen = strlen(new); int samesize = (oldlen == newlen); ptrdiff_t l; if (!samesize) { for (count = 0, p = str; (q = strstr(p, old)) != NULL; p = q + oldlen) count++; /* This is undefined if p - str > PTRDIFF_MAX */ retlen = p - str + strlen(p) + count * (newlen - oldlen); } else retlen = strlen(str); if ((ret = malloc(retlen + 1)) == NULL) return NULL; r = ret, p = str; while (1) { /* If the old and new strings are different lengths - in other * words we have already iterated through with strstr above, * and thus we know how many times we need to call it - then we * can avoid the final (potentially lengthy) call to strstr, * which we already know is going to return NULL, by * decrementing and checking count. */ if (!samesize && !count--) break; /* Otherwise i.e. when the old and new strings are the same * length, and we don't know how many times to call strstr, * we must check for a NULL return here (we check it in any * event, to avoid further conditions, and because there's * no harm done with the check even when the old and new * strings are different lengths). */ if ((q = strstr(p, old)) == NULL) break; /* This is undefined if q - p > PTRDIFF_MAX */ l = q - p; memcpy(r, p, l); r += l; memcpy(r, new, newlen); r += newlen; p = q + oldlen; } strcpy(r, p); return ret; } /* The original version of the most optimised portable version of replace * that I've come up with. * * See above warning and code comments describing the possibility of UB * * This function is based on Mark F. Haigh's function below, so given that my * contributions to it are public domain, its overall copyright is no stricter * than whatever copyright Mark intended for his function. */ char *replace_opt(const char *str, const char *old, const char *new) { char *ret, *r; const char *p, *q; size_t oldlen = strlen(old); size_t count, retlen, newlen = strlen(new); if (oldlen != newlen) { for (count = 0, p = str; (q = strstr(p, old)) != NULL; p = q + oldlen) count++; /* this is undefined if p - str > PTRDIFF_MAX */ retlen = p - str + strlen(p) + count * (newlen - oldlen); } else retlen = strlen(str); if ((ret = malloc(retlen + 1)) == NULL) return NULL; for (r = ret, p = str; (q = strstr(p, old)) != NULL; p = q + oldlen) { /* this is undefined if q - p > PTRDIFF_MAX */ ptrdiff_t l = q - p; memcpy(r, p, l); r += l; memcpy(r, new, newlen); r += newlen; } strcpy(r, p); return ret; } /* A realloc-based function by Albert Chan , sent to me * privately, and reproduced here with Albert's permission as public domain * code, with my only edit being to qualify the parameters as const. * * This function benchmarks faster than all of the others under many conditions, * typically beating its nearest competitor, replace_cache(), by around 10-20%. * replace_cache() is, though, still fastest under some conditions, in * particular some scenarios involving longer strings with few matches. */ char* replace_smart (const char *str, const char *sub, const char *rep) { size_t slen = strlen(sub); size_t rlen = strlen(rep); size_t size = strlen(str) + 1; size_t diff = rlen - slen; size_t capacity = (diff>0 && slen) ? 2 * size : size; char *buf = malloc(capacity); char *find, *b = buf; if (b == NULL) return NULL; if (slen == 0) return memcpy(b, str, size); while((find = strstr(str, sub))) { if ((size += diff) > capacity) { char *ptr = realloc(buf, capacity = 2 * size); if (ptr == NULL) {free(buf); return NULL;} b = ptr + (b - buf); buf = ptr; } memcpy(b, str, find - str); /* copy up to occurrence */ b += find - str; memcpy(b, rep, rlen); /* add replacement */ b += rlen; str = find + slen; } memcpy(b, str, size - (b - buf)); b = realloc(buf, size); /* trim to size */ return b ? b : buf; } /* Helper variable+function by Albert Chan for replace_rare() and * replace_memchr2(). * As far as I understand, Albert releases this code into the public domain as * he does for replace_smart(). */ /* rarity based on newlisp v.10.4.3 manual text */ /* ascii code 32 to 126, offset = 32 */ static char freq[] = "~HA43S)$P?DF" "N%@OWE+;5,*897\"6 {iqp}lenu0VsmytrBzx|k^cad=&('#"; size_t rare_idx(const char *sub) { size_t i, min_idx=0, min_val=1000; for(i=0 ; sub[i]; i++) { unsigned idx = sub[i] - 32; /* setup for table lookup */ if (idx > 126-32) return i; /* non-print char, very rare */ if ((unsigned char)freq[idx] >= min_val) continue; min_val = freq[idx]; /* look for rare char */ min_idx = i; } return min_idx; } #ifdef HAVE_MEMCCPY /* Albert Chan's memccpy()-based function. Because memccpy() is not part of the * C Standard (it is instead part of the POSIX standard), whereas this file's * aim is portability, you will need to explicitly enable this function if you * want access to it, by uncommenting the #define of HAVE_MEMCCPY further above. * I've made a few changes to this function, mostly with Albert's knowledge. * * Sent to me privately. As far as I understand, Albert releases this code into * the public domain as he does for replace_smart(). */ char* replace_memccpy (const char *str, const char *sub, const char *rep) { size_t slen = strlen(sub); size_t rlen = strlen(rep); size_t size = strlen(str) + 1; size_t diff = rlen - slen; size_t capacity = (diff>0 && slen) ? 2 * size : size; char *b, *buf, ch, *find; const char *end, *s; buf = b = (char *) malloc(capacity); if (b == NULL) return NULL; if (slen == 0) return memcpy(b, str, size); end = str + size; /* end of string (past NULL) */ ch = *sub++; /* first char of sub */ slen--; /* remove ch from sub */ while((find = memccpy(b, str, ch, end - str))) { s = str += find - b; b = find; if (memcmp(sub, s, slen) == 0) { /* found substring */ if ((size += diff) > capacity) { char *ptr = realloc(buf, capacity = 2 * size); if (ptr == NULL) {free(buf); return NULL;} b = ptr + (b - buf); buf = ptr; } memcpy(--b, rep, rlen); /* replace (remove ch) */ b += rlen; str += slen; } } b = realloc(buf, size); /* trim to size */ return b ? b : buf; } /* Albert Chan's revised memccpy()-based function, this time based on the * principle that performance is most improved when the character searched for * by memccpy() is rare. I've made a few changes to it, mostly with Albert's * knowledge. As with replace_memccpy(), you will have to explicitly enable this * function if you want access to it by uncommenting the #define of HAVE_MEMCCPY * further above. * * Sent to me privately. As far as I understand, Albert releases this code into * the public domain as he does for replace_smart(). */ char* replace_rare (const char *str, const char *sub, const char *rep) { size_t slen = strlen(sub); size_t rlen = strlen(rep); size_t size = strlen(str) + 1; size_t i, n, diff = rlen - slen; size_t capacity = (diff>0 && slen) ? 2 * size : size; char *b, *buf, *find, *end, *s, *s1, *s2, ch; buf = b = (char *) malloc(capacity); if (b == NULL) return NULL; if (slen == 0) return memcpy(b, str, size); i = rare_idx(sub); ch = sub[i]; /* pick a rare char for memccpy */ s = (char *) str; /* start of string */ end = s + size; /* end of string (past NULL) */ for(n=i++; n; n--) *b++ = *s++; /* i bytes to b */ while((find = memccpy(b, s, ch, end - s))) { s += find - b; /* add bytes copied */ b = find; s1 = s - i; s2 = (char *) sub; while(*s2 && (*s1 == *s2)) {s1++; s2++;} if (*s2) continue; /* no match */ b += rlen - i; /* substring found */ s += slen - i; if ((size += diff) > capacity) { char *ptr = realloc(buf, capacity = 2 * size); if (ptr == NULL) {free(buf); return NULL;} b = ptr + (unsigned) (b - buf); buf = ptr; } memcpy(b - rlen, rep, rlen); /* replace */ } b = realloc(buf, size); /* trim to size */ return b ? b : buf; } #endif /* Albert Chan's memchr()-based function using rare characters to speed * matching, also with secondary-character matching. For some strings, * this is the most performant variant. * * I've made a few changes, including: * - const-qualified parameters and some variables * - converted `int`s to `size_t`s * - moved a few variable declarations to the top. */ char* replace_memchr2 (const char *str, const char *sub, const char *rep) { size_t slen = strlen(sub); size_t rlen = strlen(rep); size_t size = strlen(str) + 1; size_t i, j, gap, diff = rlen - slen; size_t capacity = (diff>0 && slen) ? 2 * size : size; char *b, *buf, ch1, ch2; const char *s, *end, *s1, *s2; buf = b = (char *) malloc(capacity); if (b == NULL) return NULL; if (slen == 0 || slen >= size) return memcpy(b, str, size); i = rare_idx(sub); /* ch1 location */ j = !i; /* ch2 location, ch1 != ch2 */ ch1 = sub[i]; /* search the rare char */ ch2 = sub[j]; /* match a 2nd sub char */ sub += j + 1; /* match sub[j+1] ... */ j -= i; /* j bias at i */ s = str + i; /* starting search */ end = s + size - slen; /* ending search */ while((s = memchr(s, ch1, end - s))) { if (slen > 1) { s1 = s + j; /* match ch2 */ if (*s1 != ch2) {s++; continue;} if (slen > 2) { /* match everything else */ s1++; /* skip ch2 */ s2 = sub; while(*s2 && (*s1 == *s2)) {s1++; s2++;} if (*s2) {s++; continue;} } } if ((size += diff) > capacity){ char *ptr = realloc(buf, capacity = 2 * size); if (ptr == NULL) {free(buf); return NULL;} b = ptr + (b - buf); buf = ptr; } gap = s - i - str; /* gap between str and sub */ memcpy(b, str, gap); /* copied to buffer */ b += gap; memcpy(b, rep, rlen); /* add replacement */ b += rlen; s += slen; /* skip ahead */ str = s - i; if (end - s < 0) break; /* done */ } memcpy(b, str, size - (b - buf)); b = realloc(buf, size); /* trim to size */ return b ? b : buf; } /* jmucchiello's function from http://stackoverflow.com/a/779960 * at 5 October 2014, with function prototype modified so all parameters are * const-qualified, with parameters modified to be const-qualified, with * parentheses added around tmp = strstr(ins, rep) to suppress a GCC warning, * and with comments converted from C99-style to C89-style for maximum * portability. * * The main drawback of this function (aside from lack of a position cache) is * that it misses the optimisation of iterating through only once with strstr() * when "rep" and "with" are the same length - in that case, we know that the * new memory requirements will simply be the length of the string, so there's * no need to iterate through initially with strstr() to set "count". */ char *replace_jmucc(const char *orig, const char *rep, const char *with) { char *result; /* the return string */ const char *ins; /* the next insert point */ char *tmp; /* varies */ int len_rep; /* length of rep */ int len_with; /* length of with */ int len_front; /* distance between rep and end of last rep */ int count; /* number of replacements */ if (!orig) return NULL; if (!rep) rep = ""; len_rep = strlen(rep); if (!with) with = ""; len_with = strlen(with); ins = orig; for (count = 0; (tmp = strstr(ins, rep)); ++count) { ins = tmp + len_rep; } /* first time through the loop, all the variable are set correctly * from here on, * tmp points to the end of the result string * ins points to the next occurrence of rep in orig * orig points to the remainder of orig after "end of rep" */ tmp = result = malloc(strlen(orig) + (len_with - len_rep) * count + 1); if (!result) return NULL; while (count--) { ins = strstr(orig, rep); /* This is undefined if ins - orig > PTRDIFF_MAX * (comment added by Laird) */ len_front = ins - orig; tmp = strncpy(tmp, orig, len_front) + len_front; tmp = strcpy(tmp, with) + len_with; orig += len_front + len_rep; /* move to next "end of rep" */ } strcpy(tmp, orig); return result; } /* yogo1212's function from * http://stackoverflow.com/a/28951870/3126722 * as at 2 April 2015, with C99-style comments (double slash) replaced with * C90-style comments (slash-asterisk) for better portability, and with * parentheses added around needle = strstr(in, pattern) to prevent a GCC warning. * * This code is both minimal and a clever way of avoiding calling strstr() twice * for each match; instead, realloc() is called. Performance seems to be very * good - not quite as good overall as replace_cache(), but sometimes beating * replace_opt2(), which until now I had thought was unbeatably the most * performant variant without caching. * * This function inspired Albert's replace_smart() variant, which has the * strongest performance. */ static char *replace_dull(const char *in, const char *pattern, const char *by) { size_t outsize = strlen(in) + 1; /* TODO maybe avoid reallocing by counting the non-overlapping occurences of pattern */ char *res = malloc(outsize); /* use this to iterate over the output */ size_t resoffset = 0; char *needle; while ((needle = strstr(in, pattern))) { /* copy everything up to the pattern */ memcpy(res + resoffset, in, needle - in); resoffset += needle - in; /* skip the pattern in the input-string */ in = needle + strlen(pattern); /* adjust space for replacement */ outsize = outsize - strlen(pattern) + strlen(by); res = realloc(res, outsize); /* copy the pattern */ memcpy(res + resoffset, by, strlen(by)); resoffset += strlen(by); } /* copy the remaining input */ strcpy(res + resoffset, in); return res; } /* fann95's function from http://stackoverflow.com/a/27605769 at 2 April 2015, * except with const qualifers added to function parameters. * Performance is not so good (in fact it's pretty awful) compared to * yogo1212's function similarly leveraging realloc() due (I think) to * unnecessary use of strcat(), and probably also due to its use of strcmp() * rather than strstr(). */ char *replace_fann95(const char *instring, const char *old, const char *new) { /* Commented out by Laird because this causes compilation errors on * Microsoft Visual C++ 2010 Express (should not precede variable declarations). */ /* if(!instring || !old || !new){ return (char*)NULL; } */ size_t instring_size=strlen(instring); size_t new_size=strlen(new); size_t old_size=strlen(old); size_t outstring_size=instring_size + 1; char *outstring; char *test; size_t i; /* Moved from below by Laird to avoid compilation errors, * and converted from int to size_t to avoid a compiler warning. */ test=(char*)malloc(old_size+1); outstring =(char*) malloc(outstring_size); if(!outstring || !test){ return (char*)NULL; } if(instring_size PTRDIFF_MAX */ count = q - p; memcpy(r, p, count); r += count; strcpy(r, new); r += len_new; } strcpy(r, p); return ret; } /* Skarmander's replacement code snippet incorporated into Mark F. Haigh's * complete function, from * * https://groups.google.com/forum/#!msg/comp.lang.c/sgydS2lDgxc/P-d9OKfle40J * * and * * https://groups.google.com/d/msg/comp.lang.c/sgydS2lDgxc/sgeiP9Go_4IJ * * Skarmander's algorithm is identical to Mark's, even though the code is * slightly different (also he uses memcpy where Mark has used strcpy in the * loop). * * His code likewise has potential for UB which I've commented. */ char *replace_skar_haigh(const char *s, const char *old, const char *new) { char *ret, *sr; const char *p; size_t len_str = strlen(s); size_t len_old = strlen(old); size_t len_new = strlen(new); size_t count; const char* t; for(count = 0, p = s; (p = strstr(p, old)); p += len_old) count++; ret = malloc(count * (len_new - len_old) + len_str + 1); if(!ret) return NULL; sr = ret; while ((t = strstr(s, old)) != NULL) { /* Copy non-matching prefix */ /* this is undefined if q - p > PTRDIFF_MAX */ ptrdiff_t l = t - s; memcpy(sr, s, l); sr += l; s += l; /* Copy replacement text */ memcpy(sr, new, len_new); sr += len_new; s += len_old; } /* Copy non-matching tail */ strcpy(sr, s); return ret; } /* Skarmander's replacement code snippet incorporated into my original * bug-fixed function. * * Skarmander's code has potential for UB which I've commented. */ char *replace_skar_net(const char *s, const char *old, const char *new) { char *ret, *sr; size_t i, count = 0; size_t newlen = strlen(new); size_t oldlen = strlen(old); const char* t; if (newlen != oldlen) { for (i = 0; s[i] != '\0'; ) { if (memcmp(&s[i], old, oldlen) == 0) count++, i += oldlen; else i++; } } else i = strlen(s); ret = malloc(i + 1 + count * (newlen - oldlen)); if (ret == NULL) return NULL; sr = ret; while ((t = strstr(s, old)) != NULL) { /* Copy non-matching prefix */ /* this is undefined if t - s > PTRDIFF_MAX */ ptrdiff_t l = t - s; memcpy(sr, s, l); sr += l; s += l; /* Copy replacement text */ memcpy(sr, new, newlen); sr += newlen; s += oldlen; } /* Copy non-matching tail */ strcpy(sr, s); return ret; } /* A modified version of my bug-fixed original using the alternative method of * strstr() rather than memcmp() to calculate the new string's buffer size, to * see what performance difference this has. * (Potentially invokes UB when strlen(old) > PTRDIFF_MAX as with the other * functions using similar code) * */ /* Preconditions: * s, old and new are valid non-NULL pointers * strlen(old) >= 1 * strlen(old) < PTRDIFF_MAX or the caller is otherwise certain that the * difference between the end of the string and the end of the last matching * token is less than PTRDIFF_MAX */ char *replace_net_lenmod(const char *s, const char *old, const char *new) { char *ret, *sr; const char *p; const char *last = s; size_t orglen, count = 0; size_t newlen = strlen(new); size_t oldlen = strlen(old); for (p = s; (p = strstr(p, old)) != NULL; count++) { p += oldlen; last = p; } /* this is undefined if last - s > PTRDIFF_MAX */ orglen = last - s + strlen(last); if ((ret = malloc(orglen + 1 + count * (newlen - oldlen))) == NULL) return NULL; sr = ret; while (*s) { if (memcmp(s, old, oldlen) == 0) { memcpy(sr, new, newlen); sr += newlen; s += oldlen; } else *sr++ = *s++; } *sr = '\0'; return ret; } /* My bug-fixed original function. * * As was pointed out by both Mark and Skarmander, its excessive use of byte * copying and calls to memcmp render it unnecessarily slow on most * implementations. * * Benchmarking under Linux using gcc shows that it is indeed around 15 times * slower than the alternatives using strstr. * * Mark also pointed out that the indexing would slow things down. I used this * approach to be able to calculate the string length without calling * strlen and without subtracting pointers: the latter is capable of producing * undefined behaviour when p2-p1 > PTRDIFF_MAX. Mark is right that the indexing * slows things down a lot so it's likely that avoiding a call to strlen by * using indexing is counterproductive; the potential for undefined behaviour * when using the alternative approach of subtracting pointers can be * documented so that callers of the function must ensure that they don't * call it with strings capable of causing UB. Such a function is implemented * at the top of the file as replace_opt. */ /* Preconditions: * s, old and new are valid non-NULL pointers * strlen(old) >= 1 */ char *replace_net(const char *s, const char *old, const char *new) { char *ret, *sr; size_t i, count = 0; size_t newlen = strlen(new); size_t oldlen = strlen(old); if (newlen != oldlen) { for (i = 0; s[i] != '\0'; ) { /* The call to memcmp on nearly every character is highly * suboptimal */ if (memcmp(&s[i], old, oldlen) == 0) count++, i += oldlen; else i++; } } else i = strlen(s); ret = malloc(i + 1 + count * (newlen - oldlen)); if (ret == NULL) return NULL; sr = ret; while (*s) { /* The call to memcmp on nearly every character is highly * suboptimal */ if (memcmp(s, old, oldlen) == 0) { memcpy(sr, new, newlen); sr += newlen; s += oldlen; } else *sr++ = *s++; } *sr = '\0'; return ret; } int main(int argc, char **argv) { FILE *f; char *converrs, *mystr = NULL; char *newstr = NULL; const char *filename, *old, *new, *fn, *setfn = NULL; size_t r; long i; long int filelen = DEFSTRSIZE; replacefp_t replacefp = NULL; clock_t start, end; if (argc <= 5) { fprintf(stderr, "Not enough parameters.\n"); printusage(); exit(EXIT_FAILURE); } /* Store command-line parameters */ filename = argv[1]; old = argv[2]; new = argv[3]; i = strtol(argv[4], &converrs, 10); if ((*converrs != '\0' && argv[4][0] != '\0') || i <= 0) { fprintf(stderr, "Unsuccessful conversion of iterations parameter.\n" \ "It must be a positive integer less than %ld.\n", LONG_MAX); printusage(); exit(EXIT_FAILURE); } fn = argv[5]; /* Set function pointer to the specified replace function */ if (((setfn = TESTSETFP(fn, cache, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, opt2, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, opt, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, haigh, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, opt, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, skar_haigh, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, skar_net, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, net, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, net_lenmod, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, dull, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, fann95, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, jmucc, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, memchr2, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, smart, replacefp)) == NULL) #ifdef HAVE_MEMCCPY && ((setfn = TESTSETFP(fn, memccpy, replacefp)) == NULL) && ((setfn = TESTSETFP(fn, rare, replacefp)) == NULL) #endif ) { fprintf(stderr, "replace_%s is not recognised as a valid function.\n", fn); printusage(); exit(EXIT_FAILURE); } /* Try to determine the specified file's size so we know how much memory to * malloc. Note firstly that this requires opening the file in binary mode, * and secondly that at least the C89 version of the Standard does not * require that a binary stream supports seeking to the end of a file. * If the seek and tell fail, set the file size to DEFSTRSIZE. */ f = fopen(filename, "rb"); if (f == NULL) { perror("fopen"); exit(EXIT_FAILURE); } if (fseek(f, 0, SEEK_END) > 0 || (filelen = ftell(f)) == -1L) { filelen = DEFSTRSIZE; } /* Allocate enough memory to read in the specified file up to the size determined above. */ if ((mystr = malloc(filelen + 1)) == NULL) { perror("malloc (for file)"); exit(EXIT_FAILURE); } /* Reopen the specified file in text mode. */ if (freopen(filename, "r", f) == NULL) { perror("freopen"); exit(EXIT_FAILURE); } /* Read in as much of specified file as possible */ if ((r = fread(mystr, 1, filelen, f)) == 0) { perror("fread"); exit(EXIT_FAILURE); } mystr[r] = '\0'; if (fclose(f) != 0) perror("fclose"); /* Print information prior to calling the function */ fprintf(stderr, "Filename : %s\nIterations : %lu\nOld : %s\n" \ "New : %s\nFunction : %s\nPre-length : %"Z"u\n", filename, i, old, new, setfn, PRTF_SZ_CAST strlen(mystr)); /* Start timing and call the replace function the number of times * specified */ start = clock(); while (i-- > 0) { free(newstr); if ((newstr = replacefp(mystr, old, new)) == NULL) fprintf(stderr, "Replace returned NULL.\n"); } /* End timing and print post-call information */ end = clock(); if (newstr != NULL) { fprintf(stderr, "Post-length: %"Z"u\nDuration : %f seconds\n", PRTF_SZ_CAST strlen(newstr), (end - start) / (double) CLOCKS_PER_SEC); } /* Free allocated memory and exit with successful result */ free(newstr); free(mystr); return EXIT_SUCCESS; }