/* wreplacebench.c * * A portable program for benchmarking a wide string replace function. * The function takes an original string, search token and a replace token. * It then performs a replacement on the string, leaving the original * untouched and returning the post-replacement string in a buffer of * allocated memory that can be deallocated with free(). * * If sufficient memory can't be allocated, the function returns NULL. * * Compile the program using e.g: * * gcc -O2 -std=c99 -pedantic -W -Wall wreplacebench.c -o wreplacebench * gcc -O2 -std=c89 -pedantic -W -Wall wreplacebench.c -o wreplacebench * gcc -O2 -ansi -pedantic -W -Wall wreplacebench.c -o wreplacebench * (the last two are equivalent and use the older more supported C89 standard) * * Run it using: * * wreplacebench * * where: * * is a text file to obtain the main string from (the file won't be * modified; it will be opened read-only). This file's encoding will * be treated according to the locale in which the program is run. * In a Linux/UNIX environment, this can be controlled through the * environment variables LC_CTYPE and LC_ALL (the latter overrides * the former). To use UTF-8 encoding, ensure that the relevant * variable includes UTF-8, e.g. "en_AU.UTF-8" (where AU stands for * "Australia"). * is the string token to replace. * is the string token to replace it with. * is the number of times to call the replacement function. * * Compiled/authored by: Laird Shaw * Contact at : http://creativeandcritical.net/contact * * COPYRIGHT * * I release into the public domain all of the code in this file; no copying * restrictions apply. You are free to attribute it to me (Laird Shaw * contactable @ http://creativeandcritical.net/contact) or not, as you see fit * - all that I ask is that you don't attribute it to yourself or anyone else. * * VERSIONING AND CHANGE HISTORY * * Current revision: 1.2 * * 10 Sep 2016: Fixed a potential realloc memory leak, as well as a gcc warning * when compiling under the C99 standard or higher. * 21 Apr 2015: Added comments about locale, reformatted whitespace. * 13 Apr 2015: Initial version. * */ #include #include #include #include #include #include #include #include /* Maximum default buffer size */ #define MAXSTRSIZE 1000000 /* use 1 million bytes max */ #if (__STDC_VERSION__ >= 199901L) /* For C99 and above, include stdint.h and restrict max buffer size */ #include /* Don't allocate more than 1/3 of SIZE_MAX even if maximum default buffer * setting is larger */ #define DEFSTRSIZE (SIZE_MAX / 3 > MAXSTRSIZE ? MAXSTRSIZE : SIZE_MAX / 3) #define Z "z" /* Don't cast size_t when printing it since we use C99's %z specifier */ #define PRTF_SZ_CAST 0 + #else #define DEFSTRSIZE MAXSTRSIZE /* Use the largest integral type as a printf specifier for size_t and * cast size_t appropriately when doing so */ #define Z "l" #define PRTF_SZ_CAST (unsigned long) #endif #define printusage fprintf(stderr, \ "This program takes 4 parameters in this order: \n" \ "filename old_token new_token number_of_iterations\n" \ "It doesn't write to the file at all.\n" \ "The file's encoding is treated according to your locale " \ "i.e. in a Linux/UNIX environment, the LC_CTYPE or LC_ALL environment " \ "variables. For UTF-8 encoding, one possibility is to set " \ "LC_ALL to \"en_AU.UTF-8\".\n"); #if (__STDC_VERSION__ >= 199901L) #include #endif wchar_t *repl_wcs(const wchar_t *str, const wchar_t *from, const wchar_t *to) { /* Adjust each of the below values to suit your needs. */ /* Increment positions cache size initially by this number. */ size_t cache_sz_inc = 16; /* Thereafter, each time capacity needs to be increased, * multiply the increment by this factor. */ const size_t cache_sz_inc_factor = 3; /* But never increment capacity by more than this number. */ const size_t cache_sz_inc_max = 1048576; wchar_t *pret, *ret = NULL; const wchar_t *pstr2, *pstr = str; size_t i, count = 0; #if (__STDC_VERSION__ >= 199901L) uintptr_t *pos_cache_tmp, *pos_cache = NULL; #else ptrdiff_t *pos_cache_tmp, *pos_cache = NULL; #endif size_t cache_sz = 0; size_t cpylen, orglen, retlen, tolen, fromlen = wcslen(from); /* Find all matches and cache their positions. */ while ((pstr2 = wcsstr(pstr, from)) != NULL) { count++; /* Increase the cache size when necessary. */ if (cache_sz < count) { cache_sz += cache_sz_inc; pos_cache_tmp = realloc(pos_cache, sizeof(*pos_cache) * cache_sz); if (pos_cache_tmp == NULL) { goto end_repl_wcs; } else pos_cache = pos_cache_tmp; cache_sz_inc *= cache_sz_inc_factor; if (cache_sz_inc > cache_sz_inc_max) { cache_sz_inc = cache_sz_inc_max; } } pos_cache[count-1] = pstr2 - str; pstr = pstr2 + fromlen; } orglen = pstr - str + wcslen(pstr); /* Allocate memory for the post-replacement string. */ if (count > 0) { tolen = wcslen(to); retlen = orglen + (tolen - fromlen) * count; } else retlen = orglen; ret = malloc((retlen + 1) * sizeof(wchar_t)); if (ret == NULL) { goto end_repl_wcs; } if (count == 0) { /* If no matches, then just duplicate the string. */ wcscpy(ret, str); } else { /* Otherwise, duplicate the string whilst performing * the replacements using the position cache. */ pret = ret; wmemcpy(pret, str, pos_cache[0]); pret += pos_cache[0]; for (i = 0; i < count; i++) { wmemcpy(pret, to, tolen); pret += tolen; pstr = str + pos_cache[i] + fromlen; cpylen = (i == count-1 ? orglen : pos_cache[i+1]) - pos_cache[i] - fromlen; wmemcpy(pret, pstr, cpylen); pret += cpylen; } ret[retlen] = L'\0'; } end_repl_wcs: /* Free the cache and return the post-replacement string, * which will be NULL in the event of an error. */ free(pos_cache); return ret; } /* Converts src from a multibyte string to a wide string, allocating memory for * the wide string in the process, and returning that wide string. The memory * can be deallocated with free() when no longer required. */ wchar_t *conv_mbtowcs(const char *src) { size_t len = strlen(src); wchar_t *ret; /* Get length in wide chars. */ len = mbsrtowcs(NULL, &src, len+1, NULL); if (len == (size_t)(-1)) { perror("conv_mbtowcs: mbsrtowcs: encoding error"); return NULL; } /* Allocate memory for wide char string based on that length. */ if ((ret = malloc((len+1)*sizeof(wchar_t))) == NULL) { perror("conv_mbtowcs: malloc"); return NULL; } /* Now perform the actual multibyte to wide string conversion. */ mbstowcs(ret, src, len+1); return ret; } int main(int argc, char **argv) { FILE *f; char *converrs, *mystr = NULL; wchar_t *mywstr = NULL; wchar_t *oldw, *neww, *newwstr = NULL; const char *filename, *locale; size_t r; long i; long int filelen = DEFSTRSIZE; clock_t start, end; if (argc <= 4) { fprintf(stderr, "Not enough parameters.\n"); printusage; exit(EXIT_FAILURE); } /* Locale must be set so that mbstowcs() can interpret multibyte chars * according to that locale. Calling setlocale() with an empty string * causes the locale to be initialised from the environment in which the * program is run i.e. the user's locale, which in a Linux/UNIX * environment is specified in the LC_TYPE or LC_ALL variable. */ locale = setlocale(LC_CTYPE, ""); if (!locale) perror("setlocale"); else fwprintf(stderr, L"Locale : %s\n", locale); /* Parse and store command-line parameters */ filename = argv[1]; oldw = conv_mbtowcs(argv[2]); if (!oldw) exit(EXIT_FAILURE); neww = conv_mbtowcs(argv[3]); if (!neww) exit(EXIT_FAILURE); i = strtol(argv[4], &converrs, 10); if ((*converrs != '\0' && argv[4][0] != '\0') || i <= 0) { fprintf(stderr, "Unsuccessful conversion of iterations parameter.\n" \ "It must be a positive integer less than %ld.\n", LONG_MAX); printusage; exit(EXIT_FAILURE); } /* Try to determine the specified file's size so we know how much memory * to malloc. Note firstly that this requires opening the file in binary * mode, and secondly that at least the C89 version of the Standard does * not require that a binary stream supports seeking to the end of a * file. If the seek and tell fail, set the file size to DEFSTRSIZE. */ f = fopen(filename, "rb"); if (f == NULL) { perror("fopen"); exit(EXIT_FAILURE); } if (fseek(f, 0, SEEK_END) > 0 || (filelen = ftell(f)) == -1L) { filelen = DEFSTRSIZE; } /* Allocate enough memory to read in the specified file up to the size * determined above. */ if ((mystr = malloc(filelen + 1)) == NULL) { perror("malloc (for file)"); exit(EXIT_FAILURE); } /* Reopen the specified file in text mode. */ if (freopen(filename, "r", f) == NULL) { perror("freopen"); exit(EXIT_FAILURE); } /* Read in as much of specified file as possible */ if ((r = fread(mystr, 1, filelen, f)) == 0) { perror("fread"); exit(EXIT_FAILURE); } mystr[r] = '\0'; if (fclose(f) != 0) perror("fclose"); /* Convert file contents from multibyte to wide char. */ mywstr = conv_mbtowcs(mystr); if (!mywstr) exit(EXIT_FAILURE); /* Free no-longer-required memory for mystr. */ free(mystr); /* Print information prior to calling the function */ fwprintf(stderr, L"Filename : %s\nIterations : %lu\n" \ "Old : %ls\n" \ "New : %ls\nPre-length : %"Z"u\n", filename, i, oldw, neww, PRTF_SZ_CAST wcslen(mywstr)); /* Start timing and call the replace function the number of times * specified */ start = clock(); while (i-- > 0) { free(newwstr); if ((newwstr = repl_wcs(mywstr, oldw, neww)) == NULL) fwprintf(stderr, L"Replace returned NULL.\n"); } /* End timing and print post-call information */ end = clock(); if (newwstr != NULL) { /* Kludge to avoid an odd bug: no matter what value is * supplied to fwprintf for a %f specifier using GCC under Linux * on an ARM architecture, it prints 0.000... */ char duration_str[20]; sprintf(duration_str, "%f", (end - start) / (double) CLOCKS_PER_SEC); fwprintf(stderr, L"Post-length: %"Z"u\n" \ "Duration : %s seconds\n", PRTF_SZ_CAST wcslen(newwstr), duration_str); } /* Free allocated memory and exit with successful result */ free(newwstr); free(mywstr); return EXIT_SUCCESS; }