Docs
String Parsing
String Parsing and Tokenization in C
Table of Contents
- Introduction
- Understanding Tokenization
- The strtok Function
- Thread-Safe strtok_r
- Custom Tokenizers
- Parsing Structured Data
- Number Parsing
- Date and Time Parsing
- Pattern Matching
- Error Handling
- Best Practices
- Summary
Introduction
String parsing is the process of analyzing and extracting meaningful data from text. This is a fundamental skill for:
- Reading configuration files
- Processing user input
- Parsing data formats (CSV, logs, etc.)
- Implementing simple interpreters
- Network protocol handling
This module covers tokenization techniques and parsing strategies in C.
Understanding Tokenization
What is Tokenization?
Tokenization breaks a string into smaller pieces called tokens, separated by delimiters.
String: "apple,banana,cherry"
Delimiter: ","
Tokens: ["apple", "banana", "cherry"]
Visual Representation
Input String:
┌───────────────────────────────────────┐
│ a p p l e , b a n a n a , c h e r r y │
└───────────────────────────────────────┘
            ↑             ↑
        delimiter     delimiter
After Tokenization:
Token 1: "apple"
Token 2: "banana"
Token 3: "cherry"
Types of Delimiters
// Single character delimiter
"a,b,c" with "," -> ["a", "b", "c"]
// Multiple character delimiters (any of them)
"a,b;c:d" with ",;:" -> ["a", "b", "c", "d"]
// Whitespace delimiters
"hello world" with " " -> ["hello", "world"]
// Tab and space
"a\tb c" with " \t" -> ["a", "b", "c"]
The strtok Function
Function Signature
#include <string.h>
char *strtok(char *str, const char *delim);
- str: String to tokenize (first call) or NULL (subsequent calls)
- delim: String containing delimiter characters
- Returns: Pointer to next token, or NULL if no more tokens
How strtok Works
- First call: Pass the string to tokenize
- Subsequent calls: Pass NULL to continue tokenizing
- strtok modifies the original string (replaces delimiters with '\0')
- Uses a static variable internally (not thread-safe)
Basic Usage
#include <stdio.h>
#include <string.h>
// Prints every comma-separated token of a fixed string, one per line.
int main(void) {
    char str[] = "apple,banana,cherry";

    // The buffer is handed to strtok once; each later call passes NULL so
    // strtok resumes where the previous token ended.
    for (char *token = strtok(str, ","); token != NULL; token = strtok(NULL, ",")) {
        printf("Token: %s\n", token);
    }

    return 0;
}
Output:
Token: apple
Token: banana
Token: cherry
strtok Modifies the String
char str[] = "one,two,three";
printf("Before: '%s'\n", str); // "one,two,three"
char *token = strtok(str, ",");
printf("After: '%s'\n", str); // "one" (modified!)
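// Memory view once every token has been extracted. Each strtok call
// replaces only one delimiter, so after the single call above just the
// first ',' has become '\0'.
// 'o' 'n' 'e' '\0' 't' 'w' 'o' '\0' 't' 'h' 'r' 'e' 'e' '\0'
//              ↑                ↑
// Delimiters replaced with null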
Multiple Delimiters
char str[] = "hello;world,how:are you";
char *token = strtok(str, ";,: "); // Any of these is a delimiter
while (token != NULL) {
printf("'%s'\n", token);
token = strtok(NULL, ";,: ");
}
// Output: 'hello', 'world', 'how', 'are', 'you'
Handling Empty Tokens
Important: strtok skips consecutive delimiters!
char str[] = "a,,b,c";
char *token = strtok(str, ",");
while (token != NULL) {
printf("'%s'\n", token);
token = strtok(NULL, ",");
}
// Output: 'a', 'b', 'c'
// Note: Empty token between "a" and "b" is SKIPPED!
Counting Tokens First
// Counts the tokens that strtok(str, delim) would produce.
//
// The string is only read: nothing is copied or modified, and strtok itself
// is not called, so a tokenization the caller already has in progress is
// left undisturbed.
//
// str:   NUL-terminated string to scan
// delim: set of delimiter characters; a run of delimiters separates once
// Returns the number of non-empty tokens, or -1 if str or delim is NULL.
int count_tokens(const char *str, const char *delim) {
    if (str == NULL || delim == NULL) {
        return -1;
    }

    int count = 0;
    const char *p = str + strspn(str, delim);  // skip leading delimiters
    while (*p != '\0') {
        count++;
        p += strcspn(p, delim);  // step over the token itself
        p += strspn(p, delim);   // step over the delimiters that follow it
    }
    return count;
}
Thread-Safe strtok_r
The Problem with strtok
// strtok uses internal static state
// Two threads cannot safely tokenize simultaneously
// Thread 1: strtok(str1, ",")
// Thread 2: strtok(str2, ";")
// They interfere with each other!
strtok_r Solution
#define _POSIX_C_SOURCE 200809L
#include <string.h>
char *strtok_r(char *str, const char *delim, char **saveptr);
- saveptr: Pointer to a char* that stores the tokenizer state between calls
- Each tokenization has its own state
- Thread-safe and reentrant
Using strtok_r
char str[] = "one,two,three";
char *saveptr;
char *token;
token = strtok_r(str, ",", &saveptr);
while (token != NULL) {
printf("Token: %s\n", token);
token = strtok_r(NULL, ",", &saveptr);
}
Nested Tokenization
// Parse "name=value;name=value" pairs
char str[] = "user=john;pass=secret;role=admin";
char *saveptr1, *saveptr2;
char *pair, *key, *value;
pair = strtok_r(str, ";", &saveptr1);
while (pair != NULL) {
key = strtok_r(pair, "=", &saveptr2);
value = strtok_r(NULL, "=", &saveptr2);
printf("Key: '%s', Value: '%s'\n", key, value ? value : "(none)");
pair = strtok_r(NULL, ";", &saveptr1);
}
Output:
Key: 'user', Value: 'john'
Key: 'pass', Value: 'secret'
Key: 'role', Value: 'admin'
Custom Tokenizers
Tokenizer That Handles Empty Fields
// strtok_r-style tokenizer that returns empty fields instead of skipping them.
//
// str:     buffer to tokenize on the first call, NULL on later calls
//          (the buffer is modified: each delimiter found becomes '\0')
// delim:   the single delimiter character
// saveptr: caller-owned state; set to NULL once the last field was returned
//
// Returns the next field (possibly an empty string), or NULL when no fields
// remain. N delimiters always produce N + 1 fields, so "a,b," yields
// "a", "b", "" and an empty input yields one empty field.
char *strtok_empty(char *str, char delim, char **saveptr) {
    if (saveptr == NULL) {
        return NULL;
    }
    if (str != NULL) {
        *saveptr = str;
    }
    // NULL (not "points at '\0'") marks exhaustion; otherwise the empty field
    // after a trailing delimiter could not be told apart from end of input.
    if (*saveptr == NULL) {
        return NULL;
    }

    char *token_start = *saveptr;
    char *end = token_start;

    // Find delimiter or end of string. Testing for '\0' first keeps the scan
    // inside the buffer even when delim itself is '\0'.
    while (*end != '\0' && *end != delim) {
        end++;
    }

    if (*end == '\0') {
        *saveptr = NULL;  // this was the last field
    } else {
        *end = '\0';
        *saveptr = end + 1;
    }
    return token_start;
}
// Usage
char str[] = "a,,b,c";
char *saveptr;
char *token = strtok_empty(str, ',', &saveptr);
while (token != NULL) {
printf("'%s'\n", token); // 'a', '', 'b', 'c'
token = strtok_empty(NULL, ',', &saveptr);
}
Tokenizer That Preserves Original String
// State for a tokenizer that never writes to the string it scans.
typedef struct {
    const char *str;    // string being tokenized (read-only)
    const char *delim;  // set of delimiter characters
    size_t current;     // offset in str where the next scan begins
} Tokenizer;

// Prepares t to tokenize str using any character of delim as a separator.
// Both strings must outlive the tokenizer; they are referenced, not copied.
void tokenizer_init(Tokenizer *t, const char *str, const char *delim) {
    t->str = str;
    t->delim = delim;
    t->current = 0;
}

// Finds the next token.
//
// Returns length of token, -1 if done
// Stores token start position in *start
//
// Runs of delimiters are treated as one separator everywhere, including at
// the very start of the string, so a returned token is never empty.
int tokenizer_next(Tokenizer *t, size_t *start) {
    // Skip delimiters in front of the token. Doing this before the
    // end-of-string test is what stops leading delimiters (or an input made
    // only of delimiters) from producing a zero-length token.
    size_t pos = t->current + strspn(t->str + t->current, t->delim);
    if (t->str[pos] == '\0') {
        t->current = pos;
        return -1;
    }

    // The token extends up to the next delimiter or the end of the string.
    size_t len = strcspn(t->str + pos, t->delim);
    *start = pos;
    t->current = pos + len;
    return (int)len;
}
Splitting into Array
// Splits str in place with strtok and records a pointer to each token.
//
// str:        writable buffer; strtok overwrites the delimiters it finds
// tokens:     output array that receives pointers into str
// max_tokens: capacity of tokens
// delim:      set of delimiter characters
// Returns the number of pointers stored (never more than max_tokens).
int split_string(char *str, char *tokens[], int max_tokens,
                 const char *delim) {
    int stored = 0;

    for (char *piece = strtok(str, delim);
         piece != NULL && stored < max_tokens;
         piece = strtok(NULL, delim)) {
        tokens[stored] = piece;
        stored++;
    }

    return stored;
}
// Usage
char str[] = "one,two,three,four";
char *tokens[10];
int count = split_string(str, tokens, 10, ",");
for (int i = 0; i < count; i++) {
printf("tokens[%d] = '%s'\n", i, tokens[i]);
}
Parsing Structured Data
Parsing CSV Data
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

// One employee record as stored in a "name,age,salary" CSV line.
typedef struct {
    char name[50];
    int age;
    double salary;
} Employee;

// Parses a "name,age,salary" line into *emp.
//
// line: NUL-terminated input; it is only read, so no scratch copy is needed
// emp:  destination; written only when the whole line parses successfully
//
// Returns 1 on success, 0 if a field is missing, the name is empty, or the
// age/salary is not a valid in-range number. Names longer than the name
// buffer are truncated. Whitespace after a number and any further
// comma-separated fields after the salary are ignored.
int parse_csv_line(const char *line, Employee *emp) {
    if (line == NULL || emp == NULL) {
        return 0;
    }

    // Parse name: everything before the first comma
    const char *name_end = strchr(line, ',');
    if (name_end == NULL || name_end == line) {
        return 0;
    }
    size_t name_len = (size_t)(name_end - line);
    if (name_len > sizeof(emp->name) - 1) {
        name_len = sizeof(emp->name) - 1;
    }

    // Parse age: strtol reports "no digits" and overflow, which atoi cannot
    const char *field = name_end + 1;
    char *end;
    errno = 0;
    long age = strtol(field, &end, 10);
    if (end == field || errno == ERANGE || age < INT_MIN || age > INT_MAX) {
        return 0;
    }
    while (isspace((unsigned char)*end)) {
        end++;
    }
    if (*end != ',') {
        return 0;  // junk after the number, or the salary field is missing
    }

    // Parse salary
    field = end + 1;
    errno = 0;
    double salary = strtod(field, &end);
    if (end == field || errno == ERANGE) {
        return 0;
    }
    while (isspace((unsigned char)*end)) {
        end++;
    }
    if (*end != '\0' && *end != ',') {
        return 0;
    }

    // Everything validated: commit. memcpy plus an explicit terminator always
    // leaves name NUL-terminated, which strncpy alone does not guarantee.
    memcpy(emp->name, line, name_len);
    emp->name[name_len] = '\0';
    emp->age = (int)age;
    emp->salary = salary;
    return 1;
}
// Usage
Employee emp;
if (parse_csv_line("John Doe,30,50000.50", &emp)) {
printf("Name: %s, Age: %d, Salary: %.2f\n",
emp.name, emp.age, emp.salary);
}
Parsing Key-Value Pairs
// A single "key = value" setting.
typedef struct {
    char key[64];
    char value[256];
} KeyValue;

// Copies the text in [begin, end) into dst with surrounding whitespace
// removed. The text is cut to fit dst first and right-trimmed afterwards,
// and dst is always NUL-terminated.
static void kv_copy_trimmed(char *dst, size_t dst_size,
                            const char *begin, const char *end) {
    // ctype functions require a value representable as unsigned char.
    while (begin < end && isspace((unsigned char)*begin)) {
        begin++;
    }
    size_t len = (size_t)(end - begin);
    if (len > dst_size - 1) {
        len = dst_size - 1;
    }
    while (len > 0 && isspace((unsigned char)begin[len - 1])) {
        len--;
    }
    memcpy(dst, begin, len);
    dst[len] = '\0';
}

// Splits "key = value" at the first '=' and stores both halves, trimmed of
// leading and trailing whitespace, in *kv.
//
// line: NUL-terminated input (not modified)
// kv:   destination; over-long keys or values are truncated to fit
// Returns 1 on success, 0 if line or kv is NULL or line contains no '='.
int parse_key_value(const char *line, KeyValue *kv) {
    if (line == NULL || kv == NULL) {
        return 0;
    }

    // Find '=' separator
    const char *eq = strchr(line, '=');
    if (eq == NULL) {
        return 0;
    }

    const char *value = eq + 1;
    kv_copy_trimmed(kv->key, sizeof(kv->key), line, eq);
    kv_copy_trimmed(kv->value, sizeof(kv->value), value, value + strlen(value));
    return 1;
}
Parsing Configuration Files
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Connection settings read from a "key = value" configuration file.
typedef struct {
    char host[64];
    int port;
    char username[32];
    char password[64];
} Config;

// Removes leading and trailing whitespace from s in place and returns a
// pointer to the first non-space character.
static char *config_trim(char *s) {
    // ctype functions require a value representable as unsigned char.
    while (isspace((unsigned char)*s)) {
        s++;
    }
    size_t len = strlen(s);
    while (len > 0 && isspace((unsigned char)s[len - 1])) {
        s[--len] = '\0';
    }
    return s;
}

// Copies src into dst, truncating if necessary; dst is always terminated.
static void config_copy(char *dst, size_t dst_size, const char *src) {
    size_t len = strlen(src);
    if (len > dst_size - 1) {
        len = dst_size - 1;
    }
    memcpy(dst, src, len);
    dst[len] = '\0';
}

// Reads "key = value" lines from file into *config.
//
// Defaults (host "localhost", port 8080, empty username/password) are set
// first. Blank lines, lines whose first non-space character is '#', lines
// without '=', and unknown keys are skipped. Whitespace around keys and
// values is ignored.
//
// Returns 1 on success, 0 if file or config is NULL or a port value is not
// a decimal number in 1..65535.
int parse_config(FILE *file, Config *config) {
    char line[256];

    if (file == NULL || config == NULL) {
        return 0;
    }

    // Initialize defaults
    strcpy(config->host, "localhost");
    config->port = 8080;
    config->username[0] = '\0';
    config->password[0] = '\0';

    while (fgets(line, sizeof(line), file)) {
        // Trimming the whole line also removes the newline fgets keeps
        char *key = config_trim(line);

        // Skip empty lines and comments
        if (*key == '\0' || *key == '#') {
            continue;
        }

        // Parse key=value
        char *eq = strchr(key, '=');
        if (eq == NULL) {
            continue;
        }
        *eq = '\0';

        // Trim both sides so "host = x" and "host=x" are equivalent
        key = config_trim(key);
        char *value = config_trim(eq + 1);

        if (strcmp(key, "host") == 0) {
            config_copy(config->host, sizeof(config->host), value);
        } else if (strcmp(key, "port") == 0) {
            char *end;
            errno = 0;
            long port = strtol(value, &end, 10);
            if (end == value || *end != '\0' || errno == ERANGE ||
                port < 1 || port > 65535) {
                return 0;
            }
            config->port = (int)port;
        } else if (strcmp(key, "username") == 0) {
            config_copy(config->username, sizeof(config->username), value);
        } else if (strcmp(key, "password") == 0) {
            config_copy(config->password, sizeof(config->password), value);
        }
    }
    return 1;
}
Number Parsing
Basic Number Conversion Functions
#include <stdlib.h>
int i = atoi("123"); // String to int
long l = atol("123456789"); // String to long
double d = atof("3.14"); // String to double
// Note: ato* functions have no error detection!
Safe Number Parsing with strtol
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
// Converts the whole of str to an int using strtol in base 10.
//
// str:    NUL-terminated text to convert
// result: receives the value; left untouched when conversion fails
// Returns 1 on success. Returns 0 when strtol reports a range error, when
// no digits were consumed, when characters follow the number, or when the
// value does not fit in an int.
int parse_int_safe(const char *str, int *result) {
    char *stop = NULL;

    errno = 0;  // strtol only sets errno on failure, so clear it first
    const long parsed = strtol(str, &stop, 10);

    const int range_error = (errno == ERANGE);
    const int nothing_read = (stop == str);
    if (range_error || nothing_read) {
        return 0;
    }

    const int trailing_text = (*stop != '\0');
    const int outside_int = (parsed < INT_MIN || parsed > INT_MAX);
    if (trailing_text || outside_int) {
        return 0;
    }

    *result = (int)parsed;
    return 1;
}
// Usage
int num;
if (parse_int_safe("123", &num)) {
printf("Parsed: %d\n", num);
} else {
printf("Invalid number\n");
}
Parsing Different Bases
// strtol supports bases 2-36
long value;
char *endptr;
value = strtol("1010", &endptr, 2); // Binary: 10
value = strtol("777", &endptr, 8); // Octal: 511
value = strtol("FF", &endptr, 16); // Hex: 255
value = strtol("0xFF", &endptr, 0); // Auto-detect: 255
value = strtol("0777", &endptr, 0); // Auto-detect octal: 511
Parsing Floating-Point Numbers
#include <stdlib.h>
#include <errno.h>
// Converts the whole of str to a double using strtod.
//
// str:    NUL-terminated text to convert
// result: receives the value; left untouched when conversion fails
// Returns 1 on success. Returns 0 when strtod reports a range error, when
// nothing was converted, or when characters follow the number.
int parse_double_safe(const char *str, double *result) {
    char *stop = NULL;

    errno = 0;  // strtod only sets errno on failure, so clear it first
    const double parsed = strtod(str, &stop);

    const int range_error = (errno == ERANGE);
    const int nothing_read = (stop == str);
    if (range_error || nothing_read) {
        return 0;
    }
    if (*stop != '\0') {
        return 0;  // extra characters
    }

    *result = parsed;
    return 1;
}
// Handles: "3.14", "-2.5e10", ".5", "1e-5"
Date and Time Parsing
Simple Date Parsing
#include <ctype.h>
#include <string.h>

// A calendar date.
typedef struct {
    int year;
    int month;
    int day;
} Date;

// Number of days in month (1-12) of year, using Gregorian leap-year rules.
static int date_days_in_month(int year, int month) {
    static const int days[12] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
    int leap = (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
    return (month == 2 && leap) ? 29 : days[month - 1];
}

// Parses a date in the exact form YYYY-MM-DD.
//
// str:  NUL-terminated input; must be exactly 10 characters long
// date: destination; written only when the date is valid
// Returns 1 on success, 0 if the layout is wrong, any of the eight digit
// positions holds a non-digit, or the month/day is outside the calendar
// (including days that do not exist in the given month).
int parse_date(const char *str, Date *date) {
    // Offsets of the characters that must be decimal digits.
    static const int digit_pos[8] = {0, 1, 2, 3, 5, 6, 8, 9};

    if (str == NULL || date == NULL) {
        return 0;
    }

    // Format: YYYY-MM-DD
    if (strlen(str) != 10) return 0;
    if (str[4] != '-' || str[7] != '-') return 0;

    // atoi would silently turn "20a4" into 20 and " 1" into 1, so check
    // every digit position explicitly before converting.
    for (size_t i = 0; i < sizeof digit_pos / sizeof digit_pos[0]; i++) {
        if (!isdigit((unsigned char)str[digit_pos[i]])) {
            return 0;
        }
    }

    int year = (str[0] - '0') * 1000 + (str[1] - '0') * 100 +
               (str[2] - '0') * 10 + (str[3] - '0');
    int month = (str[5] - '0') * 10 + (str[6] - '0');
    int day = (str[8] - '0') * 10 + (str[9] - '0');

    // Validate
    if (month < 1 || month > 12) return 0;
    if (day < 1 || day > date_days_in_month(year, month)) return 0;

    date->year = year;
    date->month = month;
    date->day = day;
    return 1;
}
Using sscanf for Date Parsing
// Parses "YEAR-MONTH-DAY" with sscanf. Unlike a fixed-width parser the
// fields need not be zero-padded ("2024-1-5" is accepted).
//
// str:  NUL-terminated input
// date: destination; written only when the date is valid
// Returns 1 on success, 0 if fewer than three numbers were read, anything
// follows the day, or the month/day does not exist in the calendar.
int parse_date_scanf(const char *str, Date *date) {
    static const int month_days[12] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
    int year, month, day;
    int consumed = 0;

    if (str == NULL || date == NULL) {
        return 0;
    }

    // %n records how many characters were consumed, which is the only way to
    // notice trailing text such as "2024-01-15xyz"; it is not counted in
    // sscanf's return value.
    if (sscanf(str, "%d-%d-%d%n", &year, &month, &day, &consumed) != 3) {
        return 0;
    }
    if (str[consumed] != '\0') {
        return 0;
    }

    // Validate
    if (month < 1 || month > 12) return 0;
    int last_day = month_days[month - 1];
    if (month == 2 && ((year % 4 == 0 && year % 100 != 0) || year % 400 == 0)) {
        last_day = 29;  // leap-year February
    }
    if (day < 1 || day > last_day) return 0;

    date->year = year;
    date->month = month;
    date->day = day;
    return 1;
}
Time Parsing
// A time of day.
typedef struct {
    int hour;
    int minute;
    int second;
} Time;

// Parses a time of day.
//
// Formats: HH:MM or HH:MM:SS (seconds default to 0 when omitted)
//
// str:  NUL-terminated input
// time: destination; written only when the time is valid
// Returns 1 on success, 0 if the text does not match either format, has
// anything after the last field, or a field is out of range.
int parse_time(const char *str, Time *time) {
    int hour, minute, second = 0;
    int end_hm = -1;   // characters consumed through MM
    int end_hms = -1;  // characters consumed through SS

    if (str == NULL || time == NULL) {
        return 0;
    }

    // %n stores the number of characters read so far and is not counted in
    // the return value; it lets us reject input like "12:30abc" or "12:30:".
    int fields = sscanf(str, "%d:%d%n:%d%n",
                        &hour, &minute, &end_hm, &second, &end_hms);
    if (fields == 3) {
        if (str[end_hms] != '\0') return 0;
    } else if (fields == 2) {
        if (str[end_hm] != '\0') return 0;
        second = 0;
    } else {
        return 0;
    }

    // Validate
    if (hour < 0 || hour > 23) return 0;
    if (minute < 0 || minute > 59) return 0;
    if (second < 0 || second > 59) return 0;

    time->hour = hour;
    time->minute = minute;
    time->second = second;
    return 1;
}
Pattern Matching
Simple Pattern Matching
// Compares str against pattern, where '?' in pattern stands for any one
// character and every other pattern character must match exactly.
// Returns 1 when the two strings match over their full length, else 0.
int match_simple(const char *pattern, const char *str) {
    for (; *pattern != '\0' && *str != '\0'; pattern++, str++) {
        const int wildcard = (*pattern == '?');
        if (!wildcard && *pattern != *str) {
            return 0;
        }
    }

    // A match requires both strings to run out at the same time.
    return !*pattern && !*str;
}
// Usage
match_simple("h?llo", "hello"); // true
match_simple("h?llo", "hallo"); // true
match_simple("h?llo", "hxllo"); // true
match_simple("h??lo", "hello"); // true
Wildcard Pattern Matching
// Match with * (any sequence, including empty) and ? (single char)
//
// Returns 1 when the whole of str matches pattern, else 0.
//
// Iterative matcher that remembers only the most recent '*': on a mismatch
// it lets that '*' absorb one more character and resumes from there. Earlier
// stars never need revisiting, so the run time is bounded by
// strlen(pattern) * strlen(str) rather than growing exponentially with the
// number of stars, as a recursive retry at every '*' does.
int match_wildcard(const char *pattern, const char *str) {
    const char *after_star = NULL;  // pattern position just past the last '*'
    const char *retry = NULL;       // str position that '*' currently ends at

    while (*str != '\0') {
        if (*pattern == '*') {
            // Start by letting '*' match nothing; remember where to come back.
            after_star = ++pattern;
            retry = str;
        } else if (*pattern == '?' || *pattern == *str) {
            pattern++;
            str++;
        } else if (after_star != NULL) {
            // Mismatch: widen the last '*' by one character and try again.
            pattern = after_star;
            str = ++retry;
        } else {
            return 0;
        }
    }

    // Handle trailing *
    while (*pattern == '*') pattern++;
    return *pattern == '\0';
}
// Usage
match_wildcard("*.txt", "file.txt"); // true
match_wildcard("test*", "testing123"); // true
match_wildcard("*hello*", "say hello now"); // true
Extracting with sscanf
// sscanf as simple pattern extractor
char name[50];
int age;
double score;
// Parse formatted string
sscanf("John,25,95.5", "%[^,],%d,%lf", name, &age, &score);
printf("Name: %s, Age: %d, Score: %.1f\n", name, age, score);
// Parse with literal text
int x, y;
sscanf("Point(10, 20)", "Point(%d, %d)", &x, &y);
printf("x=%d, y=%d\n", x, y);
// Extract IP address
int a, b, c, d;
sscanf("192.168.1.100", "%d.%d.%d.%d", &a, &b, &c, &d);
Error Handling
Validation Before Parsing
// Checks that str is an optional '+' or '-' followed by one or more decimal
// digits and nothing else.
// Returns 1 if so, 0 otherwise (including for NULL, "" and a lone sign).
int is_valid_integer(const char *str) {
    if (str == NULL) return 0;
    if (*str == '-' || *str == '+') str++;
    if (*str == '\0') return 0;  // Empty or only sign
    while (*str) {
        // The cast is required: passing a negative char (any byte >= 0x80
        // where char is signed) to isdigit is undefined behavior.
        if (!isdigit((unsigned char)*str)) return 0;
        str++;
    }
    return 1;
}
// Checks that str has the shape of a decimal floating-point literal:
// optional sign, digits with at most one '.', and an optional exponent
// ('e' or 'E', optional sign, at least one digit).
// Returns 1 if so, 0 otherwise (including for NULL and "").
int is_valid_float(const char *str) {
    int has_digit = 0;  // a digit was seen in the part currently being read
    int has_dot = 0;
    int has_exp = 0;

    if (str == NULL) return 0;
    if (*str == '-' || *str == '+') str++;

    while (*str) {
        // The cast is required: passing a negative char to isdigit is
        // undefined behavior.
        if (isdigit((unsigned char)*str)) {
            has_digit = 1;
        } else if (*str == '.') {
            // Only one dot, and never inside the exponent
            if (has_dot || has_exp) return 0;
            has_dot = 1;
        } else if (*str == 'e' || *str == 'E') {
            // The mantissa needs a digit, and only one exponent is allowed
            if (!has_digit || has_exp) return 0;
            has_exp = 1;
            has_digit = 0;  // Need digits after 'e'
            if (*(str + 1) == '+' || *(str + 1) == '-') str++;
        } else {
            return 0;
        }
        str++;
    }

    // Fails for inputs that end without a digit in the last part ("1e", ".")
    return has_digit;
}
Parser Return Codes
// Result codes returned by parse_record and turned into text by
// parse_error_string. PARSE_OK is 0; every other value is a failure.
typedef enum {
// Parsing succeeded.
PARSE_OK = 0,
// The input line was NULL or an empty string.
PARSE_EMPTY,
// Described as "Invalid format"; not returned by the code shown here.
PARSE_INVALID_FORMAT,
// Described as "Number overflow"; not returned by the code shown here.
PARSE_OVERFLOW,
// A required field was not found in the line.
PARSE_MISSING_FIELD,
// Another token was found after the expected fields.
PARSE_EXTRA_DATA
} ParseResult;
// Parses one comma-separated record (field parsing is elided in this
// example, so rec is not touched by the code shown).
//
// line: NUL-terminated input; only the first 255 characters are examined
// rec:  destination for the parsed fields
// Returns PARSE_EMPTY for a NULL or empty line, PARSE_MISSING_FIELD when no
// token is found, PARSE_EXTRA_DATA when a token remains after the expected
// fields, and PARSE_OK otherwise.
ParseResult parse_record(const char *line, Record *rec) {
    if (line == NULL || *line == '\0') {
        return PARSE_EMPTY;
    }

    char buffer[256];
    strncpy(buffer, line, sizeof(buffer) - 1);
    // strncpy writes no terminator when line has 255 or more characters, and
    // the last byte of buffer is otherwise uninitialized; terminate
    // explicitly so strtok cannot read past the end of the array.
    buffer[sizeof(buffer) - 1] = '\0';

    char *token = strtok(buffer, ",");
    if (token == NULL) return PARSE_MISSING_FIELD;
    // ... parse fields ...
    // Check for extra data
    if (strtok(NULL, ",") != NULL) {
        return PARSE_EXTRA_DATA;
    }
    return PARSE_OK;
}
// Maps a ParseResult to a fixed, human-readable message.
// Values that are not one of the enumerators yield "Unknown error".
// The returned string is a constant and must not be freed or modified.
const char *parse_error_string(ParseResult result) {
    static const char *const messages[] = {
        [PARSE_OK]             = "Success",
        [PARSE_EMPTY]          = "Empty input",
        [PARSE_INVALID_FORMAT] = "Invalid format",
        [PARSE_OVERFLOW]       = "Number overflow",
        [PARSE_MISSING_FIELD]  = "Missing required field",
        [PARSE_EXTRA_DATA]     = "Extra data at end",
    };

    // The unsigned conversion sends negative values past the end of the
    // table, so one comparison rejects everything outside the enumerators.
    if ((unsigned long)result >= sizeof messages / sizeof messages[0]) {
        return "Unknown error";
    }
    return messages[result];
}
Best Practices
1. Always Copy Before Tokenizing
// GOOD: Tokenize a private copy so the caller's string is left intact.
// Passes every comma-separated token of input to process(). Does nothing if
// the copy cannot be allocated.
void parse_safe(const char *input) {
    char *scratch = strdup(input);
    if (scratch == NULL) {
        return;
    }

    for (char *piece = strtok(scratch, ","); piece; piece = strtok(NULL, ",")) {
        process(piece);
    }

    free(scratch);  // tokens point into scratch, so release it only now
}
// BAD: Modifying input
// Tokenizes the caller's buffer directly: strtok overwrites the delimiters
// it finds in input with '\0', so the caller's string is changed by this
// call. The rest of the body is elided in this example.
void parse_bad(char *input) {
char *token = strtok(input, ","); // Modifies input!
// ...
}
2. Check Return Values
// GOOD: Verify each step of the parse.
// Reads two comma-separated integers from str into *a and *b.
// Returns 1 only when str is non-NULL and both numbers were read, else 0.
int parse_carefully(const char *str, int *a, int *b) {
    // sscanf reports how many fields it stored; anything short of 2 means
    // at least one number is missing.
    return str != NULL && sscanf(str, "%d,%d", a, b) == 2;
}
3. Handle Buffer Sizes
// GOOD: Bound every string conversion.
#define MAX_NAME 50

// Reads the first whitespace-delimited word of input into name.
// name must have room for at least 50 bytes: the conversion stores at most
// 49 characters plus the terminating '\0'.
// Returns 1 if a word was read, 0 otherwise.
int parse_name(const char *input, char *name) {
    // The field width in %49s is what prevents a buffer overflow.
    return sscanf(input, "%49s", name) == 1;
}
4. Use strtok_r for Complex Parsing
// GOOD: Nested parsing with strtok_r
void parse_nested(char *str) {
char *outer_save, *inner_save;
char *outer = strtok_r(str, ";", &outer_save);
while (outer) {
char *inner = strtok_r(outer, ",", &inner_save);
while (inner) {
process(inner);
inner = strtok_r(NULL, ",", &inner_save);
}
outer = strtok_r(NULL, ";", &outer_save);
}
}
5. Validate Input Ranges
// Parses str as a TCP/UDP port number.
//
// str:  NUL-terminated text to convert
// port: receives the value; left untouched on failure
// Returns 1 if str is a valid integer in 1..65535, 0 otherwise.
int parse_port(const char *str, int *port) {
    int candidate;

    // Short-circuiting keeps candidate from being read when parsing failed.
    if (!parse_int_safe(str, &candidate) || candidate < 1 || candidate > 65535) {
        return 0;
    }

    *port = candidate;
    return 1;
}
Summary
Key Functions
| Function | Purpose | Thread-Safe |
|---|---|---|
| strtok() | Basic tokenization | No |
| strtok_r() | Thread-safe tokenization | Yes |
| sscanf() | Pattern-based parsing | Yes |
| strtol() | Safe integer parsing | Yes |
| strtod() | Safe float parsing | Yes |
| atoi/atof() | Quick conversion (no error check) | Yes |
Parsing Patterns
// CSV line (split_string tokenizes line in place)
char *fields[10];
int count = split_string(line, fields, 10, ",");
// Key-value
char *key = NULL, *value = NULL;
char *eq = strchr(line, '=');
if (eq != NULL) {  // strchr returns NULL when the line has no '='
    *eq = '\0';
    key = line;
    value = eq + 1;
}
// Numbers with validation
int num;
if (parse_int_safe(str, &num)) { ... }
// Formatted data (x and y are only valid when both fields were read)
int x, y;
if (sscanf(str, "Point(%d,%d)", &x, &y) == 2) { ... }
Error Handling Checklist
- [ ] Check for NULL input
- [ ] Validate string format
- [ ] Check parse return values
- [ ] Validate ranges
- [ ] Handle empty fields
- [ ] Free temporary allocations
- [ ] Use thread-safe functions in threaded code