HTTP Parser and message builder / objects in plain C Snapshot
|
00001 #include "uri.h" 00002 #include "charclass.h" 00003 #include <string.h> 00004 #include <arpa/inet.h> 00005 #include "sutils.h" 00006 00007 00008 00009 // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" 00010 M_INLINE int is_mark(int8_t ch) { 00011 return ch == '-' || ch == '_' || ch == '.' || ch == '!' || ch == '~' || ch == '*' || ch == '\'' || ch == '(' || ch == ')'; 00012 } 00013 00014 // unreserved = alphanum | mark 00015 M_INLINE int is_unreserved(int8_t ch) { 00016 return is_alphanum(ch) || is_mark(ch); 00017 } 00018 00019 // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," 00020 M_INLINE int is_reserved(int8_t ch) { 00021 return ch == ';' || ch == '/' || ch == '?' || ch == ':' || ch == '@' || ch == '&' || ch == '=' || ch == '+' || ch == '$' || ch == ','; 00022 } 00023 00024 00025 typedef struct tagURIPARSECTX { 00026 URI *rep; 00027 char *cdata_pos,*cdata_pos_start,*cdata_pos_off; 00028 char *cdata_raw_pos; 00029 00030 char *cdata_is_escaped_start; 00031 00032 } URIPARSECTX; 00033 00034 00035 char *ctx_copy_string_raw( URIPARSECTX *ctx, char *start, char *end ) 00036 { 00037 char *ret = ctx->cdata_raw_pos; 00038 00039 strncpy( ctx->cdata_raw_pos, start, end - start ); 00040 ctx->cdata_raw_pos += end - start; 00041 * ctx->cdata_raw_pos ++ ='\0'; 00042 00043 return ret; 00044 } 00045 00046 void ctx_add_escaped_char( URIPARSECTX *ctx, char ch, int char_encoded ) 00047 { 00048 M_UNUSED( char_encoded ); 00049 * ctx->cdata_pos = ch; 00050 00051 * ( ctx->cdata_is_escaped_start + ( ctx->cdata_pos - ctx->cdata_pos_off ) ) = (char) char_encoded; 00052 00053 ++ ctx->cdata_pos; 00054 00055 //ctx->cdata_pos - ctx->cdata_pos_start 00056 } 00057 00058 char *ctx_finish_escaped_string( URIPARSECTX *ctx ) 00059 { 00060 char *ret = ctx->cdata_pos_start; 00061 00062 * ctx->cdata_pos ++ = '\0'; 00063 ctx->cdata_pos_start = ctx->cdata_pos; 00064 00065 return ret; 00066 } 00067 00068 void ctx_undo_escaped_string( URIPARSECTX *ctx ) 00069 { 00070 ctx->cdata_pos = ctx->cdata_pos_start; 00071 } 00072 00073 // escaped = "%" hex hex 00074 00075 M_INLINE int parse_escaped( URIPARSECTX *ctx, char *ptr, char ** next ) 00076 { 00077 int high,low; 00078 int unescaped_char; 00079 00080 if (*ptr != '%') { 00081 return 1; 00082 } 00083 00084 high = is_hex_ext( *(ptr + 1) ); 00085 if (! high ) { 00086 return -1; 00087 } 00088 00089 low = is_hex_ext( *(ptr + 2) ); 00090 if (! low ) { 00091 return -1; 00092 } 00093 00094 unescaped_char = (high << 4) | low; 00095 if (! (unescaped_char >=0 && unescaped_char <= 0x1F) ) { 00096 ctx_add_escaped_char( ctx, unescaped_char, 1 ); 00097 } 00098 00099 *next = ptr + 3; 00100 return 0; 00101 } 00102 00103 // uric = reserved | unreserved | escaped 00104 M_INLINE int parse_uric( URIPARSECTX *ctx, char *ptr, char **next ) 00105 { 00106 if (is_reserved( *ptr ) || is_unreserved( *ptr ) ) { 00107 ctx_add_escaped_char( ctx, *ptr, 0 ); 00108 *next = ptr +1; 00109 return 0; 00110 } 00111 return parse_escaped( ctx, ptr, next ); 00112 } 00113 00114 00115 M_INLINE int parse_uric_sequence( URIPARSECTX *ctx, char *ptr, char **next ) 00116 { 00117 int rt; 00118 00119 while( (rt = parse_uric( ctx, ptr, next )) == 0 ) { 00120 ptr = *next; 00121 } 00122 return rt; 00123 } 00124 00125 // pchar = unreserved | escaped | 00126 // ":" | "@" | "&" | "=" | "+" | "$" | "," 00127 M_INLINE int parse_pchar( URIPARSECTX *ctx, char *ptr, char **next) 00128 { 00129 char ch = *ptr; 00130 00131 if (is_unreserved( ch ) || ch == ':' || ch == '@' || ch == '&' || ch == '=' || ch == '+' || ch == '$' || ch == ',') { 00132 ctx_add_escaped_char( ctx, ch, 0 ); 00133 *next = ptr + 1; 00134 return 0; 00135 } 00136 return parse_escaped( ctx, ptr, next ); 00137 } 00138 00139 M_INLINE int parse_pchar_sequence( URIPARSECTX *ctx, char *ptr, char **next ) 00140 { 00141 int rt; 00142 00143 while( (rt = parse_pchar( ctx, ptr, next )) == 0 ) { 00144 ptr = *next; 00145 } 00146 return 0; 00147 } 00148 00149 // segment = *pchar *( ";" param ) 00150 M_INLINE int parse_segment( URIPARSECTX *ctx, char *ptr, char **next ) 00151 { 00152 if (parse_pchar_sequence( ctx, ptr, next ) < 0) { 00153 return -1; 00154 } 00155 00156 ptr = *next; 00157 if (*ptr == ';') { 00158 ctx_add_escaped_char( ctx, ';', 0 ); 00159 ptr = *next = ptr + 1; 00160 if (parse_pchar_sequence( ctx, ptr, next ) < 0) { 00161 return -1; 00162 } 00163 } 00164 return 0; 00165 } 00166 00167 //path_segments = segment *( "/" segment ) 00168 M_INLINE int parse_path_segments( URIPARSECTX *ctx, char *ptr, char **next ) { 00169 if (parse_segment( ctx, ptr, next ) < 0) { 00170 return -1; 00171 } 00172 ptr = *next; 00173 while (*ptr == '/') { 00174 ctx_add_escaped_char( ctx, '/', 0 ); 00175 ptr = *next = ptr + 1; 00176 if (parse_segment( ctx, ptr, next ) < 0) { 00177 return -1; 00178 } 00179 ptr = *next; 00180 } 00181 return 0; 00182 } 00183 00184 M_INLINE int parse_ipv4_address( URIPARSECTX *ctx, char *ptr, char **next ) { 00185 int i; 00186 00187 char *start = ptr; 00188 00189 for (i=0; i < 4; i++) { 00190 if (! is_digit(*ptr)) { 00191 return -1; 00192 } 00193 for( ++ ptr; is_digit( *ptr ); ++ptr ); 00194 if ( i == 3) { 00195 break; 00196 } 00197 if (*ptr != '.') { 00198 return -1; 00199 } 00200 ++ ptr; 00201 } 00202 00203 *next = ptr; 00204 ctx->rep->flags |= URI_FLAGS_HOST_IPv4; 00205 ctx->rep->host = ctx_copy_string_raw(ctx, start, ptr ); 00206 00207 return 0; 00208 } 00209 00210 //[ userinfo "@" ] 00211 //userinfo = *( unreserved | escaped | 00212 // ";" | ":" | "&" | "=" | "+" | "$" | "," ) 00213 M_INLINE int parse_userinfo( URIPARSECTX *ctx, char *ptr, char **next) 00214 { 00215 char *start = ptr; 00216 int rt; 00217 00218 while ( *ptr != '@' ) { 00219 if ( is_unreserved( *ptr ) || *ptr == ';' || *ptr == ':' || *ptr == '&' 00220 || *ptr == '=' || *ptr == '+' || *ptr == '$' || *ptr == ',') { 00221 ctx_add_escaped_char( ctx, *ptr, 0 ); 00222 ++ ptr; 00223 continue; 00224 } 00225 if ((rt = parse_escaped( ctx, ptr, next )) != 0) { 00226 ctx_undo_escaped_string( ctx ); 00227 *next = start; 00228 return rt; 00229 } 00230 ptr = *next; 00231 } 00232 00233 ctx->rep->userinfo_raw = ctx_copy_string_raw(ctx, start, ptr ); 00234 ctx->rep->userinfo = ctx_finish_escaped_string(ctx); 00235 *next = ptr + 1; 00236 00237 return 0; 00238 } 00239 00240 00241 M_INLINE int parse_domainlabel( char *ptr, char **next ) 00242 { 00243 if (!is_alphanum( *ptr) ) { 00244 return -1; 00245 } 00246 for(ptr += 1; is_alphanum( *ptr ) || *ptr == '-'; ++ptr); 00247 00248 if (!is_alphanum( *(ptr-1) ) ) { 00249 return -1; 00250 } 00251 *next = ptr; 00252 00253 return 0; 00254 } 00255 00256 //hostname = *( domainlabel "." ) toplabel [ "." ] 00257 //domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 00258 //toplabel = alpha | alpha *( alphanum | "-" ) alphanum 00259 M_INLINE int parse_hostname( URIPARSECTX *ctx, char *ptr, char **next ) 00260 { 00261 char *last_component; 00262 char *start = ptr; 00263 00264 for ( ; ; ) { 00265 last_component = ptr; 00266 if (*ptr == '/') { 00267 goto ok; 00268 } 00269 if (parse_domainlabel( ptr, next) < 0) { 00270 return -1; 00271 } 00272 ptr = *next; 00273 00274 if (*ptr != '.') { 00275 break; 00276 } 00277 ++ptr; 00278 } 00279 // check that last component is top label 00280 if (is_digit( * last_component ) ) { 00281 return -1; 00282 } 00283 00284 ok: 00285 *next = ptr; 00286 00287 ctx->rep->flags |= URI_FLAGS_HOST_HOSTNAME; 00288 ctx->rep->host = ctx_copy_string_raw(ctx, start, ptr ); 00289 00290 return 0; 00291 } 00292 00293 00294 M_INLINE int parse_ipv6_address( URIPARSECTX *ctx, char *ptr, char **next ) 00295 { 00296 struct in6_addr addr; 00297 char *dup, *start; 00298 int rt; 00299 00300 00301 if (*ptr != '[') { 00302 return 1; 00303 } 00304 00305 start = ptr; 00306 for(;*ptr != ']' && *ptr != '\0'; ++ ptr); 00307 if (*ptr != ']') { 00308 return -1; 00309 } 00310 00311 dup = strdup_range( start+1, ptr ); 00312 if (!dup) { 00313 return -1; 00314 } 00315 rt = inet_pton( AF_INET6, dup, &addr); 00316 free(dup); 00317 if (rt == 1) { 00318 ctx->rep->flags |= URI_FLAGS_HOST_IPv6; 00319 ctx->rep->host = ctx_copy_string_raw(ctx, start, ptr ); 00320 00321 *next = ptr + 1; 00322 return 0; 00323 } 00324 return -1; 00325 } 00326 00327 00328 00329 //hostport = host [ ":" port ] 00330 //host = hostname | IPv4address | Ipv6address 00331 M_INLINE int parse_hostport( URIPARSECTX *ctx, char *ptr, char **next ) 00332 { 00333 char *start; 00334 00335 if ( ! parse_ipv4_address( ctx, ptr, next ) ) { 00336 goto pport; 00337 } 00338 00339 if ( ! parse_hostname( ctx, ptr, next) ) { 00340 goto pport; 00341 } 00342 00343 if ( ! parse_ipv6_address( ctx, ptr, next ) ) { 00344 goto pport; 00345 } 00346 00347 return -1; 00348 00349 pport: 00350 00351 ptr = *next; 00352 00353 if (*ptr != ':' ) { 00354 return 0; 00355 } 00356 00357 for( start = ptr = ptr + 1; is_digit( *ptr ); ++ ptr ); 00358 *next = ptr; 00359 00360 ctx->rep->port = atoi( start ); 00361 return 0; 00362 } 00363 00364 M_INLINE int parse_server( URIPARSECTX *ctx, char *ptr ,char **next ) 00365 { 00366 int rt ; 00367 00368 rt = parse_userinfo(ctx, ptr, next); 00369 if (rt < 0) { 00370 return -1; 00371 } 00372 ptr = *next; 00373 return parse_hostport( ctx, ptr, next); 00374 } 00375 00376 00377 M_INLINE int parse_authority( URIPARSECTX *ctx, char *ptr, char **next ) 00378 { 00379 if (!parse_server( ctx, ptr, next)) { 00380 return 0; 00381 } 00382 return -1; 00383 } 00384 00385 00386 // scheme = alpha *( alpha | digit | "+" | "-" | "." ) ":" 00387 M_INLINE int parse_scheme( URIPARSECTX *ctx, char *line, char **next ) 00388 { 00389 char *start = line; 00390 00391 if (is_alpha( *line )) { 00392 ++line; 00393 while( is_alphanum( *line ) || *line == '+' || *line == '-' || *line == '.') { 00394 ++line; 00395 } 00396 if ( *line == ':') { 00397 ctx->rep->flags |= URI_FLAGS_HAS_SCHEME; 00398 ctx->rep->scheme = ctx_copy_string_raw(ctx, start, line ); 00399 * next = line + 1; 00400 return 0; 00401 } 00402 } 00403 00404 return -1; 00405 } 00406 00407 M_INLINE int parse_abs_path( URIPARSECTX *ctx, char *ptr, char **next ) 00408 { 00409 char *start; 00410 00411 start = ptr; 00412 00413 if (*ptr == '/') { 00414 ctx_add_escaped_char( ctx, '/', 0 ); 00415 *next = ++ptr; 00416 } 00417 if (parse_path_segments( ctx, ptr, next ) < 0) { 00418 return -1; 00419 } 00420 if (*next != ptr) { 00421 ctx->rep->path_raw = ctx_copy_string_raw(ctx, start, *next ); 00422 ctx->rep->path = ctx_finish_escaped_string(ctx); 00423 } 00424 return 0; 00425 } 00426 00427 // authority [ abs_path ] 00428 M_INLINE int parse_net_path( URIPARSECTX *ctx, char *ptr , char **next ) 00429 { 00430 if (! parse_authority( ctx, ptr, next )) { 00431 ptr = *next; 00432 if (*ptr == '/') { 00433 return parse_abs_path( ctx, ptr, next ); 00434 } 00435 } 00436 00437 return 0; 00438 } 00439 00440 M_INLINE int parse_uric_no_slash( URIPARSECTX *ctx, char *ptr, char **next ) 00441 { 00442 if (is_unreserved( *ptr ) || *ptr == ';' || *ptr == '?' || *ptr == ':' || *ptr == '@' || *ptr == '&' 00443 || *ptr == '=' || *ptr == '+' || *ptr == '$' || *ptr == ',') { 00444 ctx_add_escaped_char( ctx, *ptr, 0); 00445 *next = ptr +1; 00446 return 0; 00447 } 00448 return parse_escaped( ctx, ptr, next ); 00449 } 00450 00451 00452 00453 int parse_opaque_part( URIPARSECTX *ctx, char *ptr, char **next ) 00454 { 00455 char *start = ptr; 00456 int rt; 00457 if ( (rt = parse_uric_no_slash( ctx, ptr, next )) != 0 ) { 00458 return rt; 00459 } 00460 if (parse_uric_sequence( ctx, ptr, next ) < 0) { 00461 return -1; 00462 } 00463 ctx->rep->flags |= URI_FLAGS_IS_OPAQUE; 00464 ctx->rep->opaque_raw = ctx_copy_string_raw(ctx, start, ptr ); 00465 ctx->rep->opaque = ctx_finish_escaped_string(ctx); 00466 00467 00468 return 0; 00469 } 00470 00471 00472 int parse_hier_part( URIPARSECTX *ctx, char *ptr , char **next, int parse_opaque) 00473 { 00474 char *start; 00475 00476 if (ptr[0] == '/') { 00477 if (ptr[1] == '/') { 00478 if (parse_net_path( ctx, ptr + 2, next )) { 00479 return -1; 00480 } 00481 } else { 00482 //ctx_add_escaped_char( ctx, '/', 0 ); 00483 if (parse_abs_path( ctx, ptr, next )) { 00484 return -1; 00485 } 00486 } 00487 } else { 00488 if (parse_opaque) { 00489 return parse_opaque_part( ctx, ptr, next ); 00490 } 00491 return -1; 00492 } 00493 00494 ptr = *next; 00495 if (*ptr == '?') { 00496 00497 ptr ++; 00498 start = ptr; 00499 if ( parse_uric_sequence( ctx, ptr, next ) == -1 ) { 00500 return -1; 00501 } 00502 ptr = *next; 00503 ctx->rep->query_raw = ctx_copy_string_raw(ctx, start, ptr ); 00504 ctx->rep->query = ctx_finish_escaped_string(ctx); 00505 } 00506 00507 if ( *ptr != '#' ) { 00508 return 0; 00509 } 00510 00511 ++ ptr; 00512 start = ptr; 00513 if (parse_uric_sequence( ctx, ptr, next ) == -1) { 00514 return -1; 00515 } 00516 ptr = *next; 00517 ctx->rep->fragment_raw = ctx_copy_string_raw(ctx, start, ptr ); 00518 ctx->rep->fragment = ctx_finish_escaped_string(ctx); 00519 00520 return 0; 00521 } 00522 00523 int URI_parse( URI *url, char *line ) 00524 { 00525 URI_init( url ); 00526 URIPARSECTX ctx; 00527 char *ptr,*next; 00528 size_t slen; 00529 00530 ctx.rep = url; 00531 00532 slen = strlen( line ); 00533 url->cdata = (char *) malloc( slen ); 00534 if (!url->cdata) { 00535 goto err; 00536 } 00537 00538 ctx.cdata_pos_off = ctx.cdata_pos_start = ctx.cdata_pos = url->cdata; 00539 00540 url->cdata_raw = (char *) malloc( slen ); 00541 if (!url->cdata_raw) { 00542 goto err; 00543 } 00544 00545 url->cdata_is_escaped = (char *) malloc( slen ); 00546 if (!url->cdata_is_escaped) { 00547 goto err; 00548 } 00549 memset( url->cdata_is_escaped, 0, slen ); 00550 ctx.cdata_is_escaped_start = url->cdata_is_escaped; 00551 00552 ctx.cdata_raw_pos = url->cdata_raw; 00553 00554 00555 ptr = line; 00556 if (ptr[0] == '/') { 00557 if ( parse_hier_part( &ctx, ptr, &next, 0) ) { 00558 goto err; 00559 } 00560 } else { 00561 if (! parse_scheme( &ctx, ptr, &next )) { 00562 ptr = next; 00563 if ( parse_hier_part( &ctx, ptr, &next, 1 ) ) { 00564 goto err; 00565 } 00566 } else { 00567 if ( parse_authority( &ctx, ptr, &next )) { 00568 goto err; 00569 } 00570 } 00571 } 00572 00573 if (*next == '\0') { 00574 return 0; 00575 } 00576 00577 err: 00578 URI_free( url ); 00579 return -1; 00580 } 00581 00582