skipfish/http_client.c

2456 lines
62 KiB
C

/*
skipfish - high-performance, single-process asynchronous HTTP client
--------------------------------------------------------------------
Author: Michal Zalewski <lcamtuf@google.com>
Copyright 2009, 2010 by Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/poll.h>
#include <fcntl.h>
#include <sys/time.h>
#include <time.h>
#include <openssl/ssl.h>
#include <openssl/err.h>
#include <idna.h>
#include <zlib.h>
#include "types.h"
#include "alloc-inl.h"
#include "string-inl.h"
#include "database.h"
#include "http_client.h"
/* Exported, externally adjustable settings. Each limit starts out at the
   compile-time default named by the corresponding macro. */

u32 max_connections = MAX_CONNECTIONS;
u32 max_conn_host   = MAX_CONN_HOST;
u32 max_requests    = MAX_REQUESTS;
u32 max_fail        = MAX_FAIL;
u32 idle_tmout      = IDLE_TMOUT;
u32 resp_tmout      = RESP_TMOUT;
u32 rw_tmout        = RW_TMOUT;

/* When non-zero, build_request_data() asks for bytes 0..size_limit-1 only. */
u32 size_limit      = SIZE_LIMIT;

/* Selects which set of browser-like headers build_request_data() emits. */
u8 browser_type = BROWSER_FAST;

/* AUTH_BASIC makes build_request_data() add an Authorization header. */
u8 auth_type = AUTH_NONE;

/* Headers and cookies added to requests on top of the per-request ones;
   parse_response() also records received cookies here. */
struct param_array global_http_par;
/* Exported counters (32-bit): */

u32 req_errors_net;
u32 req_errors_http;
u32 req_errors_cur;
u32 req_count;
u32 req_dropped;
u32 queue_cur;           /* Decremented whenever a queue entry is unlinked */
u32 conn_cur;            /* Decremented whenever a connection is unlinked  */
u32 conn_count;
u32 conn_idle_tmout;
u32 conn_busy_tmout;
u32 conn_failed;
u32 req_retried;
u32 url_scope;

/* Exported byte counters (64-bit): */

u64 bytes_sent;
u64 bytes_recv;
u64 bytes_deflated;      /* Compressed payload bytes fed to zlib           */
u64 bytes_inflated;      /* Payload bytes obtained after decompression     */

/* Credentials sent by build_request_data() when auth_type == AUTH_BASIC. */

u8 *auth_user;
u8 *auth_pass;

/* When set, cookies seen in responses are not stored in global_http_par. */

u8 ignore_cookies;
/* File-private state used for queue, connection and DNS cache bookkeeping.
   'queue', 'conn' and 'dns' are the heads of their respective linked lists. */

static struct queue_entry *queue;
static struct conn_entry  *conn;
static struct dns_entry   *dns;

#ifdef QUEUE_FILO

/* Last element of the request queue; kept up to date on unlink. */
static struct queue_entry *q_tail;

#endif /* QUEUE_FILO */

static u8 tear_down_idle;
/* Extracts parameter value from param_array.

   type   - parameter type to look for (compared against par->t[]).
   name   - if non-NULL, only entries whose name matches case-insensitively
            are considered; if NULL, the entry name is ignored.
   offset - number of matching entries to skip before returning one.
   par    - array to search.

   Returns pointer to value data, not a duplicate string (the value itself
   may be NULL if the entry was stored without one); NULL if no match
   found. */

u8* get_value(u8 type, u8* name, u32 offset,
              struct param_array* par) {

  u32 i, coff = 0;

  for (i=0;i<par->c;i++) {

    if (type != par->t[i]) continue;

    /* Entries may carry a NULL name (set_value() allows it). Such an entry
       can never satisfy a named lookup, and must not reach strcasecmp(). */

    if (name && (!par->n[i] ||
        strcasecmp((char*)par->n[i], (char*)name))) continue;

    if (offset != coff) { coff++; continue; }

    return par->v[i];

  }

  return NULL;

}
/* Inserts or overwrites parameter value in param_array.

   type   - parameter type to store / to match when replacing.
   name   - parameter name; may be NULL. When replacing, a non-NULL name
            restricts the search to entries with a matching name
            (case-insensitive).
   val    - parameter value; may be NULL.
   offset - if negative (e.g., -1), the parameter is appended to the list.
            Otherwise, the offset-th matching entry is replaced; if there
            is no such entry, the parameter is appended as well.
   par    - array to modify.

   Both name and val are duplicated; the caller keeps ownership of its
   own copies. */

void set_value(u8 type, u8* name, u8* val,
               s32 offset, struct param_array* par) {

  u32 i, coff = 0, matched = 0;
  u8  found = 0;

  /* If offset specified, try to find an entry to replace. */

  if (offset >= 0)
    for (i=0;i<par->c;i++) {

      if (type != par->t[i]) continue;

      /* NULL-named entries never match a named lookup, and must not be
         passed to strcasecmp(). */

      if (name && (!par->n[i] ||
          strcasecmp((char*)par->n[i], (char*)name))) continue;

      if ((u32)offset != coff) { coff++; continue; }

      matched = i;
      found   = 1;
      break;

    }

  if (!found) {

    /* No offset or no match - append to the end of list. */

    par->t = ck_realloc(par->t, (par->c + 1) * sizeof(u8));
    par->n = ck_realloc(par->n, (par->c + 1) * sizeof(u8*));
    par->v = ck_realloc(par->v, (par->c + 1) * sizeof(u8*));
    par->t[par->c] = type;
    par->n[par->c] = ck_strdup(name);
    par->v[par->c] = ck_strdup(val);
    par->c++;

  } else {

    /* Matched - replace name & value. Duplicate the new strings before
       releasing the old ones: the caller may be passing pointers obtained
       from this very entry (e.g., via get_value()). */

    u8 *new_name = ck_strdup(name),
       *new_val  = ck_strdup(val);

    ck_free(par->n[matched]);
    ck_free(par->v[matched]);
    par->n[matched] = new_name;
    par->v[matched] = new_val;

  }

}
/* Convert a fully-qualified or relative URL string to a proper http_request
   representation.

   url - NUL-terminated URL (absolute, scheme-relative, or relative).
   req - request to fill in: orig_url, proto, host, port and the path /
         query entries in req->par are set here.
   ref - referring request used to resolve anything the URL leaves out
         (protocol, host, path); may be NULL, in which case only URLs that
         carry their own protocol and host can be parsed.

   Returns 0 on success, 1 on format error. On error, req may be left
   partially populated (orig_url and possibly host are already attached to
   it); destroy_request() releases both. */

u8 parse_url(u8* url, struct http_request* req, struct http_request* ref) {

  u8* cur = url;
  u32 maybe_proto = strcspn((char*)url, ":/?#@");
  u8 has_host = 0, add_slash = 1;

  if (strlen((char*)url) > MAX_URL_LEN) return 1;
  req->orig_url = ck_strdup(url);

  /* Interpret, skip protocol string if the URL seems to be fully-qualified;
     otherwise, copy from referring URL. We could be stricter here, as
     browsers bail out on seemingly invalid chars in proto names, but... */

  if (maybe_proto && url[maybe_proto] == ':') {

    if (!strncasecmp((char*)url, "http:", 5)) {
      req->proto = PROTO_HTTP;
      cur += 5;
    } else if (!strncasecmp((char*)url, "https:", 6)) {
      req->proto = PROTO_HTTPS;
      cur += 6;
    } else return 1;

  } else {

    if (!ref || !ref->proto) return 1;
    req->proto = ref->proto;

  }

  /* Interpret, skip //[login[:pass@](\[ipv4\]|\[ipv6\]|host)[:port] part of the
     URL, if present. Note that "http:blarg" is a valid relative URL to most
     browsers, and "//example.com/blarg" is a valid non-FQDN absolute one.
     We need to mimic this, which complicates the code a bit. */

  if (cur[0] == '/' && cur[1] == '/') {

    u32 path_st;
    u8 *at_sign, *host, *x;
    u8 has_utf = 0;

    cur += 2;

    /* Detect, skip login[:pass]@; we only take cmdline-supplied credentials
       or wordlists into account. Be sure to report any embedded auth,
       though. The ':' is deliberately not a delimiter for this first scan,
       so that "login:pass@host" is recognized too. */

    at_sign = (u8*)strchr((char*)cur, '@');
    path_st = strcspn((char*)cur, "/?#");

    if (at_sign && path_st > (u32)(at_sign - cur)) {
      cur = at_sign + 1;
      if (!req->pivot) return 1;
      problem(PROB_URL_AUTH, ref, 0, url, req->pivot, 0);
    }

    /* Measure the host name only now, relative to the final value of cur;
       a length taken before skipping the credentials would be stale and
       could reach past the end of the string. */

    path_st = strcspn((char*)cur, ":/?#");

    /* No support for IPv6 or [ip] notation for now, so let's just refuse to
       parse the URL. Also, refuse excessively long domain names for sanity. */

    if (*cur == '[') return 1;
    if (path_st > MAX_DNS_LEN) return 1;

    x = host = ck_memdup(cur, path_st + 1);
    host[path_st] = 0;

    /* Scan, normalize extracted host name. */

    while (*x) {

      switch (*x) {

        case 'A' ... 'Z':
          *x = tolower(*x);
          break;

        case 'a' ... 'z':
        case '0' ... '9':
        case '.':
        case '-':
        case '_':
          break;

        case 0x80 ... 0xff:
          has_utf = 1;
          break;

        default:
          /* Uh-oh, invalid characters in a host name - abandon ship.
             The copy is not attached to req yet, so release it here. */
          ck_free(host);
          return 1;

      }

      x++;

    }

    /* Host names that contained high bits need to be converted to Punycode
       in order to resolve properly. */

    if (has_utf) {

      char* output = 0;

      /* The buffer returned through 'output' is allocated by libidn, not by
         the ck_* allocator: release it with free(), and keep a ck_* owned
         copy so that req->host can later be passed to ck_free(). */

      if (idna_to_ascii_8z((char*)host, &output, 0) != IDNA_SUCCESS ||
          strlen(output) > MAX_DNS_LEN) {
        ck_free(host);
        free(output);
        return 1;
      }

      ck_free(host);
      host = ck_strdup((u8*)output);
      free(output);

    }

    req->host = host;
    cur += path_st;

    /* All right, moving on: if host name is followed by :, let's try to
       parse and validate port number; otherwise, assume 80 / 443, depending
       on protocol. */

    if (*cur == ':') {

      u32 digit_cnt = strspn((char*)++cur, "0123456789");

      /* strtoul() saturates instead of overflowing, so overlong digit
         strings simply fail the range check below. */

      unsigned long port = strtoul((char*)cur, NULL, 10);

      if (!digit_cnt || port > 65535 ||
          (cur[digit_cnt] && !strchr("/?#", cur[digit_cnt])))
        return 1;

      req->port = port;
      cur += digit_cnt;

    } else {

      if (req->proto == PROTO_HTTPS) req->port = 443; else req->port = 80;

    }

    has_host = 1;

  } else {

    /* No host name found - copy from referring request instead. */

    if (!ref || !ref->host) return 1;

    req->host = ck_strdup(ref->host);
    req->addr = ref->addr;
    req->port = ref->port;

  }

  if (!*cur || *cur == '#') {

    u32 i;

    /* No-op path. If the URL does not specify host (e.g., #foo), copy
       everything from referring request, call it a day. Otherwise
       (e.g., http://example.com#foo), let tokenize_path() run to
       add NULL-"" entry to the list. */

    if (!has_host) {
      for (i=0;i<ref->par.c;i++)
        if (PATH_SUBTYPE(ref->par.t[i]) || QUERY_SUBTYPE(ref->par.t[i]))
          set_value(ref->par.t[i], ref->par.n[i], ref->par.v[i], -1, &req->par);
      return 0;
    }

  }

  if (!has_host && *cur == '?') {

    u32 i;

    /* URL begins with ? and does not specify host (e.g., ?foo=bar). Copy all
       path segments, but no query, then fall through to parse the query
       string. */

    for (i=0;i<ref->par.c;i++)
      if (PATH_SUBTYPE(ref->par.t[i]))
        set_value(ref->par.t[i], ref->par.n[i], ref->par.v[i], -1, &req->par);

    /* In this case, we do not want tokenize_path() to tinker with the path
       in any way. */

    add_slash = 0;

  } else if (!has_host && *cur != '/') {

    /* The URL does not begin with / or ?, and does not specify host (e.g.,
       foo/bar?baz). Copy path from referrer, but drop the last "proper"
       path segment and everything that follows it. This mimics browser
       behavior (for URLs ending with /, it just drops the final NULL-""
       pair). */

    u32 i;
    u32 path_cnt = 0, path_cur = 0;

    for (i=0;i<ref->par.c;i++)
      if (ref->par.t[i] == PARAM_PATH) path_cnt++;

    for (i=0;i<ref->par.c;i++) {
      if (ref->par.t[i] == PARAM_PATH) path_cur++;
      if (path_cur < path_cnt && PATH_SUBTYPE(ref->par.t[i]))
        set_value(ref->par.t[i], ref->par.n[i], ref->par.v[i], -1, &req->par);
    }

  }

  /* Tokenize the remaining path on top of what we parsed / copied over. */

  tokenize_path(cur, req, add_slash);

  return 0;

}
/* URL-decodes a string.

   str  - input data; exactly len bytes are consumed.
   len  - number of input bytes.
   plus - if non-zero, '+' is decoded to a space (it has a special meaning
          only in query params, not in path).

   %XX sequences with two valid hex digits are decoded; anything else is
   copied through unchanged. Returns a newly allocated, NUL-terminated
   string. */

u8* url_decode_token(u8* str, u32 len, u8 plus) {

  u8 *ret = ck_alloc(len + 1);
  u8 *src = str, *dst = ret;
  const char *hex_str = "0123456789abcdef";

  while (len--) {

    u8 c = *(src++);
    const char *f, *s;

    if (plus && c == '+') c = ' ';

    /* strchr() also "finds" the string terminator, so NUL bytes have to be
       excluded explicitly - otherwise '%' followed by a NUL byte would be
       treated as a hex digit with the value of 16. */

    if (c == '%' && len >= 2 && src[0] && src[1] &&
        (f = strchr(hex_str, tolower(src[0]))) &&
        (s = strchr(hex_str, tolower(src[1])))) {
      c = ((f - hex_str) << 4) | (s - hex_str);
      src += 2; len -= 2;
    }

    /* We can't handle NUL-terminators gracefully when deserializing request
       parameters, because param_array values are NUL-terminated themselves.
       Let's encode \0 as \xFF instead, and hope nobody notices. */

    if (!c) c = 0xff;

    *(dst++) = c;

  }

  *(dst++) = 0;

  /* Decoding can only shrink the data; give the unused tail back. */

  ret = ck_realloc(ret, dst - ret);

  return ret;

}
/* URL-encodes a token using custom rules. The input is assumed to be
   already split at the "special" boundaries, so the delimiter characters
   #, %, &, =, /, +, ;, ',', !, $ and ? are always escaped when they occur
   inside a token, as is anything at or below 0x20 or at or above 0x80.
   Everything else is passed through verbatim, which may help with the
   exploitation of certain vulnerabilities.

   The byte 0xFF is written out as %00, reversing the substitution done by
   url_decode_token(). Returns a newly allocated, NUL-terminated string. */

u8* url_encode_token(u8* str, u32 len) {

  static const char hex_up[] = "0123456789ABCDEF";

  /* Worst case: every input byte expands to a three-byte %XX sequence. */

  u8 *out = ck_alloc(len * 3 + 1);
  u8 *wr  = out;
  u32 pos;

  for (pos = 0; pos < len; pos++) {

    u8 ch = str[pos];

    /* Plain character - copy as-is. */

    if (ch > 0x20 && ch < 0x80 && !strchr("#%&=/+;,!$?", ch)) {
      *(wr++) = ch;
      continue;
    }

    if (ch == 0xFF) ch = 0;

    *(wr++) = '%';
    *(wr++) = hex_up[ch >> 4];
    *(wr++) = hex_up[ch & 0x0f];

  }

  *(wr++) = 0;

  /* Shrink the buffer to what was actually used. */

  out = ck_realloc(out, wr - out);

  return out;

}
/* Split path at known "special" character boundaries, URL decode values,
then put them in the provided http_request struct.

str       - remainder of the URL, starting at the path; leading slashes are
skipped. May also start with '?' or '#', or be empty.
req       - request whose par array receives the parsed entries.
add_slash - if non-zero and the path portion turns out to be empty, a
NULL-"" PARAM_PATH entry is appended to denote a directory.

Path segments are appended as PARAM_PATH, or as PARAM_PATH_S / _C / _E / _D
when introduced by ';', ',', '!' or '$'. Query parameters are appended as
PARAM_QUERY or the matching _S / _C / _E / _D subtype. Parsing stops at '#',
so the fragment is not stored. Entries are always appended; nothing already
present in req->par is replaced (although "/../" retypes earlier entries to
PARAM_NONE, see below). */
void tokenize_path(u8* str, struct http_request* req, u8 add_slash) {
u8* cur;
/* Set while the most recently processed segment implies a directory. */
u8 know_dir = 0;
/* Leading slashes carry no information of their own; skip all of them. */
while (*str == '/') str++;
cur = str;
/* Parse path elements first. */
while (*cur && !strchr("?#", *cur)) {
u32 next_seg, next_eq;
u8 *name = NULL, *value = NULL;
/* For the very first element, cur points at the segment text itself; for
later ones, it points at the separator in front of the text. The
"!first_el" terms below compensate for that one-character difference. */
u8 first_el = (str == cur);
if (first_el || *cur == '/') {
/* Optimize out //, /\0, /./, and /.\0. They do indicate
we are looking at a directory, so mark this. */
if (!first_el && (cur[1] == '/' || !cur[1])) {
cur++;
know_dir = 1;
continue;
}
if (cur[0 + !first_el] == '.' && (cur[1 + !first_el] == '/' ||
!cur[1 + !first_el])) {
cur += 1 + !first_el;
know_dir = 1;
continue;
}
/* If we encountered /../ or /..\0, remove everything up to and
including the last "true" path element. It's also indicative
of a directory, by the way. */
if (cur[0 + !first_el] == '.' && cur[1 + !first_el] == '.' &&
(cur[2 + !first_el] == '/' || !cur[2 + !first_el])) {
/* Entries are not physically removed: everything from the last
PARAM_PATH entry onward is retyped to PARAM_NONE. If there is no
PARAM_PATH entry, last_p stays at req->par.c and nothing changes. */
u32 i, last_p = req->par.c;
for (i=0;i<req->par.c;i++)
if (req->par.t[i] == PARAM_PATH) last_p = i;
for (i=last_p;i<req->par.c;i++) {
req->par.t[i] = PARAM_NONE;
}
cur += 2 + !first_el;
know_dir = 1;
continue;
}
}
/* If we're here, we have an actual item to add; cur points to
the string if it's the first element, or to field separator
if one of the subsequent ones. */
/* next_seg is the distance from cur to the next separator (or to the end
of the string); next_eq is the same, but also stops at '='. Hence,
next_eq < next_seg means the token has the name=value form. */
next_seg = strcspn((char*)cur + 1, "/;,!$?#") + 1,
next_eq = strcspn((char*)cur + 1, "=/;,!$?#") + 1;
know_dir = 0;
if (next_eq < next_seg) {
name = url_decode_token(cur + !first_el, next_eq - !first_el, 0);
value = url_decode_token(cur + next_eq + 1, next_seg - next_eq - 1, 0);
} else {
value = url_decode_token(cur + !first_el, next_seg - !first_el, 0);
}
/* The first element has no separator in front of it; treat it as if it
were introduced by '/'. */
switch (first_el ? '/' : *cur) {
case ';': set_value(PARAM_PATH_S, name, value, -1, &req->par); break;
case ',': set_value(PARAM_PATH_C, name, value, -1, &req->par); break;
case '!': set_value(PARAM_PATH_E, name, value, -1, &req->par); break;
case '$': set_value(PARAM_PATH_D, name, value, -1, &req->par); break;
default: set_value(PARAM_PATH, name, value, -1, &req->par);
}
/* set_value() keeps its own copies, so the decoded strings go away. */
ck_free(name);
ck_free(value);
cur += next_seg;
}
/* If the last segment was /, /./, or /../, *or* if we never added
anything to the path to begin with, we want to store a NULL-""
entry to denote it's a directory. */
if (know_dir || (add_slash && (!*str || strchr("?#", *str))))
set_value(PARAM_PATH, NULL, (u8*)"", -1, &req->par);
/* Deal with regular query parameters now. This is much simpler,
obviously. */
/* On every iteration, cur points at the separator that introduces the
parameter ('?' for the first one), so the text starts at cur + 1. */
while (*cur && !strchr("#", *cur)) {
u32 next_seg = strcspn((char*)cur + 1, "#&;,!$") + 1;
u32 next_eq = strcspn((char*)cur + 1, "=#&;,!$") + 1;
u8 *name = NULL, *value = NULL;
/* foo=bar syntax... */
if (next_eq < next_seg) {
name = url_decode_token(cur + 1, next_eq - 1, 1);
value = url_decode_token(cur + next_eq + 1, next_seg - next_eq - 1, 1);
} else {
value = url_decode_token(cur + 1, next_seg - 1, 1);
}
switch (*cur) {
case ';': set_value(PARAM_QUERY_S, name, value, -1, &req->par); break;
case ',': set_value(PARAM_QUERY_C, name, value, -1, &req->par); break;
case '!': set_value(PARAM_QUERY_E, name, value, -1, &req->par); break;
case '$': set_value(PARAM_QUERY_D, name, value, -1, &req->par); break;
default: set_value(PARAM_QUERY, name, value, -1, &req->par);
}
ck_free(name);
ck_free(value);
cur += next_seg;
}
}
/* Rebuilds a URI string from the parameters stored in a http_request.

   req       - request to serialize.
   with_host - if non-zero, the output is prefixed with protocol, host and
               (when it differs from the protocol default) port.
   with_post - if non-zero, POST parameters are appended after a " POST: "
               marker, separated with '&'.

   Names and values are URL-encoded with url_encode_token(). Returns a newly
   allocated, NUL-terminated string. */

u8* serialize_path(struct http_request* req, u8 with_host, u8 with_post) {

  u32 idx, out_pos;
  u8  first;
  u8* out;

  NEW_STR(out, out_pos);

#define ASD(_p3) ADD_STR_DATA(out, out_pos, _p3)

/* Appends the URL-encoded form of a NUL-terminated token. */

#define ASD_ENC(_tok) do { \
    u8* _sp_enc = url_encode_token((_tok), strlen((char*)(_tok))); \
    ASD(_sp_enc); \
    ck_free(_sp_enc); \
  } while (0)

/* Appends "name=" if entry _i has a name, then its value if it has one. */

#define ASD_PAIR(_i) do { \
    if (req->par.n[_i]) { ASD_ENC(req->par.n[_i]); ASD("="); } \
    if (req->par.v[_i]) ASD_ENC(req->par.v[_i]); \
  } while (0)

  /* For human-readable uses... */

  if (with_host) {

    u8 nonstd_port = (req->proto == PROTO_HTTP && req->port != 80) ||
                     (req->proto == PROTO_HTTPS && req->port != 443);

    ASD("http");
    if (req->proto == PROTO_HTTPS) ASD("s");
    ASD("://");
    ASD(req->host);

    if (nonstd_port) {
      u8 port[7];
      sprintf((char*)port, ":%u", req->port);
      ASD(port);
    }

  }

  /* Path segments come first, each preceded by its own separator. */

  for (idx = 0; idx < req->par.c; idx++) {

    if (!PATH_SUBTYPE(req->par.t[idx])) continue;

    switch (req->par.t[idx]) {
      case PARAM_PATH_S: ASD(";"); break;
      case PARAM_PATH_C: ASD(","); break;
      case PARAM_PATH_E: ASD("!"); break;
      case PARAM_PATH_D: ASD("$"); break;
      default:           ASD("/");
    }

    ASD_PAIR(idx);

  }

  /* Query parameters next: the first one is always introduced by '?',
     later ones by the separator matching their subtype. */

  first = 1;

  for (idx = 0; idx < req->par.c; idx++) {

    if (!QUERY_SUBTYPE(req->par.t[idx])) continue;

    if (first) {
      ASD("?");
      first = 0;
    } else switch (req->par.t[idx]) {
      case PARAM_QUERY_S: ASD(";"); break;
      case PARAM_QUERY_C: ASD(","); break;
      case PARAM_QUERY_E: ASD("!"); break;
      case PARAM_QUERY_D: ASD("$"); break;
      default:            ASD("&");
    }

    ASD_PAIR(idx);

  }

  /* Finally, POST data - only if asked for. */

  if (with_post) {

    first = 1;

    for (idx = 0; idx < req->par.c; idx++) {

      if (!POST_SUBTYPE(req->par.t[idx])) continue;

      if (first) {
        ASD(" POST: ");
        first = 0;
      } else ASD("&");

      ASD_PAIR(idx);

    }

  }

#undef ASD_PAIR
#undef ASD_ENC
#undef ASD

  TRIM_STR(out, out_pos);

  return out;

}
/* Resolves a host name to an IPv4 address, returned in network byte order.

   Strings that inet_aton() accepts are converted directly. Everything else
   goes through a local cache first, and then through the standard, blocking
   resolver (retried once on TRY_AGAIN). The outcome - including a failed
   lookup, which is recorded as address 0 - is appended to the cache, so any
   given name is resolved at most once.

   Returns the address, or 0 if the name could not be resolved. */

u32 maybe_lookup_host(u8* name) {

  struct hostent* he;
  struct dns_entry *ent, *tail = NULL;
  struct in_addr raw;
  u32 result = 0;

  /* Raw IP addresses need no lookup at all. */

  if (inet_aton((char*)name, &raw))
    return (u32)raw.s_addr;

  /* Cache lookup; 'tail' ends up pointing at the last entry, if any. */

  for (ent = dns; ent; tail = ent, ent = ent->next)
    if (!strcasecmp((char*)name, (char*)ent->name)) return ent->addr;

  he = gethostbyname((char*)name);

  /* Be forgiving about transient failures - try once more. */

  if (!he && h_errno == TRY_AGAIN) he = gethostbyname((char*)name);

  if (he) {

    u32 n;
    u8  shared = 0;

    /* Prefer an address that some previously seen host maps to as well;
       this is for HTTP performance and bookkeeping reasons. If there is
       none, settle for the first address on the list. */

    for (n = 0; he->h_addr_list[n] && !shared; n++)
      for (ent = dns; ent; ent = ent->next)
        if (ent->addr == *(u32*)he->h_addr_list[n]) {
          result = ent->addr;
          shared = 1;
          break;
        }

    if (!shared) result = *(u32*)he->h_addr_list[0];

  }

  /* Append the new entry at the end of the cache. */

  ent = ck_alloc(sizeof(struct dns_entry));

  if (tail) tail->next = ent; else dns = ent;

  ent->name = ck_strdup(name);
  ent->addr = result;

  return result;

}
/* Creates an ad hoc DNS cache entry, to override NS lookups.

   name - host name to register (duplicated).
   addr - address to associate with it; stored as given.

   The entry is appended at the end of the cache. No check for an already
   existing entry with the same name is made. */

void fake_host(u8* name, u32 addr) {

  struct dns_entry *d, *last = dns;

  /* Walk to the current tail of the list, if there is one. */

  while (last && last->next) last = last->next;

  d = ck_alloc(sizeof(struct dns_entry));

  /* Link after the tail - or make it the head if the cache is empty. This
     must not replace the tail itself, and has to work for a list of one. */

  if (last) last->next = d; else dns = d;

  d->name = ck_strdup(name);
  d->addr = addr;

}
/* Prepares a serialized HTTP buffer to be sent over the network.

   req - request to serialize; req->proto must be initialized, or the
         function bails out via FATAL().

   The output consists of the request line, Host, a browser-specific header
   set selected by browser_type, an optional Range header (size_limit),
   a default Referer, optional basic authentication, per-request and global
   headers and cookies, and - if the request carries POST parameters - a
   Content-Type header with the encoded payload.

   Returns a newly allocated, NUL-terminated buffer. */

u8* build_request_data(struct http_request* req) {

  u8 *ret_buf, *ck_buf, *pay_buf, *path;
  u32 ret_pos, ck_pos, pay_pos, i;
  u8 req_type = PARAM_NONE;

  if (req->proto == PROTO_NONE)
    FATAL("uninitialized http_request");

  NEW_STR(ret_buf, ret_pos);

  path = serialize_path(req, 0, 0);

#define ASD(_p3) ADD_STR_DATA(ret_buf, ret_pos, _p3)

  if (req->method) ASD(req->method); else ASD((u8*)"GET");
  ASD(" ");
  ASD(path);
  ASD(" HTTP/1.1\r\n");
  ck_free(path);

  ASD("Host: ");
  ASD(req->host);

  if ((req->proto == PROTO_HTTP && req->port != 80) ||
      (req->proto == PROTO_HTTPS && req->port != 443)) {
    char port[7];
    sprintf((char*)port, ":%u", req->port);
    ASD(port);
  }

  ASD("\r\n");

  /* Insert generic browser headers first. */

  if (browser_type == BROWSER_FAST) {

    ASD("Accept-Encoding: gzip\r\n");
    ASD("Connection: keep-alive\r\n");
    ASD("User-Agent: Mozilla/5.0 SF/" VERSION "\r\n");

    /* Some servers will reject to gzip responses unless "Mozilla/..."
       is seen in User-Agent. Bleh. */

  } else if (browser_type == BROWSER_FFOX) {

    if (!GET_HDR((u8*)"User-Agent", &req->par))
      ASD("User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; "
          "rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 SF/" VERSION "\r\n");

    ASD("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;"
        "q=0.8\r\n");

    if (!GET_HDR((u8*)"Accept-Language", &req->par))
      ASD("Accept-Language: en-us,en\r\n");

    ASD("Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\n");
    ASD("Keep-Alive: 300\r\n");

    /* The header name must be spelled correctly, or servers will simply
       ignore it. */

    ASD("Connection: keep-alive\r\n");

  } else /* MSIE */ {

    ASD("Accept: */*\r\n");

    if (!GET_HDR((u8*)"Accept-Language", &req->par))
      ASD("Accept-Language: en,en-US;q=0.5\r\n");

    if (!GET_HDR((u8*)"User-Agent", &req->par))
      ASD("User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; "
          "Trident/4.0; .NET CLR 1.1.4322; InfoPath.1; .NET CLR "
          "2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; SF/"
          VERSION ")\r\n");

    ASD("Accept-Encoding: gzip, deflate\r\n");
    ASD("Connection: Keep-Alive\r\n");

  }

  /* Request a limited range up front to minimize unwanted traffic. */

  if (size_limit) {
    u8 limit[32];
    sprintf((char*)limit, "Range: bytes=0-%u\r\n", size_limit - 1);
    ASD(limit);
  }

  /* Include a dummy "Referer" header, to avoid certain XSRF checks. */

  if (!GET_HDR((u8*)"Referer", &req->par)) {
    ASD("Referer: http");
    if (req->proto == PROTO_HTTPS) ASD("s");
    ASD("://");
    ASD(req->host);
    ASD("/\r\n");
  }

  /* Take care of HTTP authentication next. */

  if (auth_type == AUTH_BASIC) {

    /* Room for "user", ':', "pass" and the terminating NUL. */

    u8* lp = ck_alloc(strlen((char*)auth_user) + strlen((char*)auth_pass) + 2);
    u8* lpb64;

    sprintf((char*)lp, "%s:%s", auth_user, auth_pass);

    lpb64 = b64_encode(lp, strlen((char*)lp));

    ASD("Authorization: basic ");
    ASD(lpb64);
    ASD("\r\n");

    ck_free(lpb64);
    ck_free(lp);

  }

  /* Append any other requested headers and cookies. Headers go straight to
     the output; cookies are collected in ck_buf, to be emitted as a single
     Cookie header later on. */

  NEW_STR(ck_buf, ck_pos);

  for (i=0;i<req->par.c;i++) {

    if (req->par.t[i] == PARAM_HEADER) {

      ASD(req->par.n[i]);
      ASD(": ");
      ASD(req->par.v[i]);
      ASD("\r\n");

    } else if (req->par.t[i] == PARAM_COOKIE) {

      if (ck_pos) ADD_STR_DATA(ck_buf, ck_pos, ";");
      ADD_STR_DATA(ck_buf, ck_pos, req->par.n[i]);
      ADD_STR_DATA(ck_buf, ck_pos, "=");
      ADD_STR_DATA(ck_buf, ck_pos, req->par.v[i]);

    }

  }

  /* Also include extra globals, if any (but avoid dupes). */

  for (i=0;i<global_http_par.c;i++) {

    if (global_http_par.t[i] == PARAM_HEADER &&
        !GET_HDR(global_http_par.n[i], &req->par)) {

      ASD(global_http_par.n[i]);
      ASD(": ");
      ASD(global_http_par.v[i]);
      ASD("\r\n");

    } else if (global_http_par.t[i] == PARAM_COOKIE &&
               !GET_CK(global_http_par.n[i], &req->par)) {

      if (ck_pos) ADD_STR_DATA(ck_buf, ck_pos, ";");
      ADD_STR_DATA(ck_buf, ck_pos, global_http_par.n[i]);
      ADD_STR_DATA(ck_buf, ck_pos, "=");
      ADD_STR_DATA(ck_buf, ck_pos, global_http_par.v[i]);

    }

  }

  if (ck_pos) {
    ASD("Cookie: ");
    ASD(ck_buf);
    ASD("\r\n");
  }

  ck_free(ck_buf);

  /* Now, let's serialize the payload, if necessary. First, figure out the
     payload flavor: the last PARAM_POST_F or PARAM_POST_O entry wins;
     PARAM_POST applies only if no type has been picked before it. */

  for (i=0;i<req->par.c;i++) {

    switch (req->par.t[i]) {

      case PARAM_POST_F:
      case PARAM_POST_O:
        req_type = req->par.t[i];
        break;

      case PARAM_POST:
        if (req_type == PARAM_NONE) req_type = PARAM_POST;
        break;

    }

  }

  NEW_STR(pay_buf, pay_pos);

  if (req_type == PARAM_POST) {

    /* The default case: application/x-www-form-urlencoded. */

    for (i=0;i<req->par.c;i++)
      if (req->par.t[i] == PARAM_POST) {

        if (pay_pos) ADD_STR_DATA(pay_buf, pay_pos, "&");

        if (req->par.n[i]) {
          u32 len = strlen((char*)req->par.n[i]);
          u8* str = url_encode_token(req->par.n[i], len);
          ADD_STR_DATA(pay_buf, pay_pos, str);
          ADD_STR_DATA(pay_buf, pay_pos, "=");
          ck_free(str);
        }

        if (req->par.v[i]) {
          u32 len = strlen((char*)req->par.v[i]);
          u8* str = url_encode_token(req->par.v[i], len);
          ADD_STR_DATA(pay_buf, pay_pos, str);
          ck_free(str);
        }

      }

    ASD("Content-Type: application/x-www-form-urlencoded\r\n");

  } else if (req_type == PARAM_POST_O) {

    /* Opaque, non-escaped data of some sort. */

    for (i=0;i<req->par.c;i++)
      if (req->par.t[i] == PARAM_POST_O && req->par.v[i])
        ADD_STR_DATA(pay_buf, pay_pos, req->par.v[i]);

    ASD("Content-Type: text/plain\r\n");

  } else if (req_type == PARAM_POST_F) {

    u8 bound[20];

    /* MIME envelopes: multipart/form-data */

    sprintf((char*)bound, "sf%u", R(1000000));

    for (i=0;i<req->par.c;i++)
      if (req->par.t[i] == PARAM_POST || req->par.t[i] == PARAM_POST_F) {

        ADD_STR_DATA(pay_buf, pay_pos, "--");
        ADD_STR_DATA(pay_buf, pay_pos, bound);
        ADD_STR_DATA(pay_buf, pay_pos, "\r\n"
                     "Content-Disposition: form-data; name=\"");

        if (req->par.n[i])
          ADD_STR_DATA(pay_buf, pay_pos, req->par.n[i]);

        if (req->par.t[i] == PARAM_POST_F) {

          /* File upload field: the stored value is not used; a dummy file
             body is generated instead. */

          u8 tmp[64];
          sprintf((char*)tmp, "\"; filename=\"sfish%u." DUMMY_EXT "\"\r\n"
                  "Content-Type: " DUMMY_MIME "\r\n\r\n", R(16));
          ADD_STR_DATA(pay_buf, pay_pos, tmp);
          ADD_STR_DATA(pay_buf, pay_pos, new_xss_tag((u8*)DUMMY_FILE));
          register_xss_tag(req);

        } else {

          ADD_STR_DATA(pay_buf, pay_pos, "\"\r\n\r\n");
          if (req->par.v[i])
            ADD_STR_DATA(pay_buf, pay_pos, req->par.v[i]);

        }

        ADD_STR_DATA(pay_buf, pay_pos, "\r\n");

      }

    ADD_STR_DATA(pay_buf, pay_pos, "--");
    ADD_STR_DATA(pay_buf, pay_pos, bound);
    ADD_STR_DATA(pay_buf, pay_pos, "--\r\n");

    ASD("Content-Type: multipart/form-data; boundary=");
    ASD(bound);
    ASD("\r\n");

  } else if (req_type == 0) ASD("\r\n");

  /* Finalize HTTP payload: undo the \0 -> \xFF substitution performed by
     url_decode_token(). */

  for (i=0;i<pay_pos;i++)
    if (pay_buf[i] == 0xff) pay_buf[i] = 0x00;

  TRIM_STR(pay_buf, pay_pos);

  /* NOTE(review): if a POST type was selected, but the resulting payload is
     empty, no blank line terminating the headers is emitted at all -
     confirm whether this can happen in practice. */

  if (pay_pos) {
    u8 cl[40];
    sprintf((char*)cl, "Content-Length: %u\r\n\r\n", pay_pos);
    ASD(cl);
    ASD(pay_buf);
  }

  ck_free(pay_buf);

#undef ASD

  /* Phew! */

  TRIM_STR(ret_buf, ret_pos);

  return ret_buf;

}
/* Internal line reader for parse_response() and friends.

   data     - buffer to read from.
   cur_pos  - in/out: offset of the line to read; advanced past the line,
              including its terminating newline if one is present.
   data_len - total length of the buffer.

   Returns a newly allocated copy of the line with trailing CR / LF
   characters removed, or NULL if *cur_pos is already at the end of the
   buffer. */

static u8* grab_line(u8* data, u32* cur_pos, u32 data_len) {

  u8 *line_st  = data + *cur_pos,
     *buf_end  = data + data_len,
     *line_end = line_st,
     *copy;

  u32 keep;

  if (line_st == buf_end) return 0;

  /* Find the newline - or run into the end of the buffer. */

  while (line_end < buf_end && *line_end != '\n') line_end++;

  /* The newline itself, if found, belongs to this line. */

  if (line_end != buf_end) line_end++;

  *cur_pos += line_end - line_st;

  /* Back off over the line terminator before copying. */

  while (line_end > line_st && strchr("\r\n", line_end[-1])) line_end--;

  keep = line_end - line_st;

  copy = ck_alloc(keep + 1);
  memcpy(copy, line_st, keep);
  copy[keep] = 0;

  return copy;

}
/* Computes the fingerprint of a response, stored in res->sig. Fingerprints
   are used to find "roughly comparable" pages, and are based on the
   distribution of run lengths in the payload: the payload is viewed as
   alternating runs of separator characters (anything at or below 0x20,
   plus <, >, ' and ") and non-separator characters, and each run bumps one
   of FP_SIZE buckets selected by its length. Runs counted as longer than
   FP_MAX_LEN are skipped, except for the final one, which is always
   recorded. */

void fprint_response(struct http_response* res) {

  u32 pos, run_len = 0, in_sep = 0;

  res->sig.code = res->code;

  for (pos = 0; pos < res->pay_len; pos++) {

    u32 is_sep = (res->payload[pos] <= 0x20 ||
                  strchr("<>'\"", res->payload[pos])) ? 1 : 0;

    /* Same class as before - the current run just keeps growing. */

    if (is_sep == in_sep) { run_len++; continue; }

    /* Class changed - account for the run that has just ended. */

    in_sep = is_sep;

    if (run_len <= FP_MAX_LEN)
      res->sig.data[run_len % FP_SIZE]++;

    run_len = 0;

  }

  res->sig.data[run_len % FP_SIZE]++;

}
/* Parses a network buffer containing raw HTTP response received over the
   network.

   req      - request the response belongs to (used for problem reporting).
   res      - response structure to fill in; must not have been used before
              (res->code has to be zero).
   data     - raw response data.
   data_len - number of bytes in data.
   more     - non-zero if the socket is still available for reading, i.e.,
              more data may arrive.

   The buffer is processed in two passes: the first one only checks whether
   the response is complete; the second one extracts the status line,
   headers, cookies and payload (de-chunking and decompressing it where
   needed), and finally fingerprints the result.

   Returns 0 if response parses OK, 1 if more data should be read from the
   socket, 2 if the response seems invalid, 3 if response OK but connection
   must be closed. */

u8 parse_response(struct http_request* req, struct http_response* res,
                  u8* data, u32 data_len, u8 more) {

  u8* cur_line = 0;
  s32 pay_len  = -1;       /* Content-Length value; -1 if not (validly) seen */
  u32 cur_data_off = 0,
      total_chunk = 0,
      http_ver;
  u8  chunked = 0, compressed = 0, must_close = 0;

  if (res->code)
    FATAL("struct http_response reused! Original code '%u'.", res->code);

#define NEXT_LINE() do { \
    if (cur_line) ck_free(cur_line); \
    cur_line = grab_line(data, &cur_data_off, data_len); \
  } while (0)

  /* First, let's do a superficial request completeness check. Be
     prepared for a premature end at any point. */

  NEXT_LINE(); /* HTTP/1.x xxx ... */

  if (!cur_line) return more ? 1 : 2;

  if (strlen((char*)cur_line) < 7 && more) {
    ck_free(cur_line);
    return 1;
  }

  if (strncmp((char*)cur_line, "HTTP/1.", 7)) {
    ck_free(cur_line);
    return 2;
  }

  /* Scan headers for Content-Length, Transfer-Encoding, etc. */

  while (1) {

    NEXT_LINE(); /* Next header or empty line. */

    /* If headers end prematurely, and more data might arrive, ask for
       it; otherwise, just assume end of headers and continue. */

    if (!cur_line) {
      if (more) return 1;
      res->warn |= WARN_PARTIAL;
      break;
    }

    /* Empty line indicates the beginning of a payload. */

    if (!cur_line[0]) break;

    if (!strncasecmp((char*)cur_line, "Content-Length:", 15)) {

      /* The value in Content-Length header would be useful for seeing if we
         have all the requested data already. Reject invalid values to avoid
         integer overflows, etc, though. */

      if (sscanf((char*)cur_line + 15, "%d", &pay_len) == 1) {
        if (pay_len < 0 || pay_len > 1000000000 /* 1 GB */) {
          ck_free(cur_line);
          return 2;
        }
      } else pay_len = -1;

    } else if (!strncasecmp((char*)cur_line, "Transfer-Encoding:", 18)) {

      /* Transfer-Encoding: chunked must be accounted for to properly
         determine if we received all the data when Content-Length not found. */

      u8* x = cur_line + 18;

      while (isspace(*x)) x++;
      if (!strcasecmp((char*)x, "chunked")) chunked = 1;

    } else if (!strncasecmp((char*)cur_line, "Content-Encoding:", 17)) {

      /* Content-Encoding is good to know, too. */

      u8* x = cur_line + 17;

      while (isspace(*x)) x++;

      if (!strcasecmp((char*)x, "deflate") || !strcasecmp((char*)x, "gzip"))
        compressed = 1;

    } else if (!strncasecmp((char*)cur_line, "Connection:", 11)) {

      u8* x = cur_line + 11;

      while (isspace(*x)) x++;

      if (!strcasecmp((char*)x, "close")) must_close = 1;

    }

  }

  /* We are now at the beginning of the payload. Firstly, how about decoding
     'chunked' to see if we received a complete 0-byte terminator chunk
     already? */

  if (chunked) {

    while (1) {

      u32 chunk_len;

      NEXT_LINE(); /* Should be chunk size, hex. */

      if (!cur_line || sscanf((char*)cur_line, "%x", &chunk_len) != 1) {
        if (more) { ck_free(cur_line); return 1; }
        res->warn |= WARN_PARTIAL;
        break;
      }

      if (chunk_len > 1000000000 || total_chunk > 1000000000 /* 1 GB */) {
        ck_free(cur_line);
        return 2;
      }

      /* See if we actually have enough buffer to skip the chunk. Bail out
         if not and more data might be coming; otherwise, adjust chunk size
         accordingly. */

      if (cur_data_off + chunk_len > data_len) {

        if (more) { ck_free(cur_line); return 1; }
        chunk_len = data_len - cur_data_off;
        total_chunk += chunk_len;

        res->warn |= WARN_PARTIAL;
        break;
      }

      total_chunk += chunk_len;

      cur_data_off += chunk_len;
      NEXT_LINE();

      /* No newline? */
      if (!cur_line) {
        if (more) return 1;
        res->warn |= WARN_PARTIAL;
      }

      /* All right, so that was the last, complete 0-size chunk?
         Exit the loop if so. */

      if (!chunk_len) break;

    }

    if (cur_data_off != data_len) res->warn |= WARN_TRAIL;

  } else if (pay_len == -1 && more) {

    /* If in a mode other than 'chunked', and C-L not received, but more
       data might be available - try to request it. */

    ck_free(cur_line);
    return 1;

  } else if (pay_len != -1) {

    /* C-L is known. The sentinel to compare against is -1 ("not seen"):
       a legitimate one-byte payload has to be checked like any other,
       and -1 itself must never enter the arithmetic below. */

    if (cur_data_off + pay_len > data_len) {

      /* If C-L seen, but not enough data in the buffer, try to request more
         if possible, otherwise tag the response as partial. */

      if (more) { ck_free(cur_line); return 1; }
      res->warn |= WARN_PARTIAL;

    } else if (cur_data_off + pay_len < data_len) res->warn |= WARN_TRAIL;

  }

  /* Rewind, then properly parse HTTP headers, parsing cookies. */

  cur_data_off = 0;

  NEXT_LINE();

  if (strlen((char*)cur_line) < 13 ||
      sscanf((char*)cur_line, "HTTP/1.%u %u ", &http_ver, &res->code) != 2 ||
      res->code < 100 || res->code > 999) {
    ck_free(cur_line);
    return 2;
  }

  /* Some servers, when presented with 'Range' header, will return 200 on
     some queries for a particular resource, and 206 on other queries (e.g.,
     with query string), despite returning exactly as much data. As an
     ugly workaround... */

  if (res->code == 206) res->code = 200;

  if (http_ver == 0) must_close = 1;

  /* "HTTP/1.x nnn " is 13 characters; the reason phrase follows. */

  res->msg = ck_strdup(cur_line + 13);

  while (1) {

    u8* val;

    NEXT_LINE(); /* Next header or empty line. */

    if (!cur_line) return 2;
    if (!cur_line[0]) break;

    /* Split field name and value */

    val = (u8*) strchr((char*)cur_line, ':');
    if (!val) { ck_free(cur_line); return 2; }
    *val = 0;
    while (isspace(*(++val)));

    if (!strcasecmp((char*)cur_line, "Set-Cookie") ||
        !strcasecmp((char*)cur_line, "Set-Cookie2")) {

      /* We could bother with a proper tokenizer here, but contrary to "teh
         standards", browsers generally don't accept multiple cookies in
         Set-Cookie headers, handle quoted-string encoding inconsistently,
         etc. So let's just grab the first value naively and move on. */

      u8* cval;
      u8* orig_val;

      cval = (u8*) strchr((char*)val, ';');
      if (cval) *cval = 0;
      cval = (u8*) strchr((char*)val, '=');
      if (cval) { *cval = 0; cval++; }

      /* If proper value not found, use NULL name and put whatever was
         found in the value field. */

      if (!cval) { cval = val; val = 0; }

      SET_CK(val, cval, &res->hdr);

      if (val) {

        /* New or drastically changed cookies are noteworthy. */

        orig_val = GET_CK(val, &global_http_par);

        if (!orig_val || (strlen((char*)orig_val) != strlen((char*)cval) &&
            strncmp((char*)cval, (char*)orig_val, 3))) {
          res->cookies_set = 1;
          problem(PROB_NEW_COOKIE, req, res, val, req->pivot, 0);
        }

        /* Set cookie globally, but ignore obvious attempts to delete
           existing ones. */

        if (!ignore_cookies && cval[0])
          SET_CK(val, cval, &global_http_par);

      }

    } else SET_HDR(cur_line, val, &res->hdr);

    /* Content-Type is worth mining for MIME, charset data at this point. */

    if (!strcasecmp((char*)cur_line, "Content-Type")) {

      if (res->header_mime) {

        /* Duplicate Content-Type. Fetch previous value, if different,
           complain. */

        u8* tmp = GET_HDR((u8*)"Content-Type", &res->hdr);
        if (strcasecmp((char*)tmp, (char*)val)) res->warn |= WARN_CFL_HDR;

      } else {

        u8 *tmp = (u8*)strchr((char*)val, ';'), *cset;

        /* Temporarily cut the value at ';' to isolate the MIME type; the
           separator is restored afterwards. */

        if (tmp) {
          *tmp = 0;
          if ((cset = (u8*)strchr((char*)tmp + 1, '=')))
            res->header_charset = ck_strdup(cset + 1);
        }

        res->header_mime = ck_strdup(val);
        if (tmp) *tmp = ';';

      }

    }

  }

  /* At the beginning of the payload again! */

  if (!chunked) {

    /* Identity. Ignore actual C-L data, use just as much as we collected. */

    res->pay_len = data_len - cur_data_off;
    res->payload = ck_alloc(res->pay_len + 1);
    res->payload[res->pay_len] = 0; /* NUL-terminate for safer parsing. */

    memcpy(res->payload, data + cur_data_off, res->pay_len);

  } else {

    u32 chunk_off = 0;

    /* Chunked - we should have the authoritative length of chunk
       contents in total_chunk already, and the overall structure
       validated, so let's just reparse quickly. */

    res->pay_len = total_chunk;
    res->payload = ck_alloc(total_chunk + 1);
    res->payload[res->pay_len] = 0;

    while (1) {

      u32 chunk_len;

      NEXT_LINE();

      if (!cur_line || sscanf((char*)cur_line, "%x", &chunk_len) != 1) break;

      if (cur_data_off + chunk_len > data_len)
        chunk_len = data_len - cur_data_off;

      memcpy(res->payload + chunk_off, data + cur_data_off, chunk_len);

      chunk_off += chunk_len;
      cur_data_off += chunk_len;

      NEXT_LINE();

      if (!chunk_len) break;

    }

  }

  ck_free(cur_line);

  if (compressed) {

    u8* tmp_buf;

    /* Deflate or gzip - zlib can handle both the same way. We lazily allocate
       a SIZE_LIMIT output buffer, then truncate it if necessary. */

    z_stream d;
    s32 err;

    tmp_buf = ck_alloc(SIZE_LIMIT + 1);

    d.zalloc = 0;
    d.zfree  = 0;
    d.opaque = 0;
    d.next_in  = res->payload;
    d.avail_in = res->pay_len;
    d.next_out = tmp_buf;
    d.avail_out = SIZE_LIMIT;

    /* Say hello to third-party vulnerabilities! */

    if (inflateInit2(&d, 32 + 15) != Z_OK) {
      inflateEnd(&d);
      ck_free(tmp_buf);
      return 2;
    }

    err = inflate(&d, Z_FINISH);
    inflateEnd(&d);

    /* Z_BUF_ERROR is tolerated: output that does not fit in the buffer is
       simply cut off. */

    if (err != Z_BUF_ERROR && err != Z_OK && err != Z_STREAM_END) {
      ck_free(tmp_buf);
      return 2;
    }

    ck_free(res->payload);

    bytes_deflated += res->pay_len;

    res->pay_len = SIZE_LIMIT - d.avail_out;
    res->payload = ck_realloc(tmp_buf, res->pay_len + 1);
    res->payload[res->pay_len] = 0;

    bytes_inflated += res->pay_len;

  }

#undef NEXT_LINE

  fprint_response(res);

  return must_close ? 3 : 0;

}
/* Performs a deep free() of struct http_request */
void destroy_request(struct http_request* req) {
u32 i;
for (i=0;i<req->par.c;i++) {
ck_free(req->par.n[i]);
ck_free(req->par.v[i]);
}
ck_free(req->par.t);
ck_free(req->par.n);
ck_free(req->par.v);
ck_free(req->method);
ck_free(req->host);
ck_free(req->orig_url);
ck_free(req);
}
/* Performs a deep free() of struct http_response */
void destroy_response(struct http_response* res) {
u32 i;
for (i=0;i<res->hdr.c;i++) {
ck_free(res->hdr.n[i]);
ck_free(res->hdr.v[i]);
}
ck_free(res->hdr.t);
ck_free(res->hdr.n);
ck_free(res->hdr.v);
ck_free(res->meta_charset);
ck_free(res->header_charset);
ck_free(res->header_mime);
ck_free(res->msg);
ck_free(res->payload);
ck_free(res);
}
/* Removes a queue entry from the doubly-linked request queue and frees it.
   Unless 'keep' is non-zero, the request and response attached to the entry
   are destroyed as well; with 'keep' set they are left alone. Decrements
   queue_cur. */

static void destroy_unlink_queue(struct queue_entry* q, u8 keep) {

  struct queue_entry *before, *after;

  if (!keep) {
    if (q->req) destroy_request(q->req);
    if (q->res) destroy_response(q->res);
  }

  before = q->prev;
  after  = q->next;

  /* Bypass the entry going forward; a missing predecessor means it was
     the list head. */

  if (before) before->next = after;
  else queue = after;

#ifdef QUEUE_FILO
  /* A missing successor means it was the tail. */
  if (!after) q_tail = before;
#endif /* QUEUE_FILO */

  if (after) after->prev = before;

  ck_free(q);

  queue_cur--;

}
/* Tears down a connection entry: destroys the queue entry attached to it
   (passing 'keep' through to destroy_unlink_queue()), removes the entry
   from the connection list, releases SSL state and I/O buffers, closes the
   socket and frees the structure. Decrements conn_cur. */

static void destroy_unlink_conn(struct conn_entry* c, u8 keep) {

  struct conn_entry *before, *after;

  if (c->q) destroy_unlink_queue(c->q, keep);

  before = c->prev;
  after  = c->next;

  /* A missing predecessor means the entry was the list head. */

  if (before) before->next = after;
  else conn = after;

  if (after) after->prev = before;

  if (c->srv_ssl) SSL_free(c->srv_ssl);
  if (c->srv_ctx) SSL_CTX_free(c->srv_ctx);

  ck_free(c->write_buf);
  ck_free(c->read_buf);

  close(c->fd);

  ck_free(c);

  conn_cur--;

}
/* Prepares struct conn_entry for reuse following a clean shutdown of the
   previous exchange: destroys the attached queue entry (passing 'keep'
   through), drops both I/O buffers, and resets the buffer bookkeeping and
   the SSL "wants read / wants write" flags. The socket and SSL objects are
   left untouched. */

static void reuse_conn(struct conn_entry* c, u8 keep) {

  if (c->q) destroy_unlink_queue(c->q, keep);
  c->q = 0;

  ck_free(c->read_buf);
  ck_free(c->write_buf);

  c->write_buf = NULL;
  c->read_buf  = NULL;

  c->write_off = 0;
  c->write_len = 0;
  c->read_len  = 0;

  c->SSL_wr_w_rd = 0;
  c->SSL_rd_w_wr = 0;

}
/* Schedules a new asynchronous request (does not make a copy of the
original http_request struct, may deallocate it immediately or
later on); req->callback() will be invoked when the request is
completed (or fails - maybe right away). */
void async_request(struct http_request* req) {
struct queue_entry *qe;
struct http_response *res;
if (req->proto == PROTO_NONE || !req->callback)
FATAL("uninitialized http_request");
res = ck_alloc(sizeof(struct http_response));
req->addr = maybe_lookup_host(req->host);
/* Don't try to issue extra requests if max_fail
consecutive failures exceeded; but still try to
wrap up the (partial) scan. */
if (req_errors_cur > max_fail) {
DEBUG("!!! Too many subsequent request failures!\n");
res->state = STATE_SUPPRESS;
if (!req->callback(req, res)) {
destroy_request(req);
destroy_response(res);
}
req_dropped++;
return;
}
/* DNS errors mean instant fail. */
if (!req->addr) {
DEBUG("!!! DNS error!\n");
res->state = STATE_DNSERR;
if (!req->callback(req, res)) {
destroy_request(req);
destroy_response(res);
}
req_errors_net++;
conn_count++;
conn_failed++;
return;
}
/* Enforce user limits. */
if (req_count > max_requests) {
DEBUG("!!! Total request limit exceeded!\n");
res->state = STATE_SUPPRESS;
if (!req->callback(req, res)) {
destroy_request(req);
destroy_response(res);
}
req_dropped++;
return;
}
/* OK, looks like we're good to go. Insert the request
into the the queue. */
#ifdef QUEUE_FILO
qe = q_tail;
q_tail = ck_alloc(sizeof(struct queue_entry));
q_tail->req = req;
q_tail->res = res;
q_tail->prev = qe;
if (q_tail->prev) q_tail->prev->next = q_tail;
if (!queue) queue = q_tail;
#else
qe = queue;
queue = ck_alloc(sizeof(struct queue_entry));
queue->req = req;
queue->res = res;
queue->next = qe;
if (queue->next) queue->next->prev = queue;
#endif /* ^QUEUE_FILO */
queue_cur++;
req_count++;
}
/* Inspects the peer certificate of an established SSL connection and files
   problem() reports against the host pivot of the request currently bound
   to the connection (c->q must be set). Reported conditions:

     - certificate outside its validity window (PROB_SSL_CERT_DATE),
     - self-signed certificate or missing issuer / subject data
       (PROB_SSL_SELF_CERT); otherwise the issuer is logged (PROB_SSL_CERT),
     - certificate name not matching the requested host (PROB_SSL_BAD_HOST),
     - no certificate at all (PROB_SSL_NO_CERT).

   This is deliberately shallow - root CAs, ciphers, SSLv2 support and the
   like are not checked, as network-level assessment tools cover them well.
   We might eventually want to check aliases or support TLS SNI.

   Sets c->ssl_checked so the check runs once per connection. */

static void check_ssl(struct conn_entry* c) {

  X509 *p;

  p = SSL_get_peer_certificate(c->srv_ssl);

  if (p) {

    u32 cur_time = time(0);
    char *issuer, *host, *req_host;

    /* Check for certificate expiration: notBefore must lie in the past
       (-1) and notAfter in the future (1). */

    if (ASN1_UTCTIME_cmp_time_t(p->cert_info->validity->notBefore, cur_time)
          != -1 ||
        ASN1_UTCTIME_cmp_time_t(p->cert_info->validity->notAfter, cur_time)
          != 1)
      problem(PROB_SSL_CERT_DATE, c->q->req, 0, 0,
              host_pivot(c->q->req->pivot), 0);

    /* Check for self-signed certs or no issuer data. */

    issuer = X509_NAME_oneline(p->cert_info->issuer, NULL, 0);

    if (!issuer || !p->name || !strcmp(issuer, p->name))
      problem(PROB_SSL_SELF_CERT, c->q->req, 0, (u8*)issuer,
              host_pivot(c->q->req->pivot), 0);
    else
      problem(PROB_SSL_CERT, c->q->req, 0, (u8*)issuer,
              host_pivot(c->q->req->pivot), 0);

    /* NOTE(review): OpenSSL documents OPENSSL_free() as the matching
       release call for this buffer; plain free() presumably relies on the
       default allocator being in use - confirm. */

    free(issuer);

    /* Extract CN= from certificate name, compare to destination host.
       p->name may be NULL (the self-signed check above already allows for
       that), so it must not be handed to strrchr() blindly; a missing name
       is reported as a host mismatch below. */

    host     = p->name ? strrchr(p->name, '=') : NULL;
    req_host = (char*)c->q->req->host;

    if (host) {

      host++;

      /* Wildcard certificate: drop the '*' and compare only the trailing
         ".domain" part of the requested host name. */

      if (host[0] == '*' && host[1] == '.') {
        host++;
        if (strlen(req_host) > strlen(host))
          req_host += strlen(req_host) - strlen(host);
      }

    }

    if (!host || strcasecmp(host, req_host))
      problem(PROB_SSL_BAD_HOST, c->q->req, 0, (u8*)host,
              host_pivot(c->q->req->pivot), 0);

    X509_free(p);

  } else problem(PROB_SSL_NO_CERT, c->q->req, 0, 0,
                 host_pivot(c->q->req->pivot), 0);

  c->ssl_checked = 1;

}
/* Associates a queue entry with an existing connection (if 'use_c' is
non-NULL), or creates a new connection to host (if 'use_c' NULL). */
static void conn_associate(struct conn_entry* use_c, struct queue_entry* q) {
struct conn_entry* c;
if (use_c) {
c = use_c;
c->reused = 1;
} else {
struct sockaddr_in sin;
/* OK, we need to create a new connection list entry and connect
it to a target host. */
c = ck_alloc(sizeof(struct conn_entry));
conn_count++;
c->proto = q->req->proto;
c->addr = q->req->addr;
c->port = q->req->port;
c->fd = socket(PF_INET, SOCK_STREAM, 0);
if (c->fd < 0) {
connect_error:
if (c->fd >=0) close(c->fd);
q->res->state = STATE_LOCALERR;
destroy_unlink_queue(q, q->req->callback(q->req, q->res));
req_errors_net++;
req_errors_cur++;
ck_free(c);
conn_failed++;
return;
}
sin.sin_family = PF_INET;
sin.sin_port = htons(c->port);
memcpy(&sin.sin_addr, &q->req->addr, 4);
fcntl(c->fd, F_SETFL, O_NONBLOCK);
if (connect(c->fd, (struct sockaddr*) &sin, sizeof(struct sockaddr_in)) &&
(errno != EINPROGRESS)) goto connect_error;
/* HTTPS also requires SSL state to be initialized at this point. */
if (c->proto == PROTO_HTTPS) {
c->srv_ctx = SSL_CTX_new(SSLv23_client_method());
if (!c->srv_ctx) goto connect_error;
SSL_CTX_set_mode(c->srv_ctx, SSL_MODE_ENABLE_PARTIAL_WRITE |
SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER);
c->srv_ssl = SSL_new(c->srv_ctx);
if (!c->srv_ssl) {
SSL_CTX_free(c->srv_ctx);
goto connect_error;
}
SSL_set_fd(c->srv_ssl, c->fd);
SSL_set_connect_state(c->srv_ssl);
}
/* Make it official. */
c->next = conn;
conn = c;
if (c->next) c->next->prev = c;
conn_cur++;
}
c->q = q;
q->c = c;
q->res->state = STATE_CONNECT;
c->req_start = c->last_rw = time(0);
c->write_buf = build_request_data(q->req);
c->write_len = strlen((char*)c->write_buf);
}
/* Processes the queue. Returns the number of queue entries remaining
(queue_cur), 0 if none. If any connections exist, waits in poll() for up to
100 ms for socket state changes, then services every connection: errors and
hangups, reads, writes, and timeouts. Afterwards, queue entries not yet bound
to a connection are paired with an idle matching connection or given a new
one, subject to max_conn_host and max_connections. This is the main
routine for the scanning loop. */
u32 next_from_queue(void) {
u32 cur_time = time(0);
if (conn_cur) {
/* The pollfd array is static: it is released and reallocated (sized to
conn_cur) on every call that gets here. */
static struct pollfd* p;
struct conn_entry* c = conn;
u32 i = 0;
/* First, go through all connections, handle connects, SSL handshakes, data
reads and writes, and exceptions. */
if (p) free(p);
p = __DFL_ck_alloc(sizeof(struct pollfd) * conn_cur);
/* Slot i of p[] corresponds to the i-th entry of the connection list. */
while (c) {
p[i].fd = c->fd;
p[i].events = POLLIN | POLLERR | POLLHUP;
/* Ask for writability only if unsent request data remains, or if an
earlier SSL_read() reported that it needs to write. */
if (c->write_len - c->write_off || c->SSL_rd_w_wr)
p[i].events |= POLLOUT;
c = c->next;
i++;
}
/* The return value of poll() is not examined; only revents are used. */
poll(p, conn_cur, 100);
c = conn;
/* NOTE(review): destroy_unlink_conn() decrements conn_cur, which is also
this loop's bound - so whenever connections are destroyed in this pass, the
loop appears to stop before reaching the last entries of the list. Confirm
whether that is intended. */
for (i=0;i<conn_cur;i++) {
/* Saved up front, because 'c' may be freed below. */
struct conn_entry* next = c->next;
/* Connection closed: see if we have any pending data to write. If yes,
fail. If not, try parse_response() to see if we have all the data.
Clean up. */
if (p[i].revents & (POLLERR|POLLHUP)) {
u8 keep;
/* Also entered via goto from the read and write paths below when an
I/O call fails. */
network_error:
keep = 0;
/* Retry requests that were sent on old keep-alive connections
and failed instantly with no data read; might be just that
the server got bored. The queue entry is detached from the connection
here, so it survives destroy_unlink_conn() and is picked up again by
the pairing loop further down. */
if (c->q && c->reused && !c->read_len) {
c->q->res->state = STATE_NOTINIT;
c->q->c = 0;
c->q = 0;
req_retried++;
} else if (c->q) {
/* Request not fully sent, or nothing received at all: plain failure. */
if (c->write_len - c->write_off || !c->read_len) {
c->q->res->state = STATE_CONNERR;
keep = c->q->req->callback(c->q->req, c->q->res);
req_errors_net++;
req_errors_cur++;
} else {
/* Some data was received. parse_response() is called with its last
argument set to 0 (same as the forced final parse in the read path
below); any result other than 2 is accepted as a complete response. */
if (parse_response(c->q->req, c->q->res, c->read_buf,
c->read_len, 0) != 2) {
c->q->res->state = STATE_OK;
keep = c->q->req->callback(c->q->req, c->q->res);
/* The consecutive-error counter is only reset while it is still
within max_fail. */
if (req_errors_cur <= max_fail)
req_errors_cur = 0;
} else {
c->q->res->state = STATE_CONNERR;
keep = c->q->req->callback(c->q->req, c->q->res);
req_errors_net++;
req_errors_cur++;
}
}
}
/* 'keep' is whatever the callback returned; it is passed down and
decides whether the request / response pair is freed. */
destroy_unlink_conn(c, keep);
} else
/* Incoming data (when SSL_write() did not request a read) or
continuation of SSL_read() possible (if SSL_read() wanted to write).
Process data, call parse_response() to see if we have all we wanted.
Update event timers. */
if (((p[i].revents & POLLIN) && !c->SSL_wr_w_rd) ||
((p[i].revents & POLLOUT) && c->SSL_rd_w_wr)) {
if (c->q) {
s32 read_res;
u8 p_ret;
/* Make room for one more READ_CHUNK plus a terminating NUL. */
c->read_buf = ck_realloc(c->read_buf, c->read_len + READ_CHUNK + 1);
if (c->proto == PROTO_HTTPS) {
s32 ssl_err;
c->SSL_rd_w_wr = 0;
read_res = SSL_read(c->srv_ssl, c->read_buf + c->read_len,
READ_CHUNK);
if (!read_res) goto network_error;
/* A negative result is fatal unless SSL merely wants more I/O; in
that case this round counts as a zero-byte read. */
if (read_res < 0) {
ssl_err = SSL_get_error(c->srv_ssl, read_res);
if (ssl_err == SSL_ERROR_WANT_WRITE) c->SSL_rd_w_wr = 1;
else if (ssl_err != SSL_ERROR_WANT_READ) goto network_error;
read_res = 0;
}
} else {
read_res = read(c->fd, c->read_buf + c->read_len, READ_CHUNK);
if (read_res <= 0) goto network_error;
}
bytes_recv += read_res;
c->read_len += read_res;
/* Trim the buffer back to the data actually held, plus the NUL. */
c->read_buf = ck_realloc(c->read_buf, c->read_len + 1);
c->read_buf[c->read_len] = 0; /* NUL-terminate for sanity. */
/* We force final parse_response() if response length exceeded
size_limit by more than READ_CHUNK. The assumption here is that
it is less expensive to redo the connection than it is
to continue receiving an unknown amount of extra data. */
p_ret = parse_response(c->q->req, c->q->res, c->read_buf, c->read_len,
(c->read_len > (size_limit + READ_CHUNK)) ? 0 : 1);
/* Result 0 or 3: the response is complete (3 additionally prevents
connection reuse below). Result 2: error. Anything else is handled as
"keep receiving". */
if (!p_ret || p_ret == 3) {
u8 keep;
c->q->res->state = STATE_OK;
keep = c->q->req->callback(c->q->req, c->q->res);
/* If we got all data without hitting the limit, and if
"Connection: close" is not indicated, we might want
to keep the connection for future use. */
if (c->read_len > (size_limit + READ_CHUNK) || p_ret)
destroy_unlink_conn(c, keep); else reuse_conn(c, keep);
if (req_errors_cur <= max_fail)
req_errors_cur = 0;
} else if (p_ret == 2) {
c->q->res->state = STATE_RESPERR;
destroy_unlink_conn(c, c->q->req->callback(c->q->req, c->q->res));
req_errors_http++;
req_errors_cur++;
} else {
c->last_rw = cur_time;
c->q->res->state = STATE_RECEIVE;
}
} else destroy_unlink_conn(c, 0); /* Unsolicited response! */
} else
/* Write possible (if SSL_read() did not request a write), or
continuation of SSL_write() possible (if SSL_write() wanted to
read). Send data, update timers, etc. */
if (((p[i].revents & POLLOUT) && !c->SSL_rd_w_wr) ||
((p[i].revents & POLLIN) && c->SSL_wr_w_rd)) {
if (c->write_len - c->write_off) {
s32 write_res;
if (c->proto == PROTO_HTTPS) {
s32 ssl_err;
c->SSL_wr_w_rd = 0;
write_res = SSL_write(c->srv_ssl, c->write_buf + c->write_off,
c->write_len - c->write_off);
if (!write_res) goto network_error;
if (write_res < 0) {
ssl_err = SSL_get_error(c->srv_ssl, write_res);
if (ssl_err == SSL_ERROR_WANT_READ) c->SSL_wr_w_rd = 1;
else if (ssl_err != SSL_ERROR_WANT_WRITE) goto network_error;
write_res = 0;
/* First successful SSL_write() on this connection: run the one-time
certificate checks. */
} else if (!c->ssl_checked) check_ssl(c);
} else {
write_res = write(c->fd, c->write_buf + c->write_off,
c->write_len - c->write_off);
if (write_res <= 0) goto network_error;
}
bytes_sent += write_res;
c->write_off += write_res;
c->q->res->state = STATE_SEND;
c->last_rw = cur_time;
}
} else
/* Nothing happened. Check timeouts, kill stale connections.
Active (c->q) connections get checked for total and last I/O
timeouts. Non-active connections must just not exceed
idle_tmout, and are also torn down when tear_down_idle is set. */
if (!p[i].revents) {
u8 keep = 0;
if ((c->q && (cur_time - c->last_rw > rw_tmout ||
cur_time - c->req_start > resp_tmout)) ||
(!c->q && (cur_time - c->last_rw > idle_tmout)) ||
(!c->q && tear_down_idle)) {
if (c->q) {
c->q->res->state = STATE_CONNERR;
keep = c->q->req->callback(c->q->req, c->q->res);
req_errors_net++;
req_errors_cur++;
conn_busy_tmout++;
} else {
conn_idle_tmout++;
/* The flag is cleared as soon as one idle connection goes away. */
tear_down_idle = 0;
}
destroy_unlink_conn(c, keep);
}
}
c = next;
}
}
/* OK, connection-handling affairs taken care of! Next, let's go through all
queue entries NOT currently associated with a connection, and try to
pair them up with something. */
if (queue_cur) {
struct queue_entry *q = queue;
while (q) {
/* Saved up front: conn_associate() destroys 'q' if it fails to set up a
new connection. */
struct queue_entry* next = q->next;
u32 to_host = 0;
if (!q->c) {
struct conn_entry* c = conn;
/* Let's try to find a matching, idle connection first. to_host is
bumped for every connection to the same address that is examined,
whatever its port, protocol or busy state. */
while (c) {
struct conn_entry* cnext = c->next;
if (c->addr == q->req->addr && (++to_host) &&
c->port == q->req->port &&
c->proto == q->req->proto && !c->q) {
conn_associate(c, q);
goto next_q_entry;
}
c = cnext;
}
/* No match. Open a new connection if both the per-host and the global
limits allow it. If we are out of slots, request some other idle
connection to be nuked soon. */
if (to_host < max_conn_host && conn_cur < max_connections) {
conn_associate(0, q);
goto next_q_entry;
} else tear_down_idle = 1;
}
next_q_entry:
q = next;
}
}
return queue_cur;
}
/* Helper function for request / response dumpers: */
static void dump_params(struct param_array* par) {
u32 i;
for (i=0;i<par->c;i++) {
switch (par->t[i]) {
case PARAM_NONE: SAY(" <<<<"); break;
case PARAM_PATH: SAY(" PATH"); break;
case PARAM_PATH_S: SAY(" PT_S"); break;
case PARAM_PATH_C: SAY(" PT_C"); break;
case PARAM_PATH_E: SAY(" PT_E"); break;
case PARAM_PATH_D: SAY(" PT_D"); break;
case PARAM_QUERY: SAY(" QUER"); break;
case PARAM_QUERY_S: SAY(" QR_S"); break;
case PARAM_QUERY_C: SAY(" QR_C"); break;
case PARAM_QUERY_E: SAY(" QR_E"); break;
case PARAM_QUERY_D: SAY(" QR_D"); break;
case PARAM_POST: SAY(" POST"); break;
case PARAM_POST_F: SAY(" FILE"); break;
case PARAM_POST_O: SAY(" OPAQ"); break;
case PARAM_HEADER: SAY(" head"); break;
case PARAM_COOKIE: SAY(" cook"); break;
default: SAY(" ????");
}
SAY(":%-20s = '%s'\n",
par->n[i] ? par->n[i] : (u8*)"-",
par->v[i] ? par->v[i] : (u8*)"-");
}
}
/* Builds a working copy of a request, attached to pivot 'pv'. Protocol,
   host, address, port, user_val and same_sig are always carried over. With
   'all' set, the method and every parameter are copied too; with 'all'
   clear, the method becomes "GET" and only header-type parameters are
   copied (no path, query or POST data). Returns NULL for a NULL request. */

struct http_request* req_copy(struct http_request* req, struct pivot_desc* pv,
                              u8 all) {

  struct http_request* dup;
  u32 idx;

  if (!req) return NULL;

  dup = ck_alloc(sizeof(struct http_request));

  dup->proto  = req->proto;
  dup->method = ck_strdup(all ? req->method : (u8*)"GET");
  dup->host   = ck_strdup(req->host);

  dup->addr     = req->addr;
  dup->port     = req->port;
  dup->pivot    = pv;
  dup->user_val = req->user_val;

  /* Copy all the requested data. */

  for (idx = 0; idx < req->par.c; idx++) {

    if (!all && !HEADER_SUBTYPE(req->par.t[idx])) continue;

    set_value(req->par.t[idx], req->par.n[idx], req->par.v[idx], -1,
              &dup->par);

  }

  memcpy(&dup->same_sig, &req->same_sig, sizeof(struct http_sig));

  return dup;

}
/* Creates a copy of a response. */
struct http_response* res_copy(struct http_response* res) {
struct http_response* ret;
u32 i;
if (!res) return NULL;
ret = ck_alloc(sizeof(struct http_response));
ret->state = res->state;
ret->code = res->code;
ret->msg = res->msg ? ck_strdup(res->msg) : NULL;
ret->warn = res->warn;
for (i=0;i<res->hdr.c;i++)
set_value(res->hdr.t[i], res->hdr.n[i], res->hdr.v[i], -1, &ret->hdr);
ret->pay_len = res->pay_len;
if (res->pay_len) {
ret->payload = ck_alloc(res->pay_len);
memcpy(ret->payload, res->payload, res->pay_len);
}
memcpy(&ret->sig, &res->sig, sizeof(struct http_sig));
ret->sniff_mime_id = res->sniff_mime_id;
ret->decl_mime_id = res->decl_mime_id;
ret->doc_type = res->doc_type;
ret->css_type = res->css_type;
ret->js_type = res->js_type;
ret->json_safe = res->json_safe;
ret->stuff_checked = res->stuff_checked;
ret->scraped = res->scraped;
if (res->meta_charset)
ret->meta_charset = ck_strdup(res->meta_charset);
if (res->header_charset)
ret->header_charset = ck_strdup(res->header_charset);
if (res->header_mime)
ret->header_mime = ck_strdup(res->header_mime);
ret->sniffed_mime = res->sniffed_mime;
return ret;
}
/* Diagnostic helper: prints the basic fields of a request, its original
   and re-serialized URLs, the parameter array, and the raw request data
   as produced by build_request_data(). */

void dump_http_request(struct http_request* r) {

  u8 *synth_url, *raw_req;
  u8 *method   = r->method;
  u8 *orig_url = r->orig_url;
  u8 *ip       = (u8*)&r->addr;

  if (!method) method = (u8*)"(GET)";
  if (!orig_url) orig_url = (u8*)"[none]";

  SAY("\n== HTTP REQUEST %p ==\n\nBasic values:\n", r);

  SAY(" Proto = %u\n", r->proto);
  SAY(" Method = %s\n", method);
  SAY(" Host = %s\n", r->host);
  SAY(" Addr = %u.%u.%u.%u\n", ip[0], ip[1],
      ip[2], ip[3]);
  SAY(" Port = %d\n", r->port);
  SAY(" Xrefs = pivot %p, handler %p, user %d\n", r->pivot,
      r->callback, r->user_val);

  synth_url = serialize_path(r, 1, 0);

  SAY("\nURLs:\n Original = %s\n"
      " Synthetic = %s\n", orig_url,
      synth_url);

  ck_free(synth_url);

  SAY("\nParameter array:\n");

  dump_params(&r->par);

  SAY("\nRaw request data:\n\n");

  raw_req = build_request_data(r);
  SAY("%s\n",raw_req);
  ck_free(raw_req);

  SAY("\n== END OF REQUEST ==\n");

}
/* Dumps HTTP response data, likewise: */
void dump_http_response(struct http_response* r) {
SAY("\n== HTTP RESPONSE %p ==\n\nBasic values:\n", r);
SAY(" State = %u\n", r->state);
SAY(" Response = %u ('%s')\n", r->code, r->msg);
SAY(" Flags = %08x\n", r->warn);
SAY(" Data len = %u\n", r->pay_len);
SAY("\nParameter array:\n");
dump_params(&r->hdr);
if (r->payload) SAY("\nPayload data (%u):\n\n%s\n", r->pay_len, r->payload);
SAY("\n== END OF RESPONSE ==\n");
}
/* Releases all HTTP client state, for memory profiling: the global
   parameter array, every connection (along with whatever is queued on
   it), every remaining queue entry, and the list of DNS entries. */

void destroy_http() {

  struct dns_entry *ent, *ent_next;
  u32 idx = 0;

  while (idx < global_http_par.c) {
    ck_free(global_http_par.n[idx]);
    ck_free(global_http_par.v[idx]);
    idx++;
  }

  ck_free(global_http_par.t);
  ck_free(global_http_par.n);
  ck_free(global_http_par.v);

  /* Each call removes the current list head, so simply repeat until the
     lists run dry. */

  for (; conn; ) destroy_unlink_conn(conn, 0);
  for (; queue; ) destroy_unlink_queue(queue, 0);

  for (ent = dns; ent; ent = ent_next) {
    ent_next = ent->next;
    ck_free(ent->name);
    ck_free(ent);
  }

}
/* Prints scan statistics: elapsed time, request and byte counts,
   compression figures, error counters, connection counters, skipped
   external links and pending requests. 'st_time' is the scan start time
   in milliseconds; the current time is computed here with the same
   formula (seconds * 1000 + microseconds / 1000). */

void http_stats(u64 st_time) {

  u64 en_time, elapsed;
  struct timeval tv;

  gettimeofday(&tv, NULL);

  /* NOTE(review): with a 32-bit tv_sec this multiplication can overflow.
     Left as is on purpose - st_time is produced by the caller, and both
     values must be computed the same way for the difference to be
     meaningful. */

  en_time = tv.tv_sec * 1000 + tv.tv_usec / 1000;
  elapsed = en_time - st_time;

  /* The millisecond field is always 0..999, hence three digits. Wherever
     elapsed time is a divisor, one is added to avoid division by zero;
     the requests-per-connection ratio is likewise reported as 0 when no
     connection was ever attempted.

     NOTE(review): in the requests-per-second figure, '/ 1.15' binds to
     queue_cur alone, not to the difference - kept unchanged, confirm
     intent. */

  SAY("Scan statistics\n"
      "---------------\n\n"
      cGRA " Scan time : " cNOR "%u:%02u:%02u.%03u\n"
      cGRA " HTTP requests : " cNOR "%u sent (%.02f/s), %.02f kB in, "
      "%.02f kB out (%.02f kB/s) \n"
      cGRA " Compression : " cNOR "%.02f kB in, %.02f kB out "
      "(%.02f%% gain) \n"
      cGRA " HTTP exceptions : " cNOR "%u net errors, %u proto errors, "
      "%u retried, %u drops\n"
      cGRA " TCP connections : " cNOR "%u total (%.02f req/conn) \n"
      cGRA " TCP exceptions : " cNOR "%u failures, %u timeouts, %u purged\n"
      cGRA " External links : " cNOR "%u skipped\n"
      cGRA " Reqs pending : " cNOR "%u \n",

      /* hrs */ (u32)(elapsed / 1000 / 60 / 60),
      /* min */ (u32)(elapsed / 1000 / 60) % 60,
      /* sec */ (u32)(elapsed / 1000) % 60,
      /* ms  */ (u32)(elapsed % 1000),

      req_count - queue_cur,
      (float) (req_count - queue_cur / 1.15) * 1000 / (elapsed + 1),
      (float) bytes_recv / 1024, (float) bytes_sent / 1024,
      (float) (bytes_recv + bytes_sent) / 1.024 / (elapsed + 1),

      (float) bytes_deflated / 1024, (float) bytes_inflated / 1024,
      ((float) bytes_inflated - bytes_deflated) / (bytes_inflated +
      bytes_deflated + 1) * 100,

      req_errors_net, req_errors_http, req_retried, req_dropped,

      conn_count, conn_count ? (float) req_count / conn_count : 0.0f,
      conn_failed, conn_busy_tmout, conn_idle_tmout,

      url_scope, queue_cur);

}