skipfish/analysis.c

2423 lines
66 KiB
C

/*
skipfish - content analysis
---------------------------
Author: Michal Zalewski <lcamtuf@google.com>
Copyright 2009, 2010 by Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#define _VIA_ANALYSIS_C
#include "debug.h"
#include "config.h"
#include "types.h"
#include "http_client.h"
#include "database.h"
#include "crawler.h"
#include "analysis.h"
/* Behavior switches read by the analysis code. They are non-static, so
   they are presumably set from outside this file - TODO confirm. */
u8 no_parse, /* Disable HTML link detection */
warn_mixed, /* Warn on mixed content */
log_ext_urls, /* Log all external URLs */
no_forms, /* Do not submit forms */
relaxed_mime, /* Relax about cset / mime */
pedantic_cache; /* Match HTTP/1.0 and HTTP/1.1 */
/* Form autofill hints: two parallel arrays holding addl_form_cnt
   name / value pairs. Entries are appended by add_form_hint() and
   searched (substring match on the name) by make_up_form_value(). */
static u8** addl_form_name;
static u8** addl_form_value;
static u32 addl_form_cnt;
/* Runs some rudimentary checks on top-level pivot HTTP responses by
   comparing selected headers of res against the response recorded for
   the parent, RPAR(req)->res (which may be absent):

     - Server: value differs from the parent's; a missing header is
       treated as the literal "[none]" on either side.
     - Via:    header is present on exactly one of the two responses.
     - X-*:    header exists here but not on the parent, or vice versa.

   Every finding is reported through problem() against req->pivot. */

void pivot_header_checks(struct http_request* req,
                         struct http_response* res) {

  u32 i;
  u8 *par_hdr, *cur_hdr;

  DEBUG_CALLBACK(req, res);

  /* Server: change. */

  cur_hdr = GET_HDR((u8*)"Server", &res->hdr);

  if (!RPAR(req)->res) par_hdr = NULL;
  else par_hdr = GET_HDR((u8*)"Server", &RPAR(req)->res->hdr);

  if (!cur_hdr) cur_hdr = (u8*)"[none]";
  if (!par_hdr) par_hdr = (u8*)"[none]";

  if (strcmp((char*)cur_hdr, (char*)par_hdr))
    problem(PROB_SERVER_CHANGE, req, res, cur_hdr, req->pivot, 0);

  /* Via: appears or disappears. */

  cur_hdr = GET_HDR((u8*)"Via", &res->hdr);

  if (!RPAR(req)->res) par_hdr = NULL;
  else par_hdr = GET_HDR((u8*)"Via", &RPAR(req)->res->hdr);

  /* Compare presence, not addresses: the pointers come from two separate
     lookups, so comparing them directly would also report a change when
     both responses carry the header. */

  if (!cur_hdr != !par_hdr)
    problem(PROB_VIA_CHANGE, req, res, cur_hdr ? cur_hdr : (u8*)"[none]",
            req->pivot, 0);

  /* New X-* header appears. */

  for (i = 0; i < res->hdr.c; i++) {

    if (strncasecmp((char*)res->hdr.n[i], "X-", 2)) continue;

    if (!RPAR(req)->res) par_hdr = NULL;
    else par_hdr = GET_HDR(res->hdr.n[i], &RPAR(req)->res->hdr);

    if (!par_hdr)
      problem(PROB_X_CHANGE, req, res, res->hdr.n[i], req->pivot, 0);

  }

  /* Old X-* header disappears. */

  if (RPAR(req)->res)
    for (i = 0; i < RPAR(req)->res->hdr.c; i++) {

      if (strncasecmp((char*)RPAR(req)->res->hdr.n[i], "X-", 2)) continue;

      cur_hdr = GET_HDR(RPAR(req)->res->hdr.n[i], &res->hdr);

      if (!cur_hdr)
        problem(PROB_X_CHANGE, req, res, RPAR(req)->res->hdr.n[i],
                req->pivot, 0);

    }

}
/* Helper for scrape_response(). Tries to add a previously extracted link,
   also checks for cross-site and mixed content issues and similar woes.

   str    - candidate URL (NUL-terminated; briefly modified in place and
            restored when a mailto: link is logged)
   ref    - request the link was found in; used as the base for parse_url()
   res    - response the link was found in
   subres - 1 - redirect; 2 - IMG; 3 - IFRAME, EMBED, OBJECT, APPLET;
            4 - SCRIPT, LINK REL=STYLESHEET; 0 - everything else
   sure   - non-zero when the caller is certain this is a URL; zero enables
            extra heuristics that drop strings resembling JS fragments */

static void test_add_link(u8* str, struct http_request* ref,
                          struct http_response* res, u8 subres, u8 sure) {

  struct http_request* n;

  DEBUG_CALLBACK(ref,res);
  DEBUG("* Alleged URL = '%s' [%u]\n", str, subres);

  /* Don't add injected links. The comparison length is the marker length
     without its terminating NUL, so this is a true prefix test; including
     the NUL would only match strings that end right after the marker. */

  if (!strncasecmp((char*)str, "skipfish:", sizeof("skipfish:") - 1) ||
      !strncasecmp((char*)str, "//skipfish.invalid/",
                   sizeof("//skipfish.invalid/") - 1) ||
      !strncasecmp((char*)str, "http://skipfish.invalid/",
                   sizeof("http://skipfish.invalid/") - 1)) return;

  /* Don't add links that look like they came from JS code with fragmented HTML
     snippets, etc. */

  if (!sure && (strchr("()\"' +,^", *str) ||
      (*str == '/' && strchr("()\"' +,^", str[1])))) return;

  if ((str[0] == '\'' || str[0] == '"') && (str[1] == '+' || str[1] == ' '))
    return;

  if (!strncasecmp((char*)str, "mailto:", 7)) {

    if (log_ext_urls) {

      /* Report the address without any ?subject=... part, then put the
         '?' back so the caller's buffer is unchanged. */

      u8* qmark = (u8*)strchr((char*)str, '?');
      if (qmark) *qmark = 0;
      problem(PROB_MAIL_ADDR, ref, res, str + 7, host_pivot(ref->pivot), 0);
      if (qmark) *qmark = '?';
    }

    return;
  }

  n = ck_alloc(sizeof(struct http_request));

  n->pivot = ref->pivot;

  if (!parse_url(str, n, ref)) {

    if (R(100) < crawl_prob) maybe_add_pivot(n, NULL, sure ? 2 : 1);

    /* Link to a third-party site? */

    if (!url_allowed_host(n) && !url_trusted_host(n))
      switch (subres) {

        case 0:
          if (log_ext_urls)
            problem(PROB_EXT_LINK, ref, res, str, host_pivot(ref->pivot), 0);
          break;

        case 1:
          if (log_ext_urls)
            problem(PROB_EXT_REDIR, ref, res, str, ref->pivot, 0);
          break;

        case 2:
        case 3:
          problem(PROB_EXT_OBJ, ref, res, str, ref->pivot, 0);
          break;

        case 4:
          problem(PROB_EXT_SUB, ref, res, str, ref->pivot, 0);
          break;

      }

    /* Mixed content? We don't care about <IMG> or redirectors
       here, though. */

    if (ref->proto == PROTO_HTTPS && n->proto == PROTO_HTTP &&
        subres > 2 && warn_mixed)
      problem((subres == 4) ? PROB_MIXED_SUB : PROB_MIXED_OBJ,
              ref, res, str, ref->pivot, 0);

  } else if (!ref->proto) {

    /* Parser didn't recognize the protocol. If it's a
       hierarchical URL (foo://), log it.
       NOTE(review): this tests the referrer's proto field rather than
       the freshly parsed request's - confirm that is intended. */

    u8* x = str;

    while (isalnum(*x)) x++;

    if (str != x && *x == ':' && x[1] == '/')
      problem(PROB_UNKNOWN_PROTO, ref, res, str, ref->pivot, 0);

  }

  destroy_request(n);

}
/* Another scrape_response() helper - decodes HTML escaping,
   maybe also JS escaping, from URLs.

   url     - NUL-terminated input; not modified
   also_js - when non-zero, \xNN, \uNNNN and \<char> sequences are decoded
             in a first pass, before HTML entities are handled

   Returns a copy allocated with ck_alloc(); callers in this file release
   it with ck_free(). Decoded values of 0 or above 0xff are replaced with
   '?', and raw CR / LF characters are dropped from the output. */

static u8* html_decode_param(u8* url, u8 also_js) {

  u32 len = strlen((char*)url);
  u8* ret = ck_alloc(len + 1);
  u32 i, pos = 0;

  /* If directed to do so, decode \x, \u, and \char sequences
     first. */

  if (also_js) {

    for (i = 0; i < len; i++) {

      if (url[i] == '\\') {

        u32 act_val = 0;

        if (url[i+1] == 'x') {
          sscanf((char*)url + i + 2, "%2x", &act_val);
          i += 3;
        } else if (url[i+1] == 'u') {
          sscanf((char*)url + i + 2, "%4x", &act_val);
          i += 5;
        } else {
          act_val = url[i+1];
          i += 1;
        }

        if (!act_val || act_val > 0xff) act_val = '?';

        ret[pos++] = act_val;

      } else ret[pos++] = url[i];

    }

    /* The JS-decoded text becomes the input of the HTML pass; it is
       released again just before returning. */

    ret[pos] = 0;
    url = ret;
    len = pos;
    pos = 0;
    ret = ck_alloc(len + 1);

  }

  /* Next, do old-school HTML decoding. There are many other named
     entities, of course, but the odds of them appearing in URLs
     without %-encoding are negligible. */

  for (i = 0; i < len; i++) {

    if (url[i] == '&') {

      if (!strncasecmp((char*)url + i + 1, "amp;", 4)) {
        ret[pos++] = '&';
        i += 4;
        continue;
      } else if (!strncasecmp((char*)url + i + 1, "quot;", 5)) {
        /* &quot; is the double quote (U+0022), not an apostrophe. */
        ret[pos++] = '"';
        i += 5;
        continue;
      } else if (!strncasecmp((char*)url + i + 1, "lt;", 3)) {
        ret[pos++] = '<';
        i += 3;
        continue;
      } else if (!strncasecmp((char*)url + i + 1, "gt;", 3)) {
        ret[pos++] = '>';
        i += 3;
        continue;
      } else if (url[i+1] == '#') {

        u32 act_val = 0;
        u8 semicol = 0;

        /* The trailing %c captures the character right after the number,
           so unterminated references are left untouched. */

        if (url[i+2] == 'x')
          sscanf((char*)url + i + 3, "%x%c", &act_val, &semicol);
        else sscanf((char*)url + i + 2, "%u%c", &act_val, &semicol);

        if (semicol == ';') {
          if (!act_val || act_val > 0xff) act_val = '?';
          ret[pos++] = act_val;
          i += strcspn((char*)url + i, ";");
          continue;
        }

        /* Fall through and output the sequence as-is. */

      }

    } else if (url[i] == '\r' || url[i] == '\n') continue;

    ret[pos++] = url[i];

  }

  ret[pos] = 0;

  if (also_js) ck_free(url);

  return ret;

}
/* Macro to test for tag names: true when _val starts with _tag (compared
   case-insensitively) and the character right after the name is
   whitespace. A name followed directly by anything else - including the
   end of the string - does not match. Both arguments are evaluated more
   than once, so they must be free of side effects. */

#define ISTAG(_val, _tag) \
  (!strncasecmp((char*)(_val), (_tag), strlen((char*)(_tag))) && \
   isspace((unsigned char)((_val)[strlen((char*)(_tag))])))
/* Macro to find and move past parameter name (saves result in
   _store, NULL if not found). Buffer needs to be NUL-terminated
   at nearest >.

   Only the first case-insensitive occurrence of _param is considered,
   and it counts only when preceded by whitespace; an occurrence at the
   very start of _val is rejected without looking at the byte in front
   of the buffer. Wrapped in do { } while (0) so that the macro behaves
   as a single statement, including in if / else bodies. */

#define FIND_AND_MOVE(_store, _val, _param) do { \
    (_store) = inl_strcasestr((u8*)(_val), (u8*)(_param)); \
    if (_store) { \
      if ((_store) == (u8*)(_val) || !isspace((_store)[-1])) \
        (_store) = NULL; \
      else (_store) += strlen((char*)(_param)); \
    } \
  } while (0)
/* Macro to extract parameter value, handling quotes. A value opening
   with ' or " runs to the matching quote (or end of string); an unquoted
   one stops at '>' or whitespace. The result is a NUL-terminated copy
   made with ck_memdup() and stored in _store. Side effect: _val is moved
   past the opening quote, if there was one. */

#define EXTRACT_ALLOC_VAL(_store, _val) do { \
    u32 _val_len; \
    switch (*(_val)) { \
      case '\'': \
        _val_len = strcspn((char*)++(_val), "'"); \
        break; \
      case '"': \
        _val_len = strcspn((char*)++(_val), "\""); \
        break; \
      default: \
        _val_len = strcspn((char*)(_val), "> \t\r\n"); \
        break; \
    } \
    (_store) = ck_memdup((_val), (_val_len) + 1); \
    (_store)[(_val_len)] = 0; \
  } while (0)
/* Registers one more name / value pair with the form hint system. Both
   hint arrays are grown by one slot and the pointers are stored as
   given - no copy is made, so they have to stay valid for as long as
   hints are looked up. */

void add_form_hint(u8* name, u8* value) {

  const u32 slot = addl_form_cnt;

  addl_form_name  = ck_realloc(addl_form_name, (slot + 1) * sizeof(u8*));
  addl_form_value = ck_realloc(addl_form_value, (slot + 1) * sizeof(u8*));

  addl_form_name[slot]  = name;
  addl_form_value[slot] = value;

  addl_form_cnt = slot + 1;

}
/* Helper for collect_form_data() - picks a plausible value for a form
   field that came without one. The field name is matched (substring,
   via inl_strcasestr) first against the hints registered through
   add_form_hint(), then against the form_suggestion table. When neither
   matches, PROB_UNKNOWN_FIELD is logged and the value stored in the
   table's terminating row is used.

   The returned pointer belongs to the hint arrays or to the table;
   it is not a fresh allocation. */

static u8* make_up_form_value(u8* name, struct http_request* req,
                              struct http_response* res) {

  u32 hint, row;

  /* Registered hints take precedence over the built-in table. */

  for (hint = 0; hint < addl_form_cnt; hint++) {
    if (inl_strcasestr(name, addl_form_name[hint]))
      return addl_form_value[hint];
  }

  for (row = 0; form_suggestion[row][0]; row++) {
    if (inl_strcasestr(name, (u8*)form_suggestion[row][0]))
      return (u8*)form_suggestion[row][1];
  }

  /* Let's hint we have no clue what to do; row now indexes the
     terminating entry. */

  problem(PROB_UNKNOWN_FIELD, req, res, name, host_pivot(req->pivot), 0);

  return (u8*)form_suggestion[row][1];

}
/* Helper for collect_form_data() - checks for a probable anti-XSRF token
   values. Returns 1 when the token looks like one, 0 otherwise.

   A token qualifies when it consists entirely of hex digits, or entirely
   of alphanumerics plus "=+/", and its length and digit / uppercase /
   slash counts fall within the XSRF_B16_* or XSRF_B64_* limits. Values
   that start with the leading digits of the current Unix time are
   rejected up front. */

static u8 maybe_xsrf(u8* token) {

  u8* tmp;
  u32 digit_cnt = 0, upper_cnt = 0, slash_cnt = 0;
  static u8 tm_prefix[8];

  /* Computed once and cached. The write is bounded by the buffer size,
     and the value is cast so that it matches the %lu conversion no
     matter how time_t is defined. */

  if (!tm_prefix[0])
    snprintf((char*)tm_prefix, sizeof(tm_prefix), "%lu",
             (unsigned long)(time(0) / 100000));

  /* Unix time is not a valid token. */

  if (!strncasecmp((char*)token, (char*)tm_prefix, strlen((char*)tm_prefix)))
    return 0;

  tmp = token;

  while (*tmp && (isdigit(*tmp) || strchr("abcdef", tolower(*tmp)))) {
    if (isdigit(*tmp)) digit_cnt++;
    tmp++;
  }

  /* Looks like base 10 or 16... */

  if (!*tmp) {
    u32 len = tmp - token;
    if (len >= XSRF_B16_MIN && len <= XSRF_B16_MAX && digit_cnt >= XSRF_B16_NUM)
      return 1;
    return 0;
  }

  digit_cnt = 0;
  tmp = token;

  while (*tmp && (isalnum(*tmp) || strchr("=+/", *tmp))) {
    if (isdigit(*tmp)) digit_cnt++;
    if (isupper(*tmp)) upper_cnt++;
    if (*tmp == '/') slash_cnt++;
    tmp++;
  }

  /* Looks like base 32 or 64... */

  if (!*tmp) {
    u32 len = tmp - token;
    if (len >= XSRF_B64_MIN && len <= XSRF_B64_MAX && ((digit_cnt >=
        XSRF_B64_NUM && upper_cnt >= XSRF_B64_CASE) || digit_cnt >=
        XSRF_B64_NUM2) && slash_cnt <= XSRF_B64_SLASH) return 1;
    return 0;
  }

  /* Looks like... not a numerical token at all. */

  return 0;

}
/* Another helper for scrape_response(): examines all <input>, <textarea>
   and <select> tags up until </form>, then adds them as parameters to
   current request.

   req      - request being built for the form target; receives the
              parameters and may be switched to POST for file uploads
   orig_req - request whose response contains the form
   orig_res - response containing the form
   cur_str  - text right after the opening <form ...> tag; each tag is
              NUL-terminated temporarily while examined and restored after
   is_post  - non-zero when the form is submitted with POST

   When done, logs exactly one of PROB_PASS_FORM, PROB_VULN_FORM (fields
   present, no XSRF-looking hidden value) or PROB_FORM. */

static void collect_form_data(struct http_request* req,
                              struct http_request* orig_req,
                              struct http_response* orig_res,
                              u8* cur_str, u8 is_post) {

  u8 has_xsrf = 0, pass_form = 0;
  u32 tag_cnt = 0;

  DEBUG("* collect_form_data() entered\n");

  /* Nothing follows the <form> tag. The scanner below always looks at
     the byte after its current position, which would be past the
     terminating NUL here. */

  if (!*cur_str) goto final_checks;

  do {

    u8* tag_end;

    if (*cur_str == '<' && (tag_end = (u8*)strchr((char*)cur_str + 1, '>'))) {

      cur_str++;
      *tag_end = 0;

      if (!strncasecmp((char*)cur_str, "/form", 5)) {
        *tag_end = '>';
        goto final_checks;
      }

      if (ISTAG(cur_str, "input") || ISTAG(cur_str, "textarea") ||
          ISTAG(cur_str, "select")) {

        u8 *tag_name, *tag_value, *tag_type, *clean_name = NULL,
           *clean_value = NULL;

        FIND_AND_MOVE(tag_name, cur_str, "name=");
        FIND_AND_MOVE(tag_value, cur_str, "value=");
        FIND_AND_MOVE(tag_type, cur_str, "type=");

        /* Nameless fields are skipped; nothing has been allocated yet. */

        if (!tag_name) goto next_tag;

        EXTRACT_ALLOC_VAL(tag_name, tag_name);
        clean_name = html_decode_param(tag_name, 0);
        ck_free(tag_name);

        if (tag_value) {
          EXTRACT_ALLOC_VAL(tag_value, tag_value);
          clean_value = html_decode_param(tag_value, 0);
          ck_free(tag_value);
        }

        if (tag_type)
          EXTRACT_ALLOC_VAL(tag_type, tag_type);
        else tag_type = ck_strdup((u8*)"text");

        tag_cnt++;

        if (!strcasecmp((char*)tag_type, "file")) {

          /* File uploads force the submission method to POST. */

          if (!is_post) {
            ck_free(req->method);
            req->method = ck_strdup((u8*)"POST");
            is_post = 1;
          }

          set_value(PARAM_POST_F, clean_name, clean_value ?
                    clean_value : (u8*)"", 0, &req->par);

        } else if (!strcasecmp((char*)tag_type, "reset")) {

          /* Do nothing - do not store. */
          tag_cnt--;

        } else if (!strcasecmp((char*)tag_type, "button") ||
                   !strcasecmp((char*)tag_type, "submit")) {

          set_value(is_post ? PARAM_POST : PARAM_QUERY, clean_name,
                    clean_value ? clean_value : (u8*)"", 0, &req->par);

        } else if (!strcasecmp((char*)tag_type, "checkbox")) {

          /* Turn checkboxes on. */

          set_value(is_post ? PARAM_POST : PARAM_QUERY, clean_name,
                    (u8*)"on", 0, &req->par);

        } else {

          u8* use_value = clean_value;

          /* Don't second-guess hidden fields. */

          if (strcasecmp((char*)tag_type, "hidden") &&
              (!use_value || !use_value[0])) {
            use_value = make_up_form_value(clean_name, orig_req, orig_res);
          } else {
            if (!use_value) use_value = (u8*)"";
          }

          /* Radio buttons are rolled back into a single parameter
             because we always replace offset 0 for given clean_name. */

          set_value(is_post ? PARAM_POST : PARAM_QUERY,
                    clean_name, use_value, 0, &req->par);

          if (!strcasecmp((char*)tag_type, "hidden") &&
              maybe_xsrf(use_value)) has_xsrf = 1;

        }

        /* Look at the decoded field name; the raw copy was released
           right after decoding. */

        if (inl_strcasestr(clean_name, (u8*)"passw")) pass_form = 1;

        ck_free(tag_type);
        ck_free(clean_name);
        ck_free(clean_value);

      }

next_tag:

      *tag_end = '>';

    } else tag_end = cur_str;

    /* Skip to next tag. */

    cur_str = (u8*)strchr((char*)tag_end + 1, '<');

  } while (cur_str);

final_checks:

  if (pass_form) {
    problem(PROB_PASS_FORM, req, orig_res, NULL, req->pivot, 0);
  } else {

    if (tag_cnt && !has_xsrf)
      problem(PROB_VULN_FORM, req, orig_res, NULL, req->pivot, 0);
    else
      problem(PROB_FORM, req, orig_res, NULL, req->pivot, 0);

  }

}
/* Helper for scrape_response() and content_checks: is the
   file mostly ASCII?

   Looks at up to the first 128 bytes of the payload and counts bytes in
   the 0x20 - 0x7f range plus CR and LF. Returns 1 for text (at least 90%
   such bytes, or an empty payload) and 0 for binary data. The verdict is
   cached in res->doc_type (1 - binary, 2 - text), so the first call and
   all later calls give the same answer. */

static u8 is_mostly_ascii(struct http_response* res) {

  u32 i, total, printable = 0;

  if (res->doc_type) return (res->doc_type == 2);

  total = (res->pay_len > 128) ? 128 : res->pay_len;

  if (!total) { res->doc_type = 2; return 1; }

  for (i = 0; i < total; i++)
    if ((res->payload[i] >= 0x20 && res->payload[i] <= 0x7f)
        || (res->payload[i] && strchr("\r\n", res->payload[i])))
      printable++;

  if (printable * 100 / total < 90) {

    DEBUG("* looks like binary data (print = %u, total = %u)\n",
          printable, total);

    /* Binary: report "not ASCII", matching what the cached path above
       returns for doc_type == 1. */

    res->doc_type = 1;
    return 0;

  }

  DEBUG("* looks like text file (print = %u, total = %u)\n",
        printable, total);

  res->doc_type = 2;
  return 1;

}
/* Analyzes response headers (Location, etc), body to extract new links,
   keyword guesses. This code is designed to be simple and fast, but it
   does not even try to understand the intricacies of HTML or whatever
   the response might be wrapped in.

   Does nothing when no_parse is set or the response was scraped before
   (res->scraped). Otherwise:

     - Location and Refresh headers are passed to test_add_link();
     - PASS 1 walks <...> tags, pulls URLs out of known attributes,
       tracks <base href>, and hands <form> tags to collect_form_data();
     - PASS 2 splits the body on whitespace / quotes / angle brackets and
       applies heuristics to spot URLs and e-mail addresses; remaining
       alphanumeric runs are offered to wordlist_add_guess().

   The payload is modified in place while individual tags and segments
   are examined, and restored afterwards. */

void scrape_response(struct http_request* req, struct http_response* res) {

  struct http_request *base = NULL;
  u8* cur_str;
  u32 i;

  DEBUG_CALLBACK(req, res);

  if (no_parse || res->scraped) return;

  res->scraped = 1;

  /* Handle Location, Refresh headers first. */

  if ((cur_str = GET_HDR((u8*)"Location", &res->hdr)))
    test_add_link(cur_str, req, res, 1, 1);

  if ((cur_str = GET_HDR((u8*)"Refresh", &res->hdr)) &&
      (cur_str = (u8*)strchr((char*)cur_str, '=')))
    test_add_link(cur_str + 1, req, res, 1, 1);

  if (!res->payload || !is_mostly_ascii(res)) return;

  /* An empty body has nothing to scrape. Leave now: the tag scanner below
     always inspects the byte after its current position, which would lie
     beyond the terminating NUL. */

  if (!*res->payload) return;

  cur_str = res->payload;

  /* PASS 1: Do a simplified check to what looks like proper,
     known HTML parameters bearing URLs. Note that payload is
     conveniently NUL-terminated. */

  do {

    u8 *tag_end;

    if (*cur_str == '<' && (tag_end = (u8*)strchr((char*)cur_str + 1, '>'))) {

      u32 link_type = 0;
      u8 set_base = 0, parse_form = 0;
      u8 *dirty_url = NULL, *clean_url = NULL, *meta_url = NULL;

      cur_str++;
      *tag_end = 0;

      /* Several tags we need to handle specially, either because they
         denote a particularly interesting content type (marked in
         link_type, see test_add_link()), or because they use a
         non-standard parameter for URL data. */

      if (ISTAG(cur_str, "meta")) {

        link_type = 1;
        FIND_AND_MOVE(dirty_url, cur_str, "content=");

        if (dirty_url) {
          EXTRACT_ALLOC_VAL(meta_url, dirty_url);
          dirty_url = inl_strcasestr(meta_url, (u8*)"URL=");
          if (dirty_url) dirty_url += 4;
        }

      } else if (ISTAG(cur_str, "img")) {

        link_type = 2;
        FIND_AND_MOVE(dirty_url, cur_str, "src=");

      } else if (ISTAG(cur_str, "object") || ISTAG(cur_str, "embed") ||
                 ISTAG(cur_str, "applet") || ISTAG(cur_str, "iframe")) {

        link_type = 3;
        FIND_AND_MOVE(dirty_url, cur_str, "src=");
        if (!dirty_url) FIND_AND_MOVE(dirty_url, cur_str, "codebase=");

      } else if (ISTAG(cur_str, "param") && inl_strcasestr(cur_str,
                 (u8*)"movie")) {

        link_type = 3;
        FIND_AND_MOVE(dirty_url, cur_str, "value=");

      } else if (ISTAG(cur_str, "script")) {

        link_type = 4;
        FIND_AND_MOVE(dirty_url, cur_str, "src=");

      } else if (ISTAG(cur_str, "link") && inl_strcasestr(cur_str,
                 (u8*)"stylesheet")) {

        link_type = 4;
        FIND_AND_MOVE(dirty_url, cur_str, "href=");

      } else if (ISTAG(cur_str, "base")) {

        set_base = 1;
        FIND_AND_MOVE(dirty_url, cur_str, "href=");

      } else if (ISTAG(cur_str, "form")) {

        u8* method;
        parse_form = 1;
        FIND_AND_MOVE(dirty_url, cur_str, "action=");

        /* See if we need to POST this form or not. */

        FIND_AND_MOVE(method, cur_str, "method=");

        if (method && *method) {
          if (strchr("\"'", *method)) method++;
          if (tolower(method[0]) == 'p') parse_form = 2;
        }

      } else {

        /* All other tags - other <link> types, <a>, <bgsound> -
           are handled in a generic way. */

        FIND_AND_MOVE(dirty_url, cur_str, "href=");
        if (!dirty_url) FIND_AND_MOVE(dirty_url, cur_str, "src=");

      }

      /* If we found no URL to speak of, we're done. */

      if (!dirty_url) {
        ck_free(meta_url);
        goto next_tag;
      }

      /* De-quotify and decode the value. */

      EXTRACT_ALLOC_VAL(dirty_url, dirty_url);
      clean_url = html_decode_param(dirty_url, 0);
      ck_free(dirty_url);
      ck_free(meta_url);

      if (!*clean_url) goto next_tag;

      test_add_link(clean_url, base ? base : req, res, link_type, 1);

      /* If we are dealing with a <base> tag, we need to create
         a new dummy request to use as a referrer. */

      if (set_base) {

        struct http_request* n = ck_alloc(sizeof(struct http_request));
        n->pivot = req->pivot;

        /* Adopt the request as the new base only if the URL parsed;
           otherwise release it instead of leaking it.
           NOTE(review): a base adopted earlier is not released when a
           later <base> tag replaces it - confirm nothing still refers
           to it before freeing it here. */

        if (!parse_url(clean_url, n, base ? base : req)) base = n;
        else destroy_request(n);

      } else if (parse_form) {

        /* <form> handling... */

        struct http_request* n = ck_alloc(sizeof(struct http_request));
        n->pivot = req->pivot;

        if (parse_form == 2) {
          ck_free(n->method);
          n->method = ck_strdup((u8*)"POST");
        }

        /* Don't collect form fields, etc, if target is not within the
           scope anyway. */

        DEBUG("* Found form: target %s method %s\n", clean_url, n->method);

        if (!parse_url(clean_url, n, base ? base : req) && url_allowed(n) &&
            R(100) < crawl_prob && !no_forms) {
          collect_form_data(n, req, res, tag_end + 1, (parse_form == 2));
          maybe_add_pivot(n, NULL, 2);
        }

        destroy_request(n);

      }

next_tag:

      *tag_end = '>';

      if (clean_url) ck_free(clean_url);

    } else tag_end = cur_str;

    /* Skip to next tag. */

    cur_str = (u8*)strchr((char*)tag_end + 1, '<');

  } while (cur_str);

  cur_str = res->payload;

  /* PASS 2: Extract links from non-HTML body, JS, etc; add keywords. */

  do {

    u32 clean_len, alpha_cnt = 0, lower_cnt = 0, lead = 0, seg_len;
    u8 *ext, *token, *clean_url, *tmp, *pos_at;
    u8 last = 0, saved;

    /* Skip leading whitespaces, terminators. */

    seg_len = strspn((char*)cur_str, " \t\r\n<>\"'");
    cur_str += seg_len;

    /* If there's a = character preceded only by alnums or underscores,
       skip this chunk (to handle something=http://www.example.com/ neatly) */

    tmp = cur_str;
    while (*tmp && (isalnum(*tmp) || *tmp == '_')) tmp++;
    if (*tmp == '=') cur_str = tmp + 1;

    if (!*cur_str) break;
    seg_len = strcspn((char*)cur_str + 1, " \t\r\n<>\"'") + 1;

    /* Extract the segment, decoding JS and HTML on the go. */

    saved = cur_str[seg_len];
    cur_str[seg_len] = 0;
    clean_url = html_decode_param(cur_str, 1);
    cur_str[seg_len] = saved;

    tmp = clean_url;

    /* We want the entire extracted segment to consist only of nice
       characters we would expect in a URL. If not, panic. */

    while (*tmp) {
      if (!isalnum(*tmp) && !isspace(*tmp) &&
          !strchr("_-.:@/?&=#%;$!+~()[]{}\\|^*", *tmp)) goto url_done;
      tmp++;
    }

    clean_len = tmp - clean_url;

    /* Strip trailing characters that are unlikely to appear in valid URLs
       anyway, and could be a part of some message. */

    while (clean_len &&
           strchr(".,:?!-$&", clean_url[clean_len-1])) clean_len--;

    clean_url[clean_len] = 0;

    /* URL CHECK 1: Things that start with ./ or ../ are obviously URLs.
       We do not make assumptions about syntax such as /foo/, though, as
       it could very well be a regex in a JS block. */

    if (!strncmp((char*)clean_url, "./", 2) || !strncmp((char*)clean_url,
        "../", 3)) {
add_link:
      test_add_link(clean_url, base ? base : req, res, 0, 0);
      goto url_done;
    }

    /* URL CHECK 2: Things that start with <alnum>://<str> are quite
       clearly URLs. */

    while (clean_url[lead] && (isalnum(clean_url[lead]))) lead++;

    if (lead && !strncmp((char*)clean_url + lead, "://", 3) &&
        clean_url[lead + 3]) goto add_link;

    /* URL CHECK 3: If the result ends with <str>.<known_ext>,
       and contains a slash anywhere, assume URL (without that
       slash check, we would get duped by 'domain.com'. */

    if (strchr((char*)clean_url, '/')) {

      i = 0;

      while ((ext = wordlist_get_extension(i++))) {
        u32 ext_len = strlen((char*)ext);

        if (clean_len > ext_len + 2 &&
            !strncasecmp((char*)clean_url + clean_len - ext_len,
            (char*)ext, ext_len) &&
            clean_url[clean_len - ext_len - 1] == '.') goto add_link;

      }

    }

    if (!(pos_at = (u8*)strchr((char*)clean_url, '@'))) {

      /* URL CHECK 4: ?<str>=<str> syntax is strongly indicative of
         an URL (only if not e-mail). */

      u8 *pos_qmark = (u8*)strchr((char*)clean_url, '?'),
         *pos_eq    = (u8*)strchr((char*)clean_url, '='),
         *pos_amp   = (u8*)strchr((char*)clean_url, '&');

      if (pos_qmark && pos_eq && pos_qmark + 1 < pos_eq &&
          pos_eq[1] && (!pos_amp || pos_amp > pos_eq) &&
          pos_eq[1] != '=' && !strchr((char*)clean_url, '(') &&
          !strchr((char*)clean_url, '[') &&
          (u8*)strchr((char*)clean_url, ':') < pos_eq)
        goto add_link;

    } else if (log_ext_urls) {

      /* EMAIL CHECK: If the string uses a limited set of characters,
         starts with alpha, has at least one period after @, and both
         @ and the period are immediately followed by alpha - assume
         e-mail. */

      u8 *pos_dot,
         *pos_qmark = (u8*)strchr((char*)clean_url, '?');

      if (pos_qmark && pos_qmark > pos_at) *pos_qmark = 0;

      lead = 0;

      while (clean_url[lead] && (isalnum(clean_url[lead]) ||
             strchr("._-+@", clean_url[lead]))) lead++;

      pos_dot = (u8*)strchr((char*)pos_at + 1, '.');

      if (!clean_url[lead] && pos_at && pos_dot && isalpha(clean_url[0]) &&
          isalpha(pos_at[1]) && isalpha(pos_dot[1])) {
        problem(PROB_MAIL_ADDR, req, res, clean_url, host_pivot(req->pivot), 0);
        goto url_done;
      }

    }

    /* LAST CHANCE: Try to detect base64; if the segment does not look like
       base64, add each segment to try_list. */

    tmp = clean_url;

    while (*tmp) {
      if (isalpha(*tmp)) {
        alpha_cnt++;
        if (islower(*tmp)) lower_cnt++;
      }
      tmp++;
    }

    if (alpha_cnt > 20 && (lower_cnt * 100 / alpha_cnt) > 35 &&
        (lower_cnt * 100 / alpha_cnt) < 65) goto url_done;

    token = clean_url;

    do {
      while (*token && !isalnum(*token)) token++;
      tmp = token;
      while (*tmp && isalnum(*tmp)) tmp++;
      if (!*tmp) last = 1;
      *tmp = 0;
      if (R(100) < GUESS_PROB) wordlist_add_guess(token);
      token = tmp + 1;
    } while (!last);

url_done:

    ck_free(clean_url);

    cur_str += seg_len;

  } while (*cur_str);

  if (base) destroy_request(base);

  /* Phew! */

}
/* Returns 1 if document looks like standalone CSS, 0 otherwise. The
   verdict is cached in res->css_type (1 - not CSS, 2 - CSS), except when
   there is no payload or it is not mostly ASCII, where 0 is returned
   without caching.

   The scan skips whitespace and comments, accepts @import / @charset
   right away, and otherwise wants a run of selector-like characters
   followed by an opening brace. */

static u8 is_css(struct http_response* res) {

  u8* text = res->payload;
  u8 first = 0, last = 0;

  if (res->css_type) return (res->css_type == 2);
  if (!text || !is_mostly_ascii(res)) return 0;

  do {

    /* Skip whitespaces... */

    while (isspace(*text)) text++;

    /* ...and stop at the end of the payload. Without this, the NUL
       would pass the strchr() tests below (the terminator counts as part
       of the searched string) and the scan would walk off the buffer. */

    if (!*text) break;

    /* Skip HTML, CSS comments. */

    if (!strncmp((char*)text, "<!--", 4)) {
      text += 4;
      continue;
    }

    if (*text == '/') {

      u8 *end;

      if (text[1] == '/') {
        end = text + strcspn((char*)text, "\r\n");
      } else if (text[1] == '*') {
        /* An unterminated comment leaves end NULL, which ends the loop. */
        end = (u8*)strstr((char*)text + 2, "*/");
        if (end) end += 2;
      } else {
        res->css_type = 1;
        return 0;
      }

      text = end;
      continue;

    }

    /* @import or @charset is a clear indicator of CSS. */

    if (*text == '@' && (!strncasecmp((char*)text + 1, "import", 6) ||
        !strncasecmp((char*)text + 1, "charset", 7))) {
      res->css_type = 2;
      return 1;
    }

    /* { preceded with at least one character conforming to the charset
       permitted for CSS selectors, and nothing else, is proof enough. Note
       that we need to handle all the CSS features, e.g.:

         foo, .bar { ... }
         * { ... }
         foo#bar { ... }
         foo[bar~="baz"] { ... }
         foo > bar { ... }

       Joy. */

    if (*text == '{') {

      /* Last non-whitespace before { must conform to a smaller subset that
         does not include =, etc. */

      if (!first || !last ||
          (!isalnum(last) && !strchr("-_]*", last))) {
        res->css_type = 1;
        return 0;
      } else {
        res->css_type = 2;
        return 1;
      }

    }

    if (first) {

      /* Subsequent non-whitespaces can enjoy the whole range of funny
         characters. */

      if (!isalnum(*text) && !strchr(":,.#_-*[]~=\"'>", *text)) {
        res->css_type = 1;
        return 0;
      }

    } else {

      /* First non-whitespace must conform to a narrow set. */

      if (!isalnum(*text) && !strchr(".#_-*", *text)) {
        res->css_type = 1;
        return 0;
      }

      first = 1;

    }

    last = *(text++);

  } while (text && *text);

  /* Reached end without hitting { or @? Not CSS then. */

  res->css_type = 1;
  return 0;

}
/* Returns 1 if document looks like JS / JSON. Note that this makes sense
   only after we ruled CSS via is_css() call (which this function does
   itself before scanning).

   The verdict is cached in res->js_type (1 - not JS, 2 - JS); when one of
   the json_safe prefixes is recognized, res->json_safe is set as well.
   A missing, non-ASCII or CSS payload yields 0 without caching. An empty
   payload, or one that ends inside an unterminated block comment, is
   classified as not JS. */

static u8 is_javascript(struct http_response* res) {

  u8* text = res->payload;
  u8 first = 0, i = 0;

  if (res->js_type) return (res->js_type == 2);
  if (!text || !is_mostly_ascii(res) || is_css(res)) return 0;

  /* The condition is checked before every pass: text is NULL after an
     unterminated block comment, and *text is NUL for an empty payload,
     which would otherwise satisfy the strchr() tests below. */

  while (text && *text) {

    /* Skip HTML, JS comments. Special case for MOTW. */

    if (!strncmp((char*)text, "<!--", 4)) {
      text += 4;

      if (!strncmp((char*)text, " saved from url=", 16)) {
        res->js_type = 1;
        return 0;
      }

      continue;
    }

    if (*text == '/') {

      u8 *end;

      if (text[1] == '/') {
        end = text + strcspn((char*)text, "\r\n");
      } else if (text[1] == '*') {
        end = (u8*)strstr((char*)text + 2, "*/");
        if (end) end += 2;
      } else {
        res->js_type = 1;
        return 0;
      }

      text = end;
      continue;

    }

    /* Known XSSI-busting prefixes imply JavaScript. */

    if (!first)
      while (json_safe[i]) {
        if (!strncasecmp((char*)text, json_safe[i], strlen(json_safe[i]))) {
          res->js_type = 2;
          res->json_safe = 1;
          return 1;
        }
        i++;
      }

    /* Common syntax element that seem convincingly close to JS. */

    if (strchr("({[\"'", *text) || (first && strchr("=;", *text))) {
      res->js_type = 2;
      return 1;
    }

    /* Ignore legal identifiers. */

    if (!isalnum(*text) && !strchr(" \t\r\n_.", *text)) {
      res->js_type = 1;
      return 0;
    }

    first = 1;
    text++;

  }

  res->js_type = 1;
  return 0;

}
/* Checks for XSS, bad coding practices in JavaScript.

   Walks body (NUL-terminated; NULL is tolerated) while tracking quoted
   strings and the most recent word seen outside of them, and reports:

     - PROB_URL_XSS when a string that starts with one of the skipfish
       marker URLs directly follows innerHTML, open, url, href or write;
     - PROB_BODY_XSS when an sfi<tag>v<scan> marker shows up outside of
       quotes; it is attributed to the request returned by
       get_xss_request() when there is one, to req otherwise.

   The scan gives up at the first '/' outside quotes that does not start
   a comment. */

static void check_js_xss(struct http_request* req, struct http_response* res,
                         u8* body) {

  u8* text = body;
  u8 in_quot = 0, prev_space = 1;
  u32 tag_id, scan_id;
  u8* last_word = body;

  if (!text) return;

  /* Test before every pass so that an empty body, or a comment running
     to the end of input (text == NULL), ends the scan cleanly. */

  while (text && *text) {

    /* Skip comments. */

    if (!in_quot && *text == '/') {

      u8 *end;

      if (text[1] == '/') {
        end = text + strcspn((char*)text, "\r\n");
      } else if (text[1] == '*') {
        end = (u8*)strstr((char*)text + 2, "*/");
        if (end) end += 2;
      } else return;

      text = end;
      continue;

    }

    /* Skip the escaped character - unless the backslash is the last
       byte, in which case there is nothing left to skip over. */

    if (*text == '\\') {
      if (!text[1]) break;
      text += 2;
      continue;
    }

    if (!in_quot && (*text == '\'' || *text == '"')) {

      in_quot = *text;

      /* If prev word is write, innerHTML, href, or open,
         and current string starts with //skipfishy thingees,
         complain. Lengths leave out the terminating NUL so that these
         are prefix matches. */

      if ((!strncmp((char*)last_word, "innerHTML", 9) ||
           !strncmp((char*)last_word, "open", 4) ||
           !strncmp((char*)last_word, "url", 3) ||
           !strncmp((char*)last_word, "href", 4) ||
           !strncmp((char*)last_word, "write", 5)) &&
          (!strncasecmp((char*)text + 1, "//skipfish.invalid/",
                        sizeof("//skipfish.invalid/") - 1) ||
           !strncasecmp((char*)text + 1, "http://skipfish.invalid/",
                        sizeof("http://skipfish.invalid/") - 1) ||
           !strncasecmp((char*)text + 1, "skipfish:",
                        sizeof("skipfish:") - 1)))
        problem(PROB_URL_XSS, req, res,
                (u8*)"injected URL in JS/CSS code", req->pivot, 0);

    } else if (in_quot && *text == in_quot) in_quot = 0;

    else if (!in_quot && !strncasecmp((char*)text, "sfi", 3) &&
             sscanf((char*)text, "sfi%06uv%06u", &tag_id, &scan_id) == 2) {

      struct http_request* orig = get_xss_request(tag_id, scan_id);

      if (orig)
        problem(PROB_BODY_XSS, orig, res, (u8*)
                "injected syntax in JS/CSS code", req->pivot, 0);
      else
        problem(PROB_BODY_XSS, req, res, (u8*)
                "injected syntax in JS/CSS code (from previous scans)",
                req->pivot, 0);

    } else if (isspace(*text) || *text == '.') prev_space = 1;

    else if (isalnum(*text) && prev_space) {
      last_word = text;
      prev_space = 0;
    }

    text++;

  }

}
static void detect_mime(struct http_request*, struct http_response*);
static void check_for_stuff(struct http_request*, struct http_response*);
/* Extracts date from HTTP headers.

   Elements are taken in whatever order they appear, separated by spaces,
   commas or dashes: an optional leading day-of-week word, a month name
   (recognized by its second and third letters, lowercase), a day, an
   nn:nn:nn time and a year. Parsing stops as soon as month, day, year
   and time are all known.

   Returns the date packed as decimal digits YYYYMMDDhhmmss, so results
   can be compared numerically, or 0 when the string cannot be parsed.
   An unrecognized month name is stored as month 0. */

static u64 get_date(u8* str) {

#ifdef LOG_STDERR
  u8* orig = str;
#endif /* LOG_STDERR */

  u8 got_dow = 0;
  s64 month = -1, day = -1, year = -1, hr = -1, min = -1, sec = -1;

next_elem:

  if (month != -1 && day != -1 && year != -1 && hr != -1) {

    u64 ret = (sec + (min * 100) + (hr * 10000) +
               (day * 1000000LL) + (month * 100000000LL) +
               (year * 10000000000LL));

    DEBUG("* get_date() '%s' => %llu\n", orig, (long long)ret);
    return ret;

  }

  if (!*str) {
    DEBUG("* get_date() '%s' => FAIL (1)\n", orig);
    return 0;
  }

  /* Skip spaces, commas, dashes. */

  while (*str && strchr(" ,-", *str)) str++;

  /* Skip day of week. */

  if (!got_dow) {
    got_dow = 1;
    while (*str && isalpha(*str)) str++;
    goto next_elem;
  }

  /* Check if next element is a month. */

  if (month == -1 && isalpha(*str)) {

    month = 0;

#define SEL(_a,_b) ((((u32)(_a)) << 8) | (_b))

    /* Cases fall through on purpose: entering at a given month bumps the
       counter once for it and once for every month before it. The
       str[1] test keeps str[2] within the string for one-letter words. */

    if (str[1])
      switch (SEL(str[1], str[2])) {
        case SEL('e', 'c'): month++; /* fall through */
        case SEL('o', 'v'): month++; /* fall through */
        case SEL('c', 't'): month++; /* fall through */
        case SEL('e', 'p'): month++; /* fall through */
        case SEL('u', 'g'): month++; /* fall through */
        case SEL('u', 'l'): month++; /* fall through */
        case SEL('u', 'n'): month++; /* fall through */
        case SEL('a', 'y'): month++; /* fall through */
        case SEL('p', 'r'): month++; /* fall through */
        case SEL('a', 'r'): month++; /* fall through */
        case SEL('e', 'b'): month++; /* fall through */
        case SEL('a', 'n'): month++;
      }

    while (*str && isalpha(*str)) str++;
    goto next_elem;

  }

  /* Something that starts with a digit should be a day. */

  if (day == -1 && isdigit(*str)) {
    day = atoi((char*)str);
    while (*str && isdigit(*str)) str++;
    goto next_elem;
  }

  /* If we already have a day, the next digit-based thing
     might be time or year. Time would have nn:... */

  if (hr == -1 && isdigit(*str) && str[1] && str[2] == ':') {

    /* Scan into plain unsigned ints so the conversion specifiers match
       their arguments; fields that are not present count as zero rather
       than leaving -1 in the packed result. */

    unsigned int t_hr = 0, t_min = 0, t_sec = 0;

    sscanf((char*)str, "%2u:%2u:%2u", &t_hr, &t_min, &t_sec);

    hr  = t_hr;
    min = t_min;
    sec = t_sec;

    while (*str && (isdigit(*str) || *str == ':')) str++;
    goto next_elem;

  }

  /* And year wouldn't. */

  if (year == -1 && isdigit(*str)) {
    year = atoi((char*)str);
    if (year < 1000) year += 1900; /* 94 -> 1994, 104 -> 2004, 04 -> 1904 */
    if (year < 1970) year += 100; /* 1994 -> 1994, 2004 -> 2004, 1904 -> 2004 */
    while (*str && isdigit(*str)) str++;
    goto next_elem;
  }

  DEBUG("* get_date() '%s' => FAIL (2)\n", orig);
  return 0;

}
/* Case-insensitive test for whether _str starts with the string literal
   _lit. The compared length is sizeof(_lit) - 1, i.e. the literal without
   its NUL terminator, so longer strings sharing the prefix still match. */

#define CC_HAS_PREFIX(_str, _lit) \
  (!strncasecmp((char*)(_str), (_lit), sizeof(_lit) - 1))

/* Analyzes response headers and body to detect stored XSS, redirection,
   401, 500 codes, exception messages, source code, caching issues, etc.

   req - request that produced the response; findings are filed against
         req->pivot.
   res - response to examine. The payload is temporarily NUL-patched while
         attribute values and inline script / style bodies are inspected,
         and restored afterwards. res->meta_charset, res->warn and
         res->sniffed_mime may be updated as a side effect. */

void content_checks(struct http_request* req, struct http_response* res) {
  u8* tmp;
  u32 tag_id, scan_id;
  u8 high_risk = 0;

  DEBUG_CALLBACK(req, res);

  /* CHECK 1: Caching header logic. */

  if (req->proto == PROTO_HTTP) {

    u8 *exp = GET_HDR((u8*)"Expires", &res->hdr),
       *dat = GET_HDR((u8*)"Date", &res->hdr),
       *prg = GET_HDR((u8*)"Pragma", &res->hdr),
       *cc  = GET_HDR((u8*)"Cache-Control", &res->hdr),
       cacheable = 0;

    /* Caching intent per protocol version: 0 = none stated,
       1 = caching restricted, 2 = caching permitted. */

    u8 h10c = 0, h11c = 0;

    /* Check implicit cacheability. */

    if ((!req->method || !strcmp((char*)req->method, "GET")))
      cacheable = 1;

    /* Determine HTTP/1.0 caching intent. Handle Expires: -1, etc, gracefully.
       Note that 'Expires' without 'Date' may cause problems with Opera,
       so we complain about this in pedantic mode. */

    if (exp) {

      if (!isalpha(exp[0])) {
        h10c = 1;
      } else if (dat) {
        if (get_date(exp) <= get_date(dat)) h10c = 1; else h10c = 2;
      } else if (cacheable && pedantic_cache) {
        problem(PROB_CACHE_LOW, req, res, (u8*)"'Expires' without 'Date'",
                req->pivot, 0);
      }

    }

    /* Check 'Pragma', and complain if there's a conflicting 'Expires'
       intent already detected. */

    if (prg && strstr((char*)prg, "no-cache")) {

      if (h10c == 2)
        problem(res->cookies_set ? PROB_CACHE_HI : PROB_CACHE_LOW, req, res,
                (u8*)"conflicting 'Expires' and 'Pragma'", req->pivot, 0);

      h10c = 1;

    }

    /* Check HTTP/1.1 intent next. Detect conflicting keywords. */

    if (cc) {

      if (strstr((char*)cc, "no-cache") || strstr((char*)cc, "no-store") ||
          strstr((char*)cc, "private") || strstr((char*)cc, "max-age=0")) {

        h11c = 1;

        if (strstr((char*)cc, "public"))
          problem(res->cookies_set ? PROB_CACHE_HI : PROB_CACHE_LOW, req, res,
                  (u8*)"conflicting 'Cache-Control' data", req->pivot, 0);

      } else h11c = 2;

    }

    DEBUG("* CACHE: 1.0 intent = %u, 1.1 intent = %u, impl = %u, cookie = %u\n",
          h10c, h11c, cacheable, res->cookies_set);

    /* Perform complex checks against all the data collected: */

    if (res->cookies_set && (h10c == 2 || h11c == 2)) {

      /* Explicit public intent on a cookie-setting response. This is
         obviously bad. */

      problem(PROB_CACHE_HI, req, res, (u8*)
              "caching explicitly permitted on a 'Set-Cookie' response",
              req->pivot, 0);

    } else if (res->cookies_set && !h10c && !h11c && cacheable) {

      /* Implicitly cacheable Set-Cookie response with no intent specified.
         Likewise, makes us unhappy. */

      problem(PROB_CACHE_HI, req, res, (u8*)
              "implicitly cacheable 'Set-Cookie' response",
              req->pivot, 0);

    } else if (h10c && h11c && h10c != h11c) {

      /* Explicit, conflicting HTTP/1.1 and HTTP/1.0 intents are likely
         a problem for many implementations (although earlier checks
         already caught cacheability of sensitive responses). */

      problem(PROB_CACHE_LOW, req, res, (u8*)
              "conflicting 'Cache-Control' and 'Expires' / 'Pragma'",
              req->pivot, 0);

    } else if (pedantic_cache && h11c == 1 && !h10c && cacheable) {

      /* Legacy HTTP/1.0 proxies may fall back to implicit cacheability
         even if HTTP/1.1 intent is specified, but no HTTP/1.0 one.
         Complain about this in pedantic mode. */

      problem(PROB_CACHE_LOW, req, res, (u8*)
              "caching restricted by 'Cache-Control', but not 'Expires'",
              req->pivot, 0);

    }

  }

  /* CHECK 2: Log troubling response codes. */

  if (res->code == 401)
    problem(PROB_AUTH_REQ, req, res, NULL, req->pivot, 0);
  else if (res->code >= 500)
    problem(PROB_SERV_ERR, req, res, NULL, req->pivot, 0);

  /* Nothing else to look at without a body. */

  if (!res->pay_len) return;

  if (!is_mostly_ascii(res)) goto binary_checks;

  /* CHECK 3: Parse HTML to detect various XSS issues. We are trying to do
     a much better job parsing HTML than in scrape_page(), because we do not
     want any false positives.

     It is a bit silly to have two separate HTML parsers in the code,
     so we should fix it at some point. */

  /* Full-body CSS / JS responses should be additionally checked for JS
     quoting errors. */

  if (is_javascript(res) || is_css(res)) check_js_xss(req, res, res->payload);

  /* Responses that do not contain the term "function" are much more likely to
     be dynamic JSON than just static scripts. Let's try to highlight these. */

  if (is_javascript(res) && !res->json_safe &&
      (!req->method || !strcmp((char*)req->method, "GET")) &&
      !strstr((char*)res->payload, "function ") &&
      !strstr((char*)res->payload, "function("))
    problem(PROB_JS_XSSI, req, res, NULL, req->pivot, 0);

  tmp = res->payload;

  do {

    if (*tmp == '<') {

      u8* tag_name;
      u32 len = strcspn((char*)++tmp, "> \t\r\n"), space_len;
      u8  remote_script = 0;

      /* Grab tag name. */

      tag_name = ck_memdup(tmp, len + 1);
      tag_name[len] = 0;
      tmp += len;

      /* Handle all parameters. */

      while (*tmp && *tmp != '>') {

        u8* param_name;
        u8* clean_val = NULL;
        u8* sfi_pos;

        /* Shoo, whitespaces. */

        space_len = strspn((char*)tmp, " \t\r\n");
        tmp += space_len;

        /* Grab parameter name. */

        len = strcspn((char*)tmp, "=> \t\r\n");
        param_name = ck_memdup(tmp, len + 1);
        param_name[len] = 0;
        tmp += len;

        /* Name followed by '='? Grab value. */

        if (*tmp == '=') {

          u32 vlen;
          u8  save, quote = 0;

          tmp++;

          if (*tmp == '\'') {
            quote = 1;
            vlen = strcspn((char*)++tmp, "'");
          } else if (*tmp == '"') {
            quote = 1;
            vlen = strcspn((char*)++tmp, "\"");
          } else vlen = strcspn((char*)tmp, " \t\r\n>");

          /* Terminate the value in place just long enough to decode it. */

          save = tmp[vlen];
          tmp[vlen] = 0;
          clean_val = html_decode_param(tmp, 0);
          tmp[vlen] = save;

          /* Step over the closing quote only if there is one. With an
             unterminated quoted value, tmp + vlen is the payload's NUL
             terminator and must not be skipped. */

          tmp += vlen;
          if (quote && *tmp) tmp++;

        }

        if (!strcasecmp((char*)tag_name, "script") &&
            !strcasecmp((char*)param_name, "src")) remote_script = 1;

        /* CHECK 3.1: URL XSS and redirection issues. */

        if ((!strcasecmp((char*)param_name, "href") ||
            !strcasecmp((char*)param_name, "src") ||
            !strcasecmp((char*)param_name, "action") ||
            (!strcasecmp((char*)param_name, "value") &&
             strcasecmp((char*)tag_name, "input")) ||
            !strcasecmp((char*)param_name, "codebase")) && clean_val) {

          if (CC_HAS_PREFIX(clean_val, "skipfish://"))
            problem(PROB_URL_XSS, req, res, tag_name, req->pivot, 0);

          /* A bit hairy, but in essence, links to attacker-supplied
             stylesheets or scripts are super-bad; OBJECTs and IFRAMEs
             are sorta noteworthy, depending on context; and A links
             are usually of little relevance. */

          if (CC_HAS_PREFIX(clean_val, "http://skipfish.invalid/") ||
              CC_HAS_PREFIX(clean_val, "//skipfish.invalid/")) {

            if (!strcasecmp((char*)tag_name, "script") ||
                !strcasecmp((char*)tag_name, "link"))
              problem(PROB_USER_URL_ACT, req, res, tag_name, req->pivot, 0);
            else if (!strcasecmp((char*)tag_name, "a"))
              problem(PROB_USER_LINK, req, res, tag_name, req->pivot, 0);
            else
              problem(PROB_USER_URL, req, res, tag_name, req->pivot, 0);

          }

        }

        /* CHECK 3.2: META REFRESH XSSes, redirection. Also extract
           charset, if available */

        if (!strcasecmp((char*)tag_name, "meta") &&
            !strcasecmp((char*)param_name, "content") && clean_val) {

          u8* url = inl_strcasestr(clean_val, (u8*)"URL=");
          u8  semi_safe = 0;

          if (url) {

            url += 4;
            if (*url == '\'' || *url == '"') { url++; semi_safe = 1; }

            if (CC_HAS_PREFIX(url, "http://skipfish.invalid/") ||
                CC_HAS_PREFIX(url, "//skipfish.invalid/"))
              problem(PROB_URL_REDIR, req, res, (u8*)"injected URL in META refresh",
                      req->pivot, 0);

            /* Unescaped semicolon in Refresh headers is unsafe with MSIE6. */

            if (CC_HAS_PREFIX(url, "skipfish://") ||
                (!semi_safe && strchr((char*)url, ';')))
              problem(PROB_URL_XSS, req, res, (u8*)"injected URL in META refresh",
                      req->pivot, 0);

          } else {

            u8* cset = inl_strcasestr(clean_val, (u8*)"charset=");

            if (cset) {
              if (res->meta_charset) {
                /* A second, different META charset counts as a conflict. */
                if (strcasecmp((char*)cset+8, (char*)res->meta_charset))
                  res->warn |= WARN_CFL_HDR;
              } else res->meta_charset = ck_strdup(cset + 8);
            }

          }

        }

        /* CHECK 3.3: JavaScript on*=, CSS style= parameters. */

        if ((!strncasecmp((char*)param_name, "on", 2) ||
            !strcasecmp((char*)param_name, "style")) && clean_val)
          check_js_xss(req, res, clean_val);

        /* CHECK 3.4: What looks like our sfi tags, not fully escaped. */

        if ((sfi_pos = (u8*)strstr((char*)param_name, "sfi")) &&
            sscanf((char*)sfi_pos, "sfi%06uv%06u", &tag_id, &scan_id) == 2) {

          struct http_request* orig = get_xss_request(tag_id, scan_id);

          if (orig)
            problem(PROB_BODY_XSS, orig, res, (u8*)
                    "injected 'sfi..' parameter value in a tag",
                    req->pivot, 0);
          else
            problem(PROB_BODY_XSS, req, res, (u8*)
                    "injected 'sfi...' parameter value in a tag (from previous"
                    " scans)", req->pivot, 0);

        }

        ck_free(clean_val);
        ck_free(param_name);

      }

      /* CHECK 3.5: Phew. Parameters analyzed. Let's check for XSS tags... */

      if (sscanf((char*)tag_name, "sfi%06uv%06u", &tag_id, &scan_id) == 2) {

        struct http_request* orig = get_xss_request(tag_id, scan_id);

        if (orig)
          problem(PROB_BODY_XSS, orig, res, (u8*)
                  "injected '<sfi...>' tag seen in HTML", req->pivot, 0);
        else
          problem(PROB_BODY_XSS, req, res, (u8*)
                  "injected '<sfi...>' tag seen in HTML (from previous scans)",
                  req->pivot, 0);

      }

      /* CHECK 3.6: Non-remote SCRIPTs are of interest to JS XSS logic. */

      if (!strcasecmp((char*)tag_name, "script") && !remote_script) {

        u8* next = inl_strcasestr(tmp, (u8*)"</script>");

        /* Cut the buffer at the closing tag for the duration of the check. */

        if (next) *next = 0;
        check_js_xss(req, res, tmp);
        if (next) *next = '<';

        /* Don't skip right away, as there might be some nested HTML inside. */

      }

      /* CHECK 3.7: ...and so are stylesheets. */

      if (!strcasecmp((char*)tag_name, "style")) {

        u8* next = inl_strcasestr(tmp, (u8*)"</style>");

        if (next) *next = 0;
        check_js_xss(req, res, tmp);
        if (next) *next = '<';

      }

      ck_free(tag_name);

    } else tmp = (u8*)strchr((char*)tmp, '<');

  } while (tmp && *tmp);

  /* CHECK 4: Known exceptions / error pages, etc. */

  check_for_stuff(req, res);

binary_checks:

  detect_mime(req, res);

  res->sniffed_mime = (u8*)mime_map[res->sniff_mime_id][0];

  /* No MIME checks on Content-Disposition: attachment responses. */

  if ((tmp = GET_HDR((u8*)"Content-Disposition", &res->hdr)) &&
      inl_strcasestr(tmp, (u8*)"attachment")) return;

  if (!relaxed_mime) {

    /* CHECK 5A: Renderable documents that are not CSS or static JS are of
       particular interest when it comes to MIME / charset mistakes. */

    if (is_mostly_ascii(res) && !is_css(res) && (!is_javascript(res) ||
        (!strstr((char*)res->payload, "function ") &&
        !strstr((char*)res->payload, "function(")))) high_risk = 1;

  } else {

    /* CHECK 5B: Documents with skipfish signature strings echoed back
       are of particular interest when it comes to MIME / charset mistakes. */

    u8* sig = (u8*)strstr((char*)res->payload, "sfi");

    /* Looking for "sfi", a digit, and 'v' at offset 9. Offsets 4..8 are
       tested for NUL first so that offset 9 is never read past the end of
       a payload that stops shortly after "sfi". */

    if ((sig && isdigit(sig[3]) && sig[4] && sig[5] && sig[6] && sig[7] &&
         sig[8] && sig[9] == 'v') ||
        strstr((char*)res->payload, "sfish") ||
        strstr((char*)res->payload, "skipfish")) high_risk = 1;

  }

  /* CHECK 6: MIME mismatch? Ignore cases where the response had a valid
     MIME type declared in headers, but we failed to map it to a known
     value... and also failed to sniff. */

  if (res->sniff_mime_id != res->decl_mime_id &&
      !(res->header_mime && !res->decl_mime_id &&
      (res->sniff_mime_id == MIME_ASC_GENERIC ||
       res->sniff_mime_id == MIME_BIN_GENERIC)))
    problem(high_risk ? PROB_BAD_MIME_DYN : PROB_BAD_MIME_STAT,
            req, res, res->sniffed_mime, req->pivot, 0);

  /* CHECK 7: application/octet-stream or text/plain; both have
     unintended consequences (but complain only if CHECK 6 didn't fire). */

  else if (res->header_mime && (!strcasecmp((char*)res->header_mime,
           "application/octet-stream") || !strcasecmp((char*)res->header_mime,
           "text/plain")))
    problem(high_risk ? PROB_GEN_MIME_DYN : PROB_GEN_MIME_STAT,
            req, res, res->sniffed_mime, req->pivot, 0);

  /* CHECK 8: Missing charset? */

  if (is_mostly_ascii(res) && !res->meta_charset && !res->header_charset)
    problem(high_risk ? PROB_BAD_CSET_DYN : PROB_BAD_CSET_STAT,
            req, res, 0, req->pivot, 0);

  /* CHECK 9: Duplicate, inconsistent C-T or charset? */

  if (is_mostly_ascii(res) && (res->warn & WARN_CFL_HDR ||
      (res->meta_charset && res->header_charset &&
      strcasecmp((char*)res->meta_charset, (char*)res->header_charset))))
    problem(high_risk ? PROB_CFL_HDRS_DYN : PROB_CFL_HDRS_STAT,
            req, res, 0, req->pivot, 0);

  /* CHECK 10: Made up charset? */

  if (res->header_charset || res->meta_charset) {

    u32 i = 0;

    /* The header charset takes precedence over the META one here. */

    while (valid_charsets[i]) {
      if (!strcasecmp((char*)valid_charsets[i], (char*)(res->header_charset ?
          res->header_charset : res->meta_charset))) break;
      i++;
    }

    if (!valid_charsets[i])
      problem(high_risk ? PROB_BAD_CSET_DYN : PROB_BAD_CSET_STAT,
              req, res, res->header_charset ?
              res->header_charset : res->meta_charset, req->pivot, 0);

  }

}

#undef CC_HAS_PREFIX
/* Does MIME detection on a message. Most of this logic is reused from
ratproxy, with some improvements and additions. */
static void detect_mime(struct http_request* req, struct http_response* res) {
u8 sniffbuf[SNIFF_LEN];
if (res->sniff_mime_id) return;
/* First, classify declared response MIME, if any. */
if (res->header_mime) {
u32 i;
for (i=0;i<MIME_COUNT;i++) {
u32 j = 0;
/* Leading ? means we need to do a prefix match. */
while (mime_map[i][j]) {
if (mime_map[i][j][0] == '?') {
if (!strncasecmp((char*)mime_map[i][j] + 1, (char*)res->header_mime,
strlen((char*)mime_map[i][j] + 1))) break;
} else {
if (!strcasecmp((char*)mime_map[i][j], (char*)res->header_mime))
break;
}
j++;
}
if (mime_map[i][j]) break;
}
if (i != MIME_COUNT) res->decl_mime_id = i;
}
/* Next, work out the actual MIME that should be set. Mostly
self-explanatory. */
memcpy(sniffbuf, res->payload,
(res->pay_len > SNIFF_LEN - 1) ? (SNIFF_LEN - 1) : res->pay_len);
sniffbuf[SNIFF_LEN - 1] = 0;
if (is_mostly_ascii(res)) {
/* ASCII checks. */
if (is_javascript(res)) {
res->sniff_mime_id = MIME_ASC_JAVASCRIPT;
return;
}
if (is_css(res)) {
res->sniff_mime_id = MIME_ASC_CSS;
return;
}
if (!strncmp((char*)sniffbuf, "%!PS", 4)) {
res->sniff_mime_id = MIME_ASC_POSTSCRIPT;
return;
}
if (!strncmp((char*)sniffbuf, "{\\rtf", 5)) {
res->sniff_mime_id = MIME_ASC_RTF;
return;
}
/* Adobe PDF (may be mostly ASCII in some cases). */
if (!strncmp((char*)sniffbuf, "%PDF", 4)) {
res->sniff_mime_id = MIME_EXT_PDF;
return;
}
/* Several types of XML documents, taking into account that
they might be missing their xmlns=, etc: */
if (strstr((char*)sniffbuf, "<OpenSearch")) {
res->sniff_mime_id = MIME_XML_OPENSEARCH;
return;
}
if (strstr((char*)sniffbuf, "<channel>") ||
strstr((char*)sniffbuf, "<description>") ||
strstr((char*)sniffbuf, "<item>") ||
strstr((char*)sniffbuf, "<rdf:RDF") ||
strstr((char*)sniffbuf, "<rss")) {
res->sniff_mime_id = MIME_XML_RSS;
return;
}
if (strstr((char*)sniffbuf, "<feed") ||
strstr((char*)sniffbuf, "<updated>")) {
res->sniff_mime_id = MIME_XML_ATOM;
return;
}
if (strstr((char*)sniffbuf, "<wml") ||
inl_strcasestr(sniffbuf, (u8*)"<!DOCTYPE wml ")) {
res->sniff_mime_id = MIME_XML_WML;
return;
}
if (strstr((char*)sniffbuf, "<svg")) {
res->sniff_mime_id = MIME_XML_SVG;
return;
}
if (strstr((char*)sniffbuf, "<cross-domain-policy>")) {
res->sniff_mime_id = MIME_XML_CROSSDOMAIN;
return;
}
if (strstr((char*)sniffbuf, "<?xml")) {
if (inl_strcasestr(sniffbuf, (u8*)"<!DOCTYPE html"))
res->sniff_mime_id = MIME_XML_XHTML;
else
res->sniff_mime_id = MIME_XML_GENERIC;
return;
}
/* Do an unconvincing check for HTML once we ruled out
known XML cases. */
if (inl_strcasestr(sniffbuf, (u8*)"<html") ||
inl_strcasestr(sniffbuf, (u8*)"<meta") ||
inl_strcasestr(sniffbuf, (u8*)"<head") ||
inl_strcasestr(sniffbuf, (u8*)"<title") ||
inl_strcasestr(sniffbuf, (u8*)"<body") ||
inl_strcasestr(sniffbuf, (u8*)"<!doctype") ||
inl_strcasestr(sniffbuf, (u8*)"<--") ||
inl_strcasestr(sniffbuf, (u8*)"<style") ||
inl_strcasestr(sniffbuf, (u8*)"<script") ||
inl_strcasestr(sniffbuf, (u8*)"<font") ||
inl_strcasestr(sniffbuf, (u8*)"<span") ||
inl_strcasestr(sniffbuf, (u8*)"<div") ||
inl_strcasestr(sniffbuf, (u8*)"<img") ||
inl_strcasestr(sniffbuf, (u8*)"<br") ||
inl_strcasestr(sniffbuf, (u8*)"<td") ||
inl_strcasestr(sniffbuf, (u8*)"<h1") ||
inl_strcasestr(sniffbuf, (u8*)"<li") ||
inl_strcasestr(sniffbuf, (u8*)"href=")) {
res->sniff_mime_id = MIME_ASC_HTML;
return;
}
/* OK, we're out of ideas. Let's do a last-resort check for XML again,
now that HTML is also off the table. */
if (strstr((char*)sniffbuf, "<![CDATA[") ||
strstr((char*)sniffbuf, "</") || strstr((char*)sniffbuf, "/>")) {
res->sniff_mime_id = MIME_XML_GENERIC;
return;
}
res->sniff_mime_id = MIME_ASC_GENERIC;
} else {
/* Binary checks. Start with simple images (JPG, GIF, PNG, TIFF, BMP). */
if (sniffbuf[0] == 0xFF && sniffbuf[1] == 0xD8 && sniffbuf[2] == 0xFF) {
res->sniff_mime_id = MIME_IMG_JPEG;
return;
}
if (!strncmp((char*)sniffbuf, "GIF8", 4)) {
res->sniff_mime_id = MIME_IMG_GIF;
return;
}
if (sniffbuf[0] == 0x89 && !strncmp((char*)sniffbuf + 1, "PNG", 3)) {
res->sniff_mime_id = MIME_IMG_PNG;
return;
}
if (!strncmp((char*)sniffbuf, "BM", 2)) {
res->sniff_mime_id = MIME_IMG_BMP;
return;
}
if (!strncmp((char*)sniffbuf, "II", 2) && sniffbuf[2] == 42 /* dec */) {
res->sniff_mime_id = MIME_IMG_TIFF;
return;
}
/* Next: RIFF containers (AVI, ANI, WAV). */
if (!strncmp((char*)sniffbuf, "RIFF", 4)) {
if (sniffbuf[8] == 'A') {
if (sniffbuf[9] == 'C')
res->sniff_mime_id = MIME_IMG_ANI;
else
res->sniff_mime_id = MIME_AV_AVI;
} else res->sniff_mime_id = MIME_AV_WAV;
return;
}
/* Cursor / ICO drama (we roll it back into BMP, because few sites
make the distinction anyway, and cursors are unlikely to be
attacker-supplied)... */
if (res->pay_len > 3 && !sniffbuf[0] && !sniffbuf[1] &&
sniffbuf[2] && !sniffbuf[3]) {
res->sniff_mime_id = MIME_IMG_BMP;
return;
}
/* Windows Media container (WMV, WMA, ASF). */
if (sniffbuf[0] == 0x30 && sniffbuf[1] == 0x26 && sniffbuf[2] == 0xB2) {
res->sniff_mime_id = MIME_AV_WMEDIA;
return;
}
/* MPEG formats, Ogg Vorbis, QuickTime, RealAudio, RealVideo. */
if (sniffbuf[0] == 0xFF && sniffbuf[1] == 0xFB) {
res->sniff_mime_id = MIME_AV_MP3;
return;
}
if (sniffbuf[0] == 0x00 && sniffbuf[1] == 0x00 && sniffbuf[2] == 0x01 &&
(sniffbuf[3] >> 4) == 0x0B) {
res->sniff_mime_id = MIME_AV_MPEG;
return;
}
if (!strncasecmp((char*)sniffbuf, "OggS", 4)) {
res->sniff_mime_id = MIME_AV_OGG;
return;
}
if (sniffbuf[0] == 0x28 && !strncasecmp((char*)sniffbuf + 1, "RMF", 3)) {
res->sniff_mime_id = MIME_AV_RA;
return;
}
if (sniffbuf[0] == 0x2E && !strncasecmp((char*)sniffbuf + 1, "RMF", 3)) {
res->sniff_mime_id = MIME_AV_RV;
return;
}
if (!strncmp((char*)sniffbuf + 4, "free", 4) ||
!strncmp((char*)sniffbuf + 4, "mdat", 4) ||
!strncmp((char*)sniffbuf + 4, "wide", 4) ||
!strncmp((char*)sniffbuf + 4, "pnot", 4) ||
!strncmp((char*)sniffbuf + 4, "skip", 4) ||
!strncmp((char*)sniffbuf + 4, "moov", 4)) {
/* Oookay, that was weird... */
res->sniff_mime_id = MIME_AV_QT;
return;
}
/* Flash and FLV. */
if (!strncmp((char*)sniffbuf, "FLV", 3)) {
res->sniff_mime_id = MIME_AV_FLV;
return;
}
if (!strncmp((char*)sniffbuf, "FCWS", 4) ||
!strncmp((char*)sniffbuf, "CWS", 3)) {
res->sniff_mime_id = MIME_EXT_FLASH;
return;
}
/* Adobe PDF. */
if (!strncmp((char*)sniffbuf, "%PDF", 4)) {
res->sniff_mime_id = MIME_EXT_PDF;
return;
}
/* JAR versus ZIP. A bit tricky, because, well, they are both just
ZIP archives. */
if (!strncmp((char*)sniffbuf, "PK", 2) &&
sniffbuf[2] < 6 && sniffbuf[3] < 7) {
if (inl_memmem(res->payload, res->pay_len, "META-INF/", 9))
res->sniff_mime_id = MIME_EXT_JAR;
else
res->sniff_mime_id = MIME_BIN_ZIP;
return;
}
/* Java class files. */
if (sniffbuf[0] == 0xCA && sniffbuf[1] == 0xFE && sniffbuf[2] == 0xBA &&
sniffbuf[3] == 0xBE) {
res->sniff_mime_id = MIME_EXT_CLASS;
return;
}
/* The joy of Microsoft Office containers. */
if (res->pay_len > 512 && sniffbuf[0] == 0xD0 && sniffbuf[1] == 0xCF &&
sniffbuf[2] == 0x11 && sniffbuf[3] == 0xE0) {
switch (sniffbuf[512]) {
case 0xEC: res->sniff_mime_id = MIME_EXT_WORD; return;
case 0xFD:
case 0x09: res->sniff_mime_id = MIME_EXT_EXCEL; return;
case 0x00:
case 0x0F:
case 0xA0: res->sniff_mime_id = MIME_EXT_PPNT; return;
}
}
/* GZIP. Unfortunately, tar has no discernible header to speak of,
so we just let it slide - few sites are serving tars on purpose
anyway. */
if (sniffbuf[0] == 0x1F && sniffbuf[1] == 0x8B && sniffbuf[2] == 0x08) {
res->sniff_mime_id = MIME_BIN_GZIP;
return;
}
/* CAB. */
if (sniffbuf[0] == 'M' && sniffbuf[1] == 'S' && sniffbuf[2] == 'C' &&
sniffbuf[3] == 'F' && !sniffbuf[4]) {
res->sniff_mime_id = MIME_BIN_CAB;
return;
}
res->sniff_mime_id = MIME_BIN_GENERIC;
}
/* No more ideas? */
}
/* "Stuff" means various error messages and cool, unusual content. For large
files, this function may be sort-of expensive, but we need to look through
the entire response, as not all messages are full-page errors, etc. */
static void check_for_stuff(struct http_request* req,
struct http_response* res) {
u8 sniffbuf[SNIFF_LEN];
u8* tmp;
if (!res->pay_len || !is_mostly_ascii(res) || res->stuff_checked) return;
/* We will use sniffbuf for checks that do not need to look through the
entire file. */
memcpy(sniffbuf, res->payload,
(res->pay_len > SNIFF_LEN - 1) ? (SNIFF_LEN - 1) : res->pay_len);
sniffbuf[SNIFF_LEN - 1] = 0;
res->stuff_checked = 1;
/* Assorted interesting error messages. */
if (strstr((char*)res->payload, "<font face=\"Arial\" size=2>error '")) {
problem(PROB_ERROR_POI, req, res, (u8*)"Microsoft runtime error",
req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "<span><H1>Server Error in '")) {
problem(PROB_ERROR_POI, req, res, (u8*)
"ASP.NET Yellow Screen of Death", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "<title>JRun Servlet Error</title>")) {
problem(PROB_ERROR_POI, req, res, (u8*)"JRun servlet error", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "Exception in thread \"") ||
strstr((char*)res->payload, "at java.lang.") ||
(strstr((char*)res->payload, "\tat ") &&
(strstr((char*)res->payload, ".java:")))) {
problem(PROB_ERROR_POI, req, res, (u8*)"Java exception trace", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "<b>Fatal error</b>:")) {
problem(PROB_ERROR_POI, req, res, (u8*)"PHP error", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "<b>Warning</b>: MySQL: ") ||
strstr((char*)res->payload, "java.sql.SQLException")) {
problem(PROB_ERROR_POI, req, res, (u8*)"SQL server error", req->pivot, 0);
return;
}
if ((tmp = (u8*)strstr((char*)res->payload, "ORA-")) &&
isdigit(tmp[4]) && tmp[9] == ':') {
problem(PROB_ERROR_POI, req, res, (u8*)"Oracle server error", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "[an error occurred while processing")) {
problem(PROB_ERROR_POI, req, res, (u8*)"SHTML error", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "Traceback (most recent call last):")) {
problem(PROB_ERROR_POI, req, res, (u8*)"Python error", req->pivot, 0);
return;
}
/* Interesting files. */
if (strstr((char*)res->payload, "ADDRESS=(PROTOCOL=")) {
problem(PROB_FILE_POI, req, res, (u8*)"SQL configuration or logs", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "<cross-domain-policy>")) {
problem(PROB_FILE_POI, req, res, (u8*)
"Flash cross-domain policy", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "<access-policy>")) {
problem(PROB_FILE_POI, req, res, (u8*)"Silverlight cross-domain policy",
req->pivot, 0);
return;
}
if (inl_strcasestr(sniffbuf, (u8*)"\nAuthType ") ||
inl_strcasestr(sniffbuf, (u8*)"\nOptions ") ||
inl_strcasestr(sniffbuf, (u8*)"\n<Directory ") ||
inl_strcasestr(sniffbuf, (u8*)"\nRequire ")) {
problem(PROB_FILE_POI, req, res, (u8*)"Apache config file", req->pivot, 0);
return;
}
if (res->sniff_mime_id == MIME_ASC_GENERIC) {
u8* x = sniffbuf;
/* Generic something:something[:...] password syntax. */
while (*x && (isalnum(*x) || strchr("._-+$", *x)) &&
(x - sniffbuf) < 64) x++;
if (x != sniffbuf && *x == ':') {
x++;
while (*x && (isalnum(*x) || strchr("./*!+=$", *x)) &&
(x - sniffbuf) < 128) x++;
if (*x == ':' || !*x || isspace(*x))
problem(PROB_FILE_POI, req, res, (u8*)
"Possible password file", req->pivot, 0);
}
}
if (inl_strcasestr(sniffbuf, (u8*)"\nDisallow:") ||
inl_strcasestr(sniffbuf, (u8*)"\rDisallow:")) {
problem(PROB_FILE_POI, req, res, (u8*)"robots.txt ruleset", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "<web-app")) {
problem(PROB_FILE_POI, req, res, (u8*)"web.xml config file", req->pivot, 0);
return;
}
if (res->sniff_mime_id == MIME_ASC_GENERIC) {
u8* x = sniffbuf;
u32 slashes = 0;
/* Five slashes in the first line in a plaintext file should be a
reasonably good check for CVS. */
while (*x && *x != '\n' &&
(x - sniffbuf) < 256) {
if (*x == '/') slashes++;
x++;
}
if (slashes == 5)
problem(PROB_FILE_POI, req, res, (u8*)"CVS RCS data", req->pivot, 0);
}
if (strstr((char*)sniffbuf, "<wc-entries") ||
strstr((char*)sniffbuf, "svn:special svn:")) {
problem(PROB_FILE_POI, req, res, (u8*)"SVN RCS data", req->pivot, 0);
return;
}
/* This should also cover most cases of Perl, Python, etc. */
if (!strncmp((char*)sniffbuf, "#!/", 3)) {
problem(PROB_FILE_POI, req, res, (u8*)"shell script", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "<?") && strstr((char*)res->payload, "?>") &&
!strstr((char*)sniffbuf, "<?xml") && !strstr((char*)res->payload, "# ?>") &&
!strstr((char*)res->payload, "<?import")) {
problem(PROB_FILE_POI, req, res, (u8*)"PHP source", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "<%@") && strstr((char*)res->payload, "%>")) {
problem(PROB_FILE_POI, req, res, (u8*)"JSP source", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "<%") && strstr((char*)res->payload, "%>")) {
problem(PROB_FILE_POI, req, res, (u8*)"ASP source", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "\nimport java.")) {
problem(PROB_FILE_POI, req, res, (u8*)"Java source", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "\n#include")) {
problem(PROB_FILE_POI, req, res, (u8*)"C/C++ source", req->pivot, 0);
return;
}
if (strstr((char*)res->payload, "End Sub\n") ||
strstr((char*)res->payload, "End Sub\r")) {
problem(PROB_FILE_POI, req, res, (u8*)"Visual Basic source", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "0] \"GET /")) {
problem(PROB_FILE_POI, req, res, (u8*)"Apache server logs", req->pivot, 0);
return;
}
if (strstr((char*)sniffbuf, "0, GET, /")) {
problem(PROB_FILE_POI, req, res, (u8*)"IIS server logs", req->pivot, 0);
return;
}
/* Plain text, and every line contains ;, comma, or |? */
if (res->sniff_mime_id == MIME_ASC_GENERIC) {
u8* cur = res->payload;
u8 all_delim = 0;
do {
u8 *eol = (u8*)strchr((char*)cur, '\n');
u32 del = strcspn((char*)cur, ",|;\n");
if (!cur[del] || cur[del] == '\n') {
all_delim = 0;
break;
}
all_delim = 1;
cur = eol + 1;
} while (cur && *cur);
if (all_delim) {
problem(PROB_FILE_POI, req, res,
(u8*)"Delimited database dump", req->pivot, 0);
return;
}
}
/* Excel is almost always interesting on its own. */
if (res->sniff_mime_id == MIME_EXT_EXCEL) {
problem(PROB_FILE_POI, req, res, (u8*)"Excel spreadsheet", req->pivot, 0);
return;
}
/* This is a bit dodgy, but the most prominent sign of non-browser JS on
Windows is the instantiation of obscure ActiveX objects to access local
filesystem, create documents, etc. Unfortunately, some sites may also be
creating obscure ActiveX objects; these would likely need to be just
blacklisted here. */
if (is_javascript(res) && strstr((char*)res->payload, "new ActiveXObject(") &&
!strstr((char*)res->payload, "XMLHTTP") &&
!strstr((char*)res->payload, "ShockwaveFlash")) {
problem(PROB_FILE_POI, req, res, (u8*)"server-side JavaScript source",
req->pivot, 0);
return;
}
/* Three very lame rules follow; help improve. */
if (inl_strcasestr(res->payload, (u8*)"\nCREATE TABLE") ||
inl_strcasestr(res->payload, (u8*)"\nSELECT * FROM") ||
inl_strcasestr(res->payload, (u8*)"\nDROP TABLE")) {
problem(PROB_FILE_POI, req, res, (u8*)"SQL script", req->pivot, 0);
return;
}
if (inl_strcasestr(sniffbuf, (u8*)"@echo ")) {
problem(PROB_FILE_POI, req, res, (u8*)"DOS batch script", req->pivot, 0);
return;
}
if (inl_strcasestr(res->payload, (u8*)"(\"Wscript.")) {
problem(PROB_FILE_POI, req, res, (u8*)"Windows shell script", req->pivot, 0);
return;
}
}