skipfish/database.c


/*
skipfish - database & crawl management
--------------------------------------
Author: Michal Zalewski <lcamtuf@google.com>
Copyright 2009, 2010 by Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#define _VIA_DATABASE_C
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include "debug.h"
#include "config.h"
#include "types.h"
#include "http_client.h"
#include "database.h"
#include "crawler.h"
#include "string-inl.h"
struct pivot_desc root_pivot;
u8 **deny_urls, /* List of banned URL substrings */
**deny_strings, /* List of banned page substrings */
**allow_urls, /* List of required URL substrings */
**allow_domains, /* List of allowed vhosts */
**trust_domains; /* List of trusted vhosts */
u32 num_deny_urls,
num_deny_strings,
num_allow_urls,
num_allow_domains,
num_trust_domains;
u32 max_depth = MAX_DEPTH,
max_children = MAX_CHILDREN,
max_guesses = MAX_GUESSES;
u8 dont_add_words; /* No auto dictionary building */
struct kw_entry {
u8* word; /* Keyword itself */
u32 hit_cnt; /* Number of confirmed sightings */
u8 is_ext; /* Is an extension? */
u8 hit_already; /* Had its hit count bumped up? */
u32 total_age; /* Total age (in scan cycles) */
u32 last_age; /* Age since last hit */
};
static struct kw_entry*
keyword[WORD_HASH]; /* Keyword collection (bucketed) */
static u32 keyword_cnt[WORD_HASH]; /* Per-bucket keyword counts */
static u8 **extension, /* Extension list */
**guess; /* Keyword candidate list */
static u32 guess_cnt, /* Number of keyword candidates */
extension_cnt, /* Number of extensions */
keyword_total_cnt, /* Current keyword count */
keyword_orig_cnt; /* At-boot keyword count */
static u32 cur_xss_id, scan_id; /* Stored XSS manager IDs */
static struct http_request** xss_req; /* Stored XSS manager req cache */
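/* For orientation, a sketch of how one crawled URL maps onto the pivot tree
   built below (illustrative example; the URL and parameter are made up):

     http://example.com/app/login.php?user=bob

     [root]                    - PIVOT_ROOT (root_pivot)
       http://example.com/     - PIVOT_SERV, one per host:port:proto triple
         app                   - PIVOT_DIR, one per intermediate path element
           login.php           - PIVOT_FILE, last path element (has params)
             user              - PIVOT_PARAM, one per query / POST parameter

   Parameter pivots hang off the final path node but are not nested inside
   each other; the observed value ('bob') ends up on the 'user' pivot's
   try_list. */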
/* Maps a parsed URL (in req) to the pivot tree, creating or modifying nodes
as necessary, and scheduling them for crawl. This should be called only
on requests that were *not* yet retrieved. */
void maybe_add_pivot(struct http_request* req, struct http_response* res,
u8 via_link) {
struct pivot_desc *cur = NULL;
u32 i, par_cnt = 0, path_cnt = 0, pno;
u8 ends_with_slash = 0;
#ifdef LOG_STDERR
u8* url = serialize_path(req, 1, 1);
DEBUG("--- New pivot requested: %s (%d) --\n", url, via_link);
ck_free(url);
#endif /* LOG_STDERR */
if (!req) FATAL("Invalid request data.");
/* Initialize root pivot if not done already. */
if (!root_pivot.type) {
root_pivot.type = PIVOT_ROOT;
root_pivot.state = PSTATE_DONE;
root_pivot.linked = 2;
root_pivot.fuzz_par = -1;
root_pivot.name = ck_strdup((u8*)"[root]");
}
if (!url_allowed(req)) { url_scope++; return; }
/* Count the number of path and query parameters in the request. */
for (i=0;i<req->par.c;i++) {
if (QUERY_SUBTYPE(req->par.t[i]) || POST_SUBTYPE(req->par.t[i])) par_cnt++;
if (PATH_SUBTYPE(req->par.t[i])) {
      if (req->par.t[i] == PARAM_PATH && !req->par.n[i] && !req->par.v[i][0])
        ends_with_slash = 1; else ends_with_slash = 0;
path_cnt++;
}
/* While we're at it, try to learn new keywords. */
if (PATH_SUBTYPE(req->par.t[i]) || QUERY_SUBTYPE(req->par.t[i])) {
if (req->par.n[i]) wordlist_confirm_word(req->par.n[i]);
wordlist_confirm_word(req->par.v[i]);
}
}
/* Try to find pivot point for the host. */
for (i=0;i<root_pivot.child_cnt;i++) {
cur = root_pivot.child[i];
if (!strcasecmp((char*)cur->req->host, (char*)req->host) &&
cur->req->port == req->port &&
cur->req->proto == req->proto) break;
}
if (i == root_pivot.child_cnt) {
/* No server pivot found, we need to create one. */
cur = ck_alloc(sizeof(struct pivot_desc));
root_pivot.child = ck_realloc(root_pivot.child,
(root_pivot.child_cnt + 1) * sizeof(struct pivot_desc*));
root_pivot.child[root_pivot.child_cnt++] = cur;
cur->type = PIVOT_SERV;
cur->state = PSTATE_FETCH;
cur->linked = 2;
cur->fuzz_par = -1;
cur->parent = &root_pivot;
/* Copy the original request, sans path. Create a dummy
root dir entry instead. Derive pivot name by serializing
the URL of the associated stub request. */
cur->req = req_copy(req, cur, 0);
set_value(PARAM_PATH, NULL, (u8*)"", -1, &cur->req->par);
cur->name = serialize_path(cur->req, 1, 0);
cur->req->callback = fetch_dir_callback;
/* If matching response not provided, schedule request. */
if (res && !par_cnt && path_cnt == 1) {
cur->res = res_copy(res);
fetch_dir_callback(req, cur->res);
} else async_request(cur->req);
wordlist_confirm_word(req->host);
}
/* One way or the other, 'cur' now points to server pivot. Let's
walk through all path elements, and follow or create sub-pivots
for them. */
pno = 0;
for (i=0;i<path_cnt;i++) {
u8* pname;
u32 c, ccnt;
while (!PATH_SUBTYPE(req->par.t[pno])) pno++;
/* Bail out on the trailing NULL-'' indicator, if present. It is
used to denote a directory, and will always be the last path
element. */
if (i == path_cnt - 1 && req->par.t[pno] == PARAM_PATH &&
!req->par.n[pno] && !req->par.v[pno][0]) break;
pname = req->par.n[pno] ? req->par.n[pno] : req->par.v[pno];
ccnt = cur->child_cnt;
/* Try to find a matching node. */
for (c=0;c<ccnt;c++)
if (!(is_c_sens(cur) ? strcmp : strcasecmp)((char*)pname,
(char*)cur->child[c]->name)) {
cur = cur->child[c];
if (cur->linked < via_link) cur->linked = via_link;
break;
}
if (c == ccnt) {
/* Node not found. We need to create one. */
struct pivot_desc* n;
/* Enforce user limits. */
if ((i + 1) >= max_depth || cur->child_cnt > max_children)
return;
/* Create and link back to parent. */
n = ck_alloc(sizeof(struct pivot_desc));
cur->child = ck_realloc(cur->child, (cur->child_cnt + 1) *
sizeof(struct pivot_desc*));
cur->child[cur->child_cnt++] = n;
n->parent = cur;
n->linked = via_link;
n->name = ck_strdup(pname);
/* Copy the original request, then copy over path up to the
current point. */
n->req = req_copy(req, n, 0);
for (c=0;c<=pno;c++)
if (PATH_SUBTYPE(req->par.t[c]))
set_value(req->par.t[c], req->par.n[c], req->par.v[c], -1,
&n->req->par);
/* If name is parametric, indicate which parameter to fuzz. */
if (req->par.n[pno]) n->fuzz_par = n->req->par.c - 1;
else n->fuzz_par = -1;
/* Do not fuzz out-of-scope or limit exceeded dirs... */
if ((i + 1) == max_depth - 1) n->no_fuzz = 1;
if (i != path_cnt - 1) {
/* This is not the last path segment, so let's assume a "directory"
(hierarchy node, to be more accurate), and schedule directory
tests. */
set_value(PARAM_PATH, NULL, (u8*)"", -1, &n->req->par);
n->type = PIVOT_DIR;
n->req->callback = fetch_dir_callback;
if (!url_allowed(n->req)) n->no_fuzz = 2;
/* Subdirectory tests require parent directory 404 testing to complete
first. If these are still pending, wait a bit. */
if (cur->state > PSTATE_IPS_CHECK) {
n->state = PSTATE_FETCH;
/* If this actually *is* the last parameter, taking into account the
early-out hack mentioned above, and we were offered a response -
make use of it and don't schedule a new request. */
if (i == path_cnt - 2 && ends_with_slash && res) {
n->res = res_copy(res);
fetch_dir_callback(n->req, n->res);
} else async_request(n->req);
} else n->state = PSTATE_PENDING;
} else {
/* Last segment. If no parameters, copy response body, mark type as
"unknown", schedule extra checks. */
if (!url_allowed(n->req)) n->no_fuzz = 2;
if (!par_cnt) {
n->type = PIVOT_UNKNOWN;
n->res = res_copy(res);
n->req->callback = fetch_unknown_callback;
if (cur->state > PSTATE_IPS_CHECK) {
n->state = PSTATE_FETCH;
/* If we already have a response, call the callback directly
(it will schedule further requests on its own). */
if (!res) {
n->state = PSTATE_FETCH;
async_request(n->req);
} else fetch_unknown_callback(n->req, n->res);
} else n->state = PSTATE_PENDING;
} else {
/* Parameters found. Assume file, schedule a fetch. */
n->type = PIVOT_FILE;
n->req->callback = fetch_file_callback;
if (cur->state > PSTATE_IPS_CHECK) {
n->state = PSTATE_FETCH;
async_request(n->req);
} else n->state = PSTATE_PENDING;
}
}
cur = n;
}
/* At this point, 'cur' points to a newly created or existing node
for the path element. If this element is parametric, make sure
that its value is on the 'try' list. */
if (req->par.n[pno]) {
for (c=0;c<cur->try_cnt;c++)
if (cur->try_list[c] && !(is_c_sens(cur) ? strcmp : strcasecmp)
((char*)req->par.v[pno], (char*)cur->try_list[c])) break;
/* Not found on the list - try adding. */
if (c == cur->try_cnt) {
cur->try_list = ck_realloc(cur->try_list, (cur->try_cnt + 1) *
sizeof(u8*));
cur->try_list[cur->try_cnt++] = ck_strdup(req->par.v[pno]);
if (cur->state == PSTATE_DONE)
crawl_par_trylist_init(cur);
}
}
pno++;
}
/* Phew! At this point, 'cur' points to the final path element, and now,
we just need to take care of parameters. Each parameter has its own
pivot point, and a full copy of the request. */
pno = 0;
for (i=0;i<par_cnt;i++) {
u8* pname;
u32 c, ccnt;
while (!QUERY_SUBTYPE(req->par.t[pno]) && !POST_SUBTYPE(req->par.t[pno]))
pno++;
pname = req->par.n[pno] ? req->par.n[pno] : (u8*)"[blank]";
ccnt = cur->child_cnt;
/* Try to find a matching node. */
for (c=0;c<ccnt;c++)
if (!(is_c_sens(cur) ? strcmp : strcasecmp)((char*)pname,
(char*)cur->child[c]->name)) {
cur = cur->child[c];
if (cur->linked < via_link) cur->linked = via_link;
break;
}
if (c == ccnt) {
/* Node not found. We need to create one. */
struct pivot_desc* n;
/* Enforce user limits. */
if (cur->child_cnt > max_children) {
problem(PROB_LIMITS, req, res, (u8*)"Child node limit exceeded", cur, 0);
return;
}
/* Create and link back to parent. */
n = ck_alloc(sizeof(struct pivot_desc));
cur->child = ck_realloc(cur->child, (cur->child_cnt + 1) *
sizeof(struct pivot_desc*));
cur->child[cur->child_cnt++] = n;
n->parent = cur;
n->type = PIVOT_PARAM;
n->linked = via_link;
n->name = ck_strdup(pname);
/* Copy the original request, in full. Remember not to fuzz
file inputs. */
n->req = req_copy(req, n, 1);
n->fuzz_par = req->par.t[pno] == PARAM_POST_F ? -1 : pno;
n->res = res_copy(res);
/* File fetcher does everything we need. */
n->req->callback = fetch_file_callback;
if (cur->state > PSTATE_IPS_CHECK) {
n->state = PSTATE_FETCH;
if (res) fetch_file_callback(n->req, n->res);
else async_request(n->req);
} else n->state = PSTATE_PENDING;
cur = n;
}
/* Ok, again, 'cur' is at the appropriate node. Make sure the
current value is on the 'try' list. */
for (c=0;c<cur->try_cnt;c++)
if (cur->try_list[c] && !(is_c_sens(cur) ? strcmp : strcasecmp)
((char*)req->par.v[pno], (char*)cur->try_list[c])) break;
/* Not found on the list - try adding. */
if (c == cur->try_cnt) {
cur->try_list = ck_realloc(cur->try_list, (cur->try_cnt + 1) *
sizeof(u8*));
cur->try_list[cur->try_cnt++] = ck_strdup(req->par.v[pno]);
if (cur->state == PSTATE_DONE)
crawl_par_trylist_init(cur);
}
/* Parameters are not hierarchical, so go back to the parent node. */
cur = cur->parent;
pno++;
}
/* Done, at last! */
}
/* Finds the host-level pivot point for global issues. */
struct pivot_desc* host_pivot(struct pivot_desc* pv) {
while (pv->parent && pv->parent->parent) pv = pv->parent;
return pv;
}
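/* In the example tree above, a PIVOT_PARAM node for 'user' climbs
   user -> login.php -> app -> server pivot and stops at the node whose
   parent is root_pivot, so host-level issues all attach to the PIVOT_SERV
   entry. */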
/* Gets case sensitivity info from the nearest DIR / SERV node. */
u8 is_c_sens(struct pivot_desc* pv) {
  while (pv->parent && pv->type != PIVOT_DIR && pv->type != PIVOT_SERV)
pv = pv->parent;
return pv->csens;
}
/* Registers a problem, if not duplicate (res, extra may be NULL): */
void problem(u32 type, struct http_request* req, struct http_response* res,
u8* extra, struct pivot_desc* pv, u8 allow_dup) {
u32 i;
if (pv->type == PIVOT_NONE) FATAL("Uninitialized pivot point");
if (type == PROB_NONE || !req) FATAL("Invalid issue data");
DEBUG("--- NEW PROBLEM - type: %u, extra: '%s' ---\n", type, extra);
/* Check for duplicates */
if (!allow_dup)
for (i=0;i<pv->issue_cnt;i++)
if (type == pv->issue[i].type && !strcmp(extra ? (char*)extra : "",
pv->issue[i].extra ? (char*)pv->issue[i].extra : "")) return;
pv->issue = ck_realloc(pv->issue, (pv->issue_cnt + 1) *
sizeof(struct issue_desc));
pv->issue[pv->issue_cnt].type = type;
pv->issue[pv->issue_cnt].extra = extra ? ck_strdup(extra) : NULL;
pv->issue[pv->issue_cnt].req = req_copy(req, pv, 1);
pv->issue[pv->issue_cnt].res = res_copy(res);
/* Mark copies of half-baked requests as done. */
if (res && res->state < STATE_OK) {
pv->issue[pv->issue_cnt].res->state = STATE_OK;
ck_free(pv->issue[pv->issue_cnt].res->payload);
pv->issue[pv->issue_cnt].res->payload =
ck_strdup((u8*)"[...truncated...]\n");
pv->issue[pv->issue_cnt].res->pay_len = 18;
}
pv->issue_cnt++;
}
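/* Typical call site (this exact call appears in maybe_add_pivot() above):

     problem(PROB_LIMITS, req, res, (u8*)"Child node limit exceeded", cur, 0);

   req is mandatory, res and extra may be NULL, and a non-zero last argument
   skips the duplicate check. */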
/* Three functions to check if the URL is permitted under current rules
(0 = no, 1 = yes): */
u8 url_allowed_host(struct http_request* req) {
u32 i;
for (i=0;i<num_allow_domains;i++) {
if (allow_domains[i][0] == '.') {
u8* pos = inl_strcasestr(req->host, allow_domains[i]);
if (pos && strlen((char*)req->host) ==
strlen((char*)allow_domains[i]) + (pos - req->host))
return 1;
} else
if (!strcasecmp((char*)req->host, (char*)allow_domains[i]))
return 1;
}
return 0;
}
u8 url_trusted_host(struct http_request* req) {
u32 i;
i = 0;
while (always_trust_domains[i]) {
if (always_trust_domains[i][0] == '.') {
u8* pos = inl_strcasestr(req->host, (u8*)always_trust_domains[i]);
if (pos && strlen((char*)req->host) ==
strlen(always_trust_domains[i]) + (pos - req->host))
return 1;
} else
if (!strcasecmp((char*)req->host, (char*)always_trust_domains[i]))
return 1;
i++;
}
for (i=0;i<num_trust_domains;i++) {
u8* pos = inl_strcasestr(req->host, trust_domains[i]);
if (pos && strlen((char*)req->host) ==
strlen((char*)trust_domains[i]) + (pos - req->host))
return 1;
}
return 0;
}
u8 url_allowed(struct http_request* req) {
u8* url = serialize_path(req, 1, 0);
u32 i;
/* Check blacklist first */
for (i=0;i<num_deny_urls;i++)
if (inl_strcasestr(url, deny_urls[i])) {
ck_free(url);
return 0;
}
/* Check whitelist next */
if (num_allow_urls) {
u8 permit = 0;
for (i=0;i<num_allow_urls;i++)
if (inl_strcasestr(url, allow_urls[i])) {
permit = 1;
break;
}
if (!permit) { ck_free(url); return 0; }
}
ck_free(url);
return url_allowed_host(req);
}
/* Compares the checksums for two responses: */
u8 same_page(struct http_sig* sig1, struct http_sig* sig2) {
u32 i, bucket_fail = 0;
s32 total_diff = 0;
u32 total_scale = 0;
if (sig1->code != sig2->code) return 0;
for (i=0;i<FP_SIZE;i++) {
s32 diff = sig1->data[i] - sig2->data[i];
u32 scale = sig1->data[i] + sig2->data[i];
if (abs(diff) > 1 + (scale * FP_T_REL / 100) ||
abs(diff) > FP_T_ABS)
if (++bucket_fail > FP_B_FAIL) return 0;
total_diff += diff;
total_scale += scale;
}
if (abs(total_diff) > 1 + (total_scale * FP_T_REL / 100))
return 0;
return 1;
}
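/* A bucket counts as a mismatch when the difference exceeds either the
   relative allowance, 1 + scale * FP_T_REL / 100, or the absolute cap
   FP_T_ABS (both defined in config.h). For instance, if FP_T_REL were 5
   (an assumed value, for illustration only): data[i] of 100 vs 112 gives
   diff = 12 and scale = 212, so the allowance is 1 + 212 * 5 / 100 = 11 and
   that bucket fails. Only when more than FP_B_FAIL buckets fail, or the
   summed total_diff exceeds its own allowance, are the two signatures
   treated as different pages. */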
/* Dumps signature data: */
void dump_signature(struct http_sig* sig) {
u32 i;
DEBUG("SIG %03d: ", sig->code);
for (i=0;i<FP_SIZE;i++) DEBUG("[%04d] ", sig->data[i]);
DEBUG("\n");
}
/* Debugs signature comparison: */
void debug_same_page(struct http_sig* sig1, struct http_sig* sig2) {
#ifdef LOG_STDERR
u32 i;
s32 total_diff = 0;
u32 total_scale = 0;
dump_signature(sig1);
dump_signature(sig2);
DEBUG(" ");
for (i=0;i<FP_SIZE;i++) {
s32 diff = sig1->data[i] - sig2->data[i];
DEBUG("[%04d] ", diff);
}
DEBUG("(diff)\n ");
for (i=0;i<FP_SIZE;i++) {
s32 diff = sig1->data[i] - sig2->data[i];
u32 scale = sig1->data[i] + sig2->data[i];
if (abs(diff) > 1 + (scale * FP_T_REL / 100) ||
abs(diff) > FP_T_ABS)
DEBUG("[FAIL] "); else DEBUG("[pass] ");
total_diff += diff;
total_scale += scale;
}
DEBUG("\n ");
for (i=0;i<FP_SIZE;i++) {
u32 scale = sig1->data[i] + sig2->data[i];
DEBUG("[%04d] ", (u32)( 1 + (scale * FP_T_REL / 100)));
}
DEBUG("(allow)\n");
DEBUG("Total diff: %d, scale %d, allow %d\n",
total_diff, total_scale, 1 + (u32)(total_scale * FP_T_REL / 100));
#endif /* LOG_STDERR */
}
/* Keyword management: */
/* Word hashing helper. */
static inline u32 hash_word(u8* str) {
register u32 ret = 0;
register u8 cur;
if (str)
while ((cur=*str)) {
ret = ~ret ^ (cur) ^
(cur << 5) ^ (~cur >> 5) ^
(cur << 10) ^ (~cur << 15) ^
(cur << 20) ^ (~cur << 25) ^
(cur << 30);
str++;
}
return ret % WORD_HASH;
}
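/* The return value is only ever used as a bucket index: hash_word((u8*)"admin")
   yields some kh with 0 <= kh < WORD_HASH, selecting keyword[kh] and
   keyword_cnt[kh]. A NULL input hashes to bucket 0. */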
/* Adds a new keyword candidate to the global "guess" list. This
list is always case-insensitive. */
void wordlist_add_guess(u8* text) {
u32 target, i, kh;
if (dont_add_words) return;
/* Check if this is a bad or known guess or keyword. */
if (!text || !text[0] || strlen((char*)text) > MAX_WORD) return;
for (i=0;i<guess_cnt;i++)
if (!strcasecmp((char*)text, (char*)guess[i])) return;
kh = hash_word(text);
for (i=0;i<keyword_cnt[kh];i++)
if (!strcasecmp((char*)text, (char*)keyword[kh][i].word)) return;
/* Initialize guess list if necessary. */
if (!guess) guess = ck_alloc(max_guesses * sizeof(u8*));
/* See if we can add a new one, or need to nuke something. */
if (guess_cnt >= max_guesses) target = R(max_guesses);
else target = guess_cnt++;
ck_free(guess[target]);
guess[target] = ck_strdup(text);
}
/* Adds a single, sanitized keyword to the list, or increases its hit count.
Keyword list is case-insensitive - first capitalization wins. */
static void wordlist_confirm_single(u8* text, u8 is_ext, u32 add_hits,
u32 total_age, u32 last_age) {
u32 kh, i;
if (!text || !text[0] || strlen((char*)text) > MAX_WORD) return;
/* Check if this is a known keyword. */
kh = hash_word(text);
for (i=0;i<keyword_cnt[kh];i++)
if (!strcasecmp((char*)text, (char*)keyword[kh][i].word)) {
/* Known! Increase hit count, and if this is now
tagged as an extension, add to extension list. */
if (!keyword[kh][i].hit_already) {
keyword[kh][i].hit_cnt += add_hits;
keyword[kh][i].hit_already = 1;
keyword[kh][i].last_age = 0;
}
if (!keyword[kh][i].is_ext && is_ext) {
keyword[kh][i].is_ext = 1;
extension = ck_realloc(extension, (extension_cnt + 1) * sizeof(u8*));
extension[extension_cnt++] = keyword[kh][i].word;
}
return;
}
/* Word not known (and i == keyword_cnt[kh]). Create a new wordlist entry. */
keyword[kh] = ck_realloc(keyword[kh], (i + 1) * sizeof(struct kw_entry));
keyword_cnt[kh]++;
keyword_total_cnt++;
keyword[kh][i].word = ck_strdup(text);
keyword[kh][i].is_ext = is_ext;
keyword[kh][i].hit_cnt = add_hits;
keyword[kh][i].total_age = total_age;
keyword[kh][i].last_age = last_age;
/* If this is a new keyword (not loaded from file), mark it as hit to
avoid inflating hit_cnt. */
if (!total_age) keyword[kh][i].hit_already = 1;
if (is_ext) {
extension = ck_realloc(extension, (extension_cnt + 1) * sizeof(u8*));
extension[extension_cnt++] = keyword[kh][i].word;
}
}
/* Adds non-sanitized keywords to the list. */
void wordlist_confirm_word(u8* text) {
u32 tlen, i, dcnt = 0, too_many_dots = 0;
s32 ppos = -1;
if (dont_add_words) return;
/* Good keywords are expected to consist of A-Za-z 0-9_-~().:!^$ only.
We expect at least one non-'.' character, at most one '.', and not more
than four digits.
If they do contain a dot at a position other than 0 or end-of-string,
and the character after . is not a digit, we also extract and store an
extension (which shouldn't be longer than 12 characters or so).
     This might misinterpret some TLDs as extensions (e.g., param=example.com),
but the user is unlikely to be scanning so many different ccTLDs for this
to affect the quality of the database. */
if (!text || !text[0]) return;
tlen = strlen((char*)text);
for (i=0;i<tlen;i++) {
if (!isalnum(text[i]) && !strchr(" _-~().:!^$", text[i])) return;
    if (isdigit(text[i])) dcnt++;
if (text[i] == '.') {
if (ppos != -1) too_many_dots = 1;
ppos = i;
}
}
  /* Too many dots? Tokenize class paths and domain names into individual
     keywords instead. */
if (too_many_dots) {
u8 *st = text, *en;
do {
en = (u8*)strchr((char*)st, '.');
if (en) *en = 0;
wordlist_confirm_word(st);
if (en) *en = '.';
st = en + 1;
} while (en);
return;
}
/* Too many digits? */
if (dcnt > 4) return;
if (ppos != -1) {
/* Period only? Too long? */
if (tlen == 1 || tlen - ppos > 12) return;
    if (ppos && ppos != tlen - 1 && !isdigit(text[ppos + 1])) {
wordlist_confirm_single(text + ppos + 1, 1, 1, 0, 0);
text[ppos] = 0;
wordlist_confirm_single(text, 0, 1, 0, 0);
text[ppos] = '.';
return;
}
}
wordlist_confirm_single(text, 0, 1, 0, 0);
}
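/* A few illustrative outcomes of the rules above (hypothetical inputs):

     "login.php"       -> keyword "login" plus extension "php"
     "com.example.Foo" -> too many dots; re-fed as "com", "example", "Foo"
     "3.14159"         -> more than four digits, discarded
     "cgi%20bin"       -> '%' is not an accepted character, discarded */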
/* Returns wordlist item at a specified offset (NULL if no more available). */
u8* wordlist_get_word(u32 offset) {
u32 cur_off = 0, kh;
for (kh=0;kh<WORD_HASH;kh++) {
if (cur_off + keyword_cnt[kh] > offset) break;
cur_off += keyword_cnt[kh];
}
if (kh == WORD_HASH) return NULL;
return keyword[kh][offset - cur_off].word;
}
/* Returns keyword candidate at a specified offset (or NULL). */
u8* wordlist_get_guess(u32 offset) {
if (offset >= guess_cnt) return NULL;
return guess[offset];
}
/* Returns extension at a specified offset (or NULL). */
u8* wordlist_get_extension(u32 offset) {
if (offset >= extension_cnt) return NULL;
return extension[offset];
}
/* Loads keywords from file. */
void load_keywords(u8* fname, u32 purge_age) {
FILE* in;
u32 hits, total_age, last_age, lines = 0;
u8 type;
s32 fields;
u8 kword[MAX_WORD + 1];
char fmt[32];
kword[MAX_WORD] = 0;
in = fopen((char*)fname, "r");
  if (!in) PFATAL("Unable to open wordlist '%s'", fname);
sprintf(fmt, "%%c %%u %%u %%u %%%u[^\x01-\x1f]", MAX_WORD);
while ((fields = fscanf(in, fmt, &type, &hits, &total_age, &last_age, kword))
== 5) {
if (!purge_age || last_age < purge_age)
wordlist_confirm_single(kword, (type == 'e'), hits,
total_age + 1, last_age + 1);
lines++;
fgetc(in); /* sink \n */
}
if (fields != -1 && fields != 5)
FATAL("Wordlist '%s': syntax error in line %u.\n", fname, lines + 1);
if (!lines)
WARN("Wordlist '%s' contained no valid entries.", fname);
keyword_orig_cnt = keyword_total_cnt;
fclose(in);
}
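/* On-disk wordlist format, as parsed above and written back by
   save_keywords() below - one entry per line (the numbers are made up):

     w 32 4 0 index
     e 11 4 1 php

   i.e. type ('w' = word, 'e' = extension), hit count, total age, age since
   the last hit, and the keyword itself. */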
/* qsort() callback for sorting keywords in save_keywords(). */
static int keyword_sorter(const void* word1, const void* word2) {
if (((struct kw_entry*)word1)->hit_cnt < ((struct kw_entry*)word2)->hit_cnt)
return 1;
else if (((struct kw_entry*)word1)->hit_cnt ==
((struct kw_entry*)word2)->hit_cnt)
return 0;
else return -1;
}
/* Saves all keywords to a file. */
void save_keywords(u8* fname) {
struct stat st;
FILE* out;
s32 fd;
u32 i, kh;
u8* old;
#ifndef O_NOFOLLOW
#define O_NOFOLLOW 0
#endif /* !O_NOFOLLOW */
if (stat((char*)fname, &st) || !S_ISREG(st.st_mode)) return;
/* First, sort the list. */
for (kh=0;kh<WORD_HASH;kh++)
qsort(keyword[kh], keyword_cnt[kh], sizeof(struct kw_entry), keyword_sorter);
old = ck_alloc(strlen((char*)fname) + 5);
sprintf((char*)old, "%s.old", fname);
/* Ignore errors for these two. */
unlink((char*)old);
rename((char*)fname, (char*)old);
ck_free(old);
  fd = open((char*)fname, O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW, 0644);
  if (fd < 0 || !(out = fdopen(fd, "w"))) {
    WARN("Unable to save new wordlist to '%s'", fname);
    if (fd >= 0) close(fd);
    return;
  }
for (kh=0;kh<WORD_HASH;kh++)
for (i=0;i<keyword_cnt[kh];i++)
fprintf(out,"%c %u %u %u %s\n", keyword[kh][i].is_ext ? 'e' : 'w',
keyword[kh][i].hit_cnt, keyword[kh][i].total_age,
keyword[kh][i].last_age, keyword[kh][i].word);
SAY(cLGN "[+] " cNOR "Wordlist '%s' updated (%u new words added).\n",
fname, keyword_total_cnt - keyword_orig_cnt);
  fclose(out);
}
/* Displays pretty pivot statistics as we go. */
static u32 pivot_pending,
pivot_init,
pivot_attack,
pivot_bf,
pivot_done;
static u32 pivot_serv,
pivot_dir,
pivot_file,
pivot_pinfo,
pivot_param,
pivot_value,
pivot_missing,
pivot_unknown;
static u32 issue_cnt[6],
pivot_cnt;
static void pv_stat_crawl(struct pivot_desc* pv) {
u32 i;
pivot_cnt++;
switch (pv->type) {
case PIVOT_SERV: pivot_serv++; /* Fall through */
case PIVOT_DIR: pivot_dir++; break;
case PIVOT_FILE: pivot_file++; break;
case PIVOT_PATHINFO: pivot_pinfo++; break;
case PIVOT_UNKNOWN: pivot_unknown++; break;
case PIVOT_PARAM: pivot_param++; break;
case PIVOT_VALUE: pivot_value++; break;
}
if (pv->missing) pivot_missing++;
switch (pv->state) {
case PSTATE_PENDING: pivot_pending++; break;
case PSTATE_FETCH ... PSTATE_IPS_CHECK: pivot_init++; break;
case PSTATE_CHILD_INJECT:
case PSTATE_PAR_INJECT: pivot_attack++; break;
case PSTATE_DONE: pivot_done++; break;
default: pivot_bf++;
}
for (i=0;i<pv->issue_cnt;i++)
issue_cnt[PSEV(pv->issue[i].type)]++;
for (i=0;i<pv->child_cnt;i++)
pv_stat_crawl(pv->child[i]);
}
void database_stats() {
pivot_pending = pivot_init = pivot_attack = pivot_bf = pivot_pinfo =
pivot_done = pivot_serv = pivot_dir = pivot_file = pivot_param =
pivot_value = pivot_missing = pivot_unknown = pivot_cnt = 0;
memset(issue_cnt, 0, sizeof(issue_cnt));
pv_stat_crawl(&root_pivot);
SAY("Database statistics\n"
"-------------------\n\n"
cGRA " Pivots : " cNOR "%u total, %u done (%.02f%%) \n"
cGRA " In progress : " cNOR "%u pending, %u init, %u attacks, "
"%u dict \n"
cGRA " Missing nodes : " cNOR "%u spotted\n"
cGRA " Node types : " cNOR "%u serv, %u dir, %u file, %u pinfo, "
"%u unkn, %u par, %u val\n"
cGRA " Issues found : " cNOR "%u info, %u warn, %u low, %u medium, "
"%u high impact\n"
cGRA " Dict size : " cNOR "%u words (%u new), %u extensions, "
"%u candidates\n",
pivot_cnt, pivot_done, pivot_cnt ? ((100.0 * pivot_done) / (pivot_cnt))
: 0, pivot_pending, pivot_init, pivot_attack, pivot_bf, pivot_missing,
pivot_serv, pivot_dir, pivot_file, pivot_pinfo, pivot_unknown,
pivot_param, pivot_value, issue_cnt[1], issue_cnt[2], issue_cnt[3],
issue_cnt[4], issue_cnt[5], keyword_total_cnt, keyword_total_cnt -
keyword_orig_cnt, extension_cnt, guess_cnt);
}
/* Dumps pivot database, for debugging purposes. */
void dump_pivots(struct pivot_desc* cur, u8 nest) {
u8* indent = ck_alloc(nest + 1);
u8* url;
u32 i;
if (!cur) cur = &root_pivot;
memset(indent, ' ', nest);
SAY(cBRI "\n%s== Pivot " cLGN "%s" cBRI " [%d] ==\n",
indent, cur->name, cur->dupe);
SAY(cGRA "%sType : " cNOR, indent);
switch (cur->type) {
case PIVOT_NONE: SAY(cLRD "PIVOT_NONE (bad!)\n" cNOR); break;
case PIVOT_ROOT: SAY("PIVOT_ROOT\n"); break;
case PIVOT_SERV: SAY("PIVOT_SERV\n"); break;
case PIVOT_DIR: SAY("PIVOT_DIR\n"); break;
case PIVOT_FILE: SAY("PIVOT_FILE\n"); break;
case PIVOT_PATHINFO: SAY("PIVOT_PATHINFO\n"); break;
case PIVOT_VALUE: SAY("PIVOT_VALUE\n"); break;
case PIVOT_UNKNOWN: SAY("PIVOT_UNKNOWN\n"); break;
case PIVOT_PARAM: SAY("PIVOT_PARAM\n"); break;
default: SAY(cLRD "<UNKNOWN %u> (bad!)\n" cNOR, cur->type);
}
SAY(cGRA "%sState : " cNOR, indent);
switch (cur->state) {
case PSTATE_NONE: SAY(cLRD "PSTATE_NONE (bad!)\n" cNOR); break;
case PSTATE_PENDING: SAY("PSTATE_PENDING\n"); break;
case PSTATE_FETCH: SAY("PSTATE_FETCH\n"); break;
case PSTATE_TYPE_CHECK: SAY("PSTATE_TYPE_CHECK\n"); break;
case PSTATE_404_CHECK: SAY("PSTATE_404_CHECK\n"); break;
case PSTATE_IPS_CHECK: SAY("PSTATE_IPS_CHECK\n"); break;
case PSTATE_CHILD_INJECT: SAY("PSTATE_CHILD_INJECT\n"); break;
case PSTATE_CHILD_DICT: SAY("PSTATE_CHILD_DICT\n"); break;
case PSTATE_PAR_CHECK: SAY("PSTATE_PAR_CHECK\n"); break;
case PSTATE_PAR_INJECT: SAY("PSTATE_PAR_INJECT\n"); break;
case PSTATE_PAR_NUMBER: SAY("PSTATE_PAR_NUMBER\n"); break;
case PSTATE_PAR_DICT: SAY("PSTATE_PAR_DICT\n"); break;
case PSTATE_PAR_TRYLIST: SAY("PSTATE_PAR_TRYLIST\n"); break;
case PSTATE_DONE: SAY("PSTATE_DONE\n"); break;
default: SAY(cLRD "<UNKNOWN %u> (bad!)\n" cNOR,
cur->state);
}
if (cur->missing) {
if (cur->linked == 2)
SAY(cGRA "%sMissing : " cMGN "YES\n" cNOR, indent);
else
SAY(cGRA "%sMissing : " cLBL "YES (followed a dodgy link)\n" cNOR,
indent);
}
SAY(cGRA "%sFlags : " cNOR "linked %u, case %u/%u, fuzz_par %d, ips %u, "
"sigs %u, reqs %u\n", indent, cur->linked, cur->csens, cur->c_checked,
cur->fuzz_par, cur->uses_ips, cur->r404_cnt, cur->pending);
if (cur->req) {
url = serialize_path(cur->req, 1, 0);
SAY(cGRA "%sTarget : " cNOR "%s (" cYEL "%d" cNOR ")\n", indent, url,
cur->res ? cur->res->code : 0);
ck_free(url);
if (cur->res)
SAY(cGRA "%sMIME : " cNOR "%s -> %s ["
"%s:%s]\n", indent, cur->res->header_mime ? cur->res->header_mime :
(u8*)"-", cur->res->sniffed_mime ? cur->res->sniffed_mime : (u8*)"-",
cur->res->header_charset ? cur->res->header_charset : (u8*)"-",
cur->res->meta_charset ? cur->res->meta_charset : (u8*)"-");
}
if (cur->try_cnt) {
SAY(cGRA "%sTry : " cNOR, indent);
for (i=0;i<cur->try_cnt;i++)
SAY("%s%s", cur->try_list[i], (i == cur->try_cnt - 1) ? "" : ", ");
SAY("\n");
}
/* Dump issues. */
for (i=0;i<cur->issue_cnt;i++) {
if (cur->issue[i].req) url = serialize_path(cur->issue[i].req, 0, 0);
else url = ck_strdup((u8*)"[none]");
SAY(cGRA "%s-> Issue : " cNOR "type %d, extra '%s', URL: " cLGN "%s"
cNOR " (" cYEL "%u" cNOR ")\n", indent, cur->issue[i].type,
cur->issue[i].extra, url, cur->issue[i].res ? cur->issue[i].res->code
: 0);
ck_free(url);
}
ck_free(indent);
for (i=0;i<cur->child_cnt;i++)
dump_pivots(cur->child[i], nest + 1);
}
/* Cleans up pivot structure for memory debugging. */
static void dealloc_pivots(struct pivot_desc* cur) {
u32 i;
if (!cur) cur = &root_pivot;
if (cur->req) destroy_request(cur->req);
if (cur->res) destroy_response(cur->res);
ck_free(cur->name);
if (cur->try_cnt) {
for (i=0;i<cur->try_cnt;i++) ck_free(cur->try_list[i]);
ck_free(cur->try_list);
}
if (cur->issue) {
for (i=0;i<cur->issue_cnt;i++) {
ck_free(cur->issue[i].extra);
if (cur->issue[i].req) destroy_request(cur->issue[i].req);
if (cur->issue[i].res) destroy_response(cur->issue[i].res);
}
ck_free(cur->issue);
}
for (i=0;i<cur->child_cnt;i++)
dealloc_pivots(cur->child[i]);
ck_free(cur->child);
if (cur != &root_pivot) ck_free(cur);
}
/* Creates a new XSS location tag. */
u8* new_xss_tag(u8* prefix) {
static u8* ret;
if (ret) free(ret);
ret = __DFL_ck_alloc((prefix ? strlen((char*)prefix) : 0) + 32);
if (!scan_id) scan_id = R(999999) + 1;
sprintf((char*)ret, "%s>\">'>'\"<sfi%06uv%06u>",
prefix ? prefix : (u8*)"", cur_xss_id, scan_id);
return ret;
}
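/* With a NULL prefix, cur_xss_id = 3 and scan_id = 123456 (both values
   purely illustrative), the generated tag would be:

     >">'>'"<sfi000003v123456>

   i.e. enough metacharacters to break out of common HTML contexts, plus the
   two IDs needed to find the originating request again via
   get_xss_request(). */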
/* Registers last XSS tag along with a completed http_request */
void register_xss_tag(struct http_request* req) {
xss_req = ck_realloc(xss_req, (cur_xss_id + 1) *
(sizeof(struct http_request*)));
xss_req[cur_xss_id] = req_copy(req, 0, 1);
cur_xss_id++;
}
/* Gets the request that submitted the tag in the first place */
struct http_request* get_xss_request(u32 xid, u32 sid) {
if (sid != scan_id || xid >= cur_xss_id) return NULL;
return xss_req[xid];
}
/* Cleans up other database entries, for memory profiling purposes. */
void destroy_database() {
u32 i, kh;
dealloc_pivots(0);
ck_free(deny_urls);
ck_free(deny_strings);
ck_free(allow_urls);
ck_free(allow_domains);
ck_free(trust_domains);
for (kh=0;kh<WORD_HASH;kh++) {
for (i=0;i<keyword_cnt[kh];i++) ck_free(keyword[kh][i].word);
ck_free(keyword[kh]);
}
/* Extensions just referenced keyword[][].word entries. */
ck_free(extension);
for (i=0;i<guess_cnt;i++) ck_free(guess[i]);
ck_free(guess);
for (i=0;i<cur_xss_id;i++) destroy_request(xss_req[i]);
ck_free(xss_req);
}