Version 2.00b: Many improvements

- Minor bug fix to path parsing to avoid problems with /.$foo/,
- Improved PHP error detection (courtesy of Niels Heinen),
- Improved dictionary logic (courtesy of Niels Heinen) and new documentation of the same,
- Improved support for file.ext keywords in the dictionary,
- Fixed missing content_checks() in unknown_check_callback() (courtesy of Niels Heinen),
- Fixed an oversight in dictionary case sensitivity,
- Improved pivots.txt data,
- Support for supplementary read-only dictionaries (-W +dict),
- Change to directory detection to work around a certain sneaky server behavior.
- TODO: Revise dictionaries!!!
Steve Pinkham 2011-08-09 16:04:52 -04:00
parent b199943c9d
commit 6b2d33edca
17 changed files with 630 additions and 326 deletions

View File

@ -1,3 +1,29 @@
Version 2.00b:
--------------
- Minor bug fix to path parsing to avoid problems with /.$foo/,
- Improved PHP error detection (courtesy of Niels Heinen),
- Improved dictionary logic (courtesy of Niels Heinen) and new
documentation of the same,
- Improved support for file.ext keywords in the dictionary,
- Fixed missing content_checks() in unknown_check_callback()
(courtesy of Niels Heinen),
- Fixed an oversight in dictionary case sensitivity,
- Improved pivots.txt data,
- Support for supplementary read-only dictionaries (-W +dict),
- Change to directory detection to work around a certain sneaky
server behavior.
- TODO: Revise dictionaries!!!
Version 1.94b:
--------------
@ -9,7 +35,7 @@ Version 1.94b:
Version 1.93b:
--------------
- Major fix to URL XSS detection logic.
- Major fix to URL XSS detection logic (courtesy of Niels Heinen).
Version 1.92b:
--------------

View File

@ -20,7 +20,7 @@
#
PROGNAME = skipfish
VERSION = 1.94b
VERSION = 2.00b
OBJFILES = http_client.c database.c crawler.c analysis.c report.c
INCFILES = alloc-inl.h string-inl.h debug.h types.h http_client.h \

README
View File

@ -429,7 +429,7 @@ $ ./skipfish -MEU -C "AuthCookie=value" -X /logout.aspx -o output_dir \
http://www.example.com/
Five-connection crawl, but no brute-force; pretending to be MSIE and
trusting example.com content):
trusting example.com content:
$ ./skipfish -m 5 -LV -W /dev/null -o output_dir -b ie -B example.com \
http://www.example.com/
@ -506,7 +506,7 @@ know:
a #define directive in config.h. Adding support for HTTPS proxying is
more complicated, and still in the works.
* Scan resume option.
* Scan resume option, better runtime info.
* Option to limit document sampling or save samples directly to disk.
@ -514,7 +514,11 @@ know:
* Config file support.
* A database for banner / version checks?
* Scheduling and management web UI.
* QPS throttling and maximum scan time limit.
* A database for banner / version checks or other configurable rules?
-------------------------------------
9. Oy! Something went horribly wrong!

View File

@ -930,7 +930,7 @@ add_link:
i = 0;
while ((ext = wordlist_get_extension(i++))) {
while ((ext = wordlist_get_extension(i++, 0))) {
u32 ext_len = strlen((char*)ext);
if (clean_len > ext_len + 2 &&
@ -2280,13 +2280,34 @@ static void check_for_stuff(struct http_request* req,
return;
}
if (strstr((char*)res->payload, "<b>Fatal error</b>:") ||
strstr((char*)res->payload, "<b>Parse error</b>:") ||
strstr((char*)res->payload, "</b> on line <b>")) {
problem(PROB_ERROR_POI, req, res, (u8*)"PHP error", req->pivot, 0);
if ((tmp = (u8*)strstr((char*)res->payload, " on line "))) {
u32 off = 512;
while (tmp - 1 > res->payload && !strchr("\r\n", tmp[-1])
&& off--) tmp--;
if (off && (!prefix(tmp, "Warning: ") || !prefix(tmp, "Notice: ") ||
!prefix(tmp, "Fatal error: ") || !prefix(tmp, "Parse error: ") ||
!prefix(tmp, "Deprecated: ") ||
!prefix(tmp, "Strict Standards: ") ||
!prefix(tmp, "Catchable fatal error: "))) {
problem(PROB_ERROR_POI, req, res, (u8*)"PHP error (text)", req->pivot, 0);
return;
}
if (off && !prefix(tmp, "<b>") && (!prefix(tmp + 3, "Warning</b>: ") ||
!prefix(tmp + 3, "Notice</b>: ") ||
!prefix(tmp + 3, "Fatal error</b>: ") ||
!prefix(tmp + 3, "Parse error</b>: ") ||
!prefix(tmp + 3, "Deprecated</b>: ") ||
!prefix(tmp + 3, "Strict Standards</b>: ") ||
!prefix(tmp + 3, "Catchable fatal error</b>: "))) {
problem(PROB_ERROR_POI, req, res, (u8*)"PHP error (HTML)", req->pivot, 0);
return;
}
}
if (strstr((char*)res->payload, "<b>Warning</b>: MySQL: ") ||
strstr((char*)res->payload, "java.sql.SQLException") ||
strstr((char*)res->payload, "You have an error in your SQL syntax; ")) {
@ -2326,12 +2347,26 @@ static void check_for_stuff(struct http_request* req,
if (strstr((char*)sniffbuf, "<cross-domain-policy>")) {
problem(PROB_FILE_POI, req, res, (u8*)
"Flash cross-domain policy", req->pivot, 0);
/*
if (strstr((char*)res->payload, "domain=\"*\""))
problem(PROB_CROSS_WILD, req, res, (u8*)
"Cross-domain policy with wildcard rules", req->pivot, 0);
*/
return;
}
if (strstr((char*)sniffbuf, "<access-policy>")) {
problem(PROB_FILE_POI, req, res, (u8*)"Silverlight cross-domain policy",
req->pivot, 0);
/*
if (strstr((char*)res->payload, "uri=\"*\""))
problem(PROB_CROSS_WILD, req, res, (u8*)
"Cross-domain policy with wildcard rules", req->pivot, 0);
*/
return;
}

View File

@ -29,7 +29,7 @@
/* Define this to enable experimental HTTP proxy support, through the -J
option in the command line. This mode will not work as expected for
HTTPS requests at this point. */
HTTPS requests at this time - sorry. */
// #define PROXY_SUPPORT 1

View File

@ -354,7 +354,7 @@ static void secondary_ext_init(struct pivot_desc* pv, struct http_request* req,
i = 0;
while ((ex = wordlist_get_extension(i))) {
while ((ex = wordlist_get_extension(i, 0))) {
u8* tmp = ck_alloc(strlen((char*)base_name) + strlen((char*)ex) + 2);
u32 c;
@ -382,6 +382,7 @@ static void secondary_ext_init(struct pivot_desc* pv, struct http_request* req,
n->par.v[tpar] = tmp;
n->user_val = 1;
n->with_ext = 1;
memcpy(&n->same_sig, &res->sig, sizeof(struct http_sig));
@ -1814,6 +1815,7 @@ static void crawl_par_dict_init(struct pivot_desc* pv) {
struct http_request* n;
u8 *kw, *ex;
u32 i, c;
u8 specific;
/* Too many requests still pending, or already done? */
@ -1832,7 +1834,7 @@ restart_dict:
i = 0;
kw = (pv->pdic_guess ? wordlist_get_guess : wordlist_get_word)
(pv->pdic_cur_key);
(pv->pdic_cur_key, &specific);
if (!kw) {
@ -1878,10 +1880,11 @@ restart_dict:
/* Schedule probes for all extensions for the current word, but
only if the original parameter contained '.' somewhere,
and only if string is not on the try list. */
and only if string is not on the try list. Special handling
for specific keywords with '.' inside. */
if (strchr((char*)TPAR(pv->req), '.'))
while (!no_fuzz_ext && (ex = wordlist_get_extension(i))) {
if (!no_fuzz_ext && strchr((char*)TPAR(pv->req), '.'))
while ((ex = wordlist_get_extension(i, specific))) {
u8* tmp = ck_alloc(strlen((char*)kw) + strlen((char*)ex) + 2);
@ -1901,6 +1904,7 @@ restart_dict:
ck_free(TPAR(n));
TPAR(n) = tmp;
n->callback = par_dict_callback;
n->with_ext = 1;
pv->pdic_pending++;
in_dict_init = 1;
async_request(n);
@ -2333,6 +2337,7 @@ static u8 dir_404_callback(struct http_request* req,
}
memcpy(&req->pivot->r404[i], &res->sig, sizeof(struct http_sig));
req->pivot->r404_cnt++;
/* Is this a new signature not seen on parent? Notify if so,
@ -2379,7 +2384,7 @@ schedule_next:
/* Aaand schedule all the remaining probes. */
while ((nk = wordlist_get_extension(cur_ext++))) {
while ((nk = wordlist_get_extension(cur_ext++, 0))) {
u8* tmp = ck_alloc(strlen(BOGUS_FILE) + strlen((char*)nk) + 2);
n = req_copy(RPREQ(req), req->pivot, 1);
@ -2388,6 +2393,7 @@ schedule_next:
replace_slash(n, tmp);
ck_free(tmp);
n->callback = dir_404_callback;
n->with_ext = 1;
n->user_val = 1;
/* r404_pending is at least 1 to begin with, so this is safe
@ -2655,6 +2661,7 @@ static void crawl_dir_dict_init(struct pivot_desc* pv) {
struct http_request* n;
u8 *kw, *ex;
u32 i, c;
u8 specific;
/* Too many requests still pending, or already moved on to
parametric tests? */
@ -2682,7 +2689,8 @@ static void crawl_dir_dict_init(struct pivot_desc* pv) {
restart_dict:
kw = (pv->guess ? wordlist_get_guess : wordlist_get_word)(pv->cur_key);
kw = (pv->guess ? wordlist_get_guess : wordlist_get_word)
(pv->cur_key, &specific);
if (!kw) {
@ -2739,11 +2747,13 @@ restart_dict:
}
/* Schedule probes for all extensions for the current word,
likewise. */
likewise. Make an exception for specific keywords that
already contain a period. */
i = 0;
while (!no_fuzz_ext && (ex = wordlist_get_extension(i))) {
if (!no_fuzz_ext)
while ((ex = wordlist_get_extension(i, specific))) {
u8* tmp = ck_alloc(strlen((char*)kw) + strlen((char*)ex) + 2);
@ -2761,6 +2771,7 @@ restart_dict:
n = req_copy(pv->req, pv, 1);
replace_slash(n, tmp);
n->callback = dir_dict_callback;
n->with_ext = 1;
pv->pending++;
in_dict_init = 1;
async_request(n);
@ -2917,6 +2928,7 @@ u8 fetch_unknown_callback(struct http_request* req, struct http_response* res) {
n = req_copy(req, req->pivot, 1);
set_value(PARAM_PATH, NULL, (u8*)"", -1, &n->par);
n->callback = unknown_check_callback;
n->with_ext = req->with_ext;
async_request(n);
/* This is the initial callback, keep the response. */
@ -2974,13 +2986,34 @@ static u8 unknown_check_callback(struct http_request* req,
}
if (par)
if (par) {
for (i=0;i<par->r404_cnt;i++)
if (same_page(&res->sig, &par->r404[i])) break;
/* Do not use extension-originating signatures for settling non-extension
cases. */
if (i && !req->with_ext) i = par->r404_cnt;
}
if ((!par && res->code == 404) || (par && i != par->r404_cnt) ||
(RPRES(req)->code < 300 && res->code >= 300 && RPRES(req)->pay_len)) {
DEBUG("REASON X\n");
if (par) DEBUG("same_404 = %d\n", i != par->r404_cnt);
DEBUG("par = %p\n", par);
if (par) DEBUG("par->r404_cnt = %d\n", par->r404_cnt);
DEBUG("res->code = %d\n", res->code);
DEBUG("parent code = %d\n", RPRES(req)->code);
DEBUG("parent len = %d\n", RPRES(req)->pay_len);
// (!par && res->code == 404) || - NO
// (par && i != par->r404_cnt) || - YES
// (RPRES(req)->code < 300 && res->code >= 300 && RPRES(req)->pay_len))
req->pivot->type = PIVOT_FILE;
} else {
@ -2999,6 +3032,11 @@ assume_dir:
req->pivot->type = PIVOT_DIR;
/* Perform content checks before discarding the old payload. */
if (!same_page(&RPRES(req)->sig, &res->sig))
content_checks(RPREQ(req), RPRES(req));
/* Replace original request, response with new data. */
destroy_request(RPREQ(req));

View File

@ -57,11 +57,17 @@ u32 max_depth = MAX_DEPTH,
u8 dont_add_words; /* No auto dictionary building */
#define KW_SPECIFIC 0
#define KW_GENERIC 1
#define KW_GEN_AUTO 2
struct kw_entry {
u8* word; /* Keyword itself */
u32 hit_cnt; /* Number of confirmed sightings */
u8 is_ext; /* Is an extension? */
u8 hit_already; /* Had its hit count bumped up? */
u8 read_only; /* Read-only dictionary? */
u8 class; /* KW_* */
u32 total_age; /* Total age (in scan cycles) */
u32 last_age; /* Age since last hit */
};
@ -71,11 +77,19 @@ static struct kw_entry*
static u32 keyword_cnt[WORD_HASH]; /* Per-bucket keyword counts */
static u8 **extension, /* Extension list */
**guess; /* Keyword candidate list */
struct ext_entry {
u32 bucket;
u32 index;
};
static struct ext_entry *extension, /* Extension list */
*sp_extension;
static u8 **guess; /* Keyword candidate list */
u32 guess_cnt, /* Number of keyword candidates */
extension_cnt, /* Number of extensions */
sp_extension_cnt, /* Number of specific extensions */
keyword_total_cnt, /* Current keyword count */
keyword_orig_cnt; /* At-boot keyword count */
@ -818,7 +832,7 @@ static inline u32 hash_word(u8* str) {
/* Adds a new keyword candidate to the global "guess" list. This
list is always case-insensitive. */
list is case-sensitive. */
void wordlist_add_guess(u8* text) {
u32 target, i, kh;
@ -830,7 +844,7 @@ void wordlist_add_guess(u8* text) {
if (!text || !text[0] || strlen((char*)text) > MAX_WORD) return;
for (i=0;i<guess_cnt;i++)
if (!strcasecmp((char*)text, (char*)guess[i])) return;
if (!strcmp((char*)text, (char*)guess[i])) return;
kh = hash_word(text);
@ -853,10 +867,10 @@ void wordlist_add_guess(u8* text) {
/* Adds a single, sanitized keyword to the list, or increases its hit count.
Keyword list is case-insensitive - first capitalization wins. */
Keyword list is case-sensitive. */
static void wordlist_confirm_single(u8* text, u8 is_ext, u32 add_hits,
u32 total_age, u32 last_age) {
static void wordlist_confirm_single(u8* text, u8 is_ext, u8 class, u8 read_only,
u32 add_hits, u32 total_age, u32 last_age) {
u32 kh, i;
if (!text || !text[0] || strlen((char*)text) > MAX_WORD) return;
@ -866,7 +880,7 @@ static void wordlist_confirm_single(u8* text, u8 is_ext, u32 add_hits,
kh = hash_word(text);
for (i=0;i<keyword_cnt[kh];i++)
if (!strcasecmp((char*)text, (char*)keyword[kh][i].word)) {
if (!strcmp((char*)text, (char*)keyword[kh][i].word)) {
/* Known! Increase hit count, and if this is now
tagged as an extension, add to extension list. */
@ -875,13 +889,19 @@ static void wordlist_confirm_single(u8* text, u8 is_ext, u32 add_hits,
keyword[kh][i].hit_cnt += add_hits;
keyword[kh][i].hit_already = 1;
keyword[kh][i].last_age = 0;
if (!keyword[kh][i].read_only && read_only)
keyword[kh][i].read_only = 1;
}
if (!keyword[kh][i].is_ext && is_ext) {
keyword[kh][i].is_ext = 1;
extension = ck_realloc(extension, (extension_cnt + 1) * sizeof(u8*));
extension[extension_cnt++] = keyword[kh][i].word;
extension = ck_realloc(extension, (extension_cnt + 1) *
sizeof(struct ext_entry));
extension[extension_cnt].bucket = kh;
extension[extension_cnt++].index = i;
}
return;
@ -896,6 +916,8 @@ static void wordlist_confirm_single(u8* text, u8 is_ext, u32 add_hits,
keyword[kh][i].word = ck_strdup(text);
keyword[kh][i].is_ext = is_ext;
keyword[kh][i].class = class;
keyword[kh][i].read_only = read_only;
keyword[kh][i].hit_cnt = add_hits;
keyword[kh][i].total_age = total_age;
keyword[kh][i].last_age = last_age;
@ -906,8 +928,21 @@ static void wordlist_confirm_single(u8* text, u8 is_ext, u32 add_hits,
if (!total_age) keyword[kh][i].hit_already = 1;
if (is_ext) {
extension = ck_realloc(extension, (extension_cnt + 1) * sizeof(u8*));
extension[extension_cnt++] = keyword[kh][i].word;
extension = ck_realloc(extension, (extension_cnt + 1) *
sizeof(struct ext_entry));
extension[extension_cnt].bucket = kh;
extension[extension_cnt++].index = i;
if (class == KW_SPECIFIC) {
sp_extension = ck_realloc(sp_extension, (sp_extension_cnt + 1) *
sizeof(struct ext_entry));
sp_extension[sp_extension_cnt].bucket = kh;
sp_extension[sp_extension_cnt++].index = i;
}
}
}
@ -946,6 +981,18 @@ void wordlist_confirm_word(u8* text) {
}
}
/* If the format is foo.bar, check if the entire string is a known keyword.
If yes, don't try to look up and add individual components. */
if (ppos != -1) {
u32 kh = hash_word(text);
for (i=0;i<keyword_cnt[kh];i++)
if (!strcasecmp((char*)text, (char*)keyword[kh][i].word)) return;
}
/* Too many dots? Tokenize class paths and domains as individual keywords,
still. */
@ -972,22 +1019,22 @@ void wordlist_confirm_word(u8* text) {
if (tlen == 1 || tlen - ppos > 12) return;
if (ppos && ppos != tlen - 1 && !isdigit(text[ppos] + 1)) {
wordlist_confirm_single(text + ppos + 1, 1, 1, 0, 0);
wordlist_confirm_single(text + ppos + 1, 1, KW_GEN_AUTO, 0, 1, 0, 0);
text[ppos] = 0;
wordlist_confirm_single(text, 0, 1, 0, 0);
wordlist_confirm_single(text, 0, KW_GEN_AUTO, 0, 1, 0, 0);
text[ppos] = '.';
return;
}
}
wordlist_confirm_single(text, 0, 1, 0, 0);
wordlist_confirm_single(text, 0, KW_GEN_AUTO, 0, 1, 0, 0);
}
/* Returns wordlist item at a specified offset (NULL if no more available). */
u8* wordlist_get_word(u32 offset) {
u8* wordlist_get_word(u32 offset, u8* specific) {
u32 cur_off = 0, kh;
for (kh=0;kh<WORD_HASH;kh++) {
@ -997,32 +1044,42 @@ u8* wordlist_get_word(u32 offset) {
if (kh == WORD_HASH) return NULL;
*specific = (keyword[kh][offset - cur_off].is_ext == 0 &&
keyword[kh][offset - cur_off].class == KW_SPECIFIC);
return keyword[kh][offset - cur_off].word;
}
/* Returns keyword candidate at a specified offset (or NULL). */
u8* wordlist_get_guess(u32 offset) {
u8* wordlist_get_guess(u32 offset, u8* specific) {
if (offset >= guess_cnt) return NULL;
*specific = 0;
return guess[offset];
}
/* Returns extension at a specified offset (or NULL). */
u8* wordlist_get_extension(u32 offset) {
u8* wordlist_get_extension(u32 offset, u8 specific) {
if (!specific) {
if (offset >= extension_cnt) return NULL;
return extension[offset];
return keyword[extension[offset].bucket][extension[offset].index].word;
}
if (offset >= sp_extension_cnt) return NULL;
return keyword[sp_extension[offset].bucket][sp_extension[offset].index].word;
}
/* Loads keywords from file. */
void load_keywords(u8* fname, u32 purge_age) {
void load_keywords(u8* fname, u8 read_only, u32 purge_age) {
FILE* in;
u32 hits, total_age, last_age, lines = 0;
u8 type;
u8 type[3];
s32 fields;
u8 kword[MAX_WORD + 1];
char fmt[32];
@ -1036,19 +1093,28 @@ void load_keywords(u8* fname, u32 purge_age) {
return;
}
sprintf(fmt, "%%c %%u %%u %%u %%%u[^\x01-\x1f]", MAX_WORD);
sprintf(fmt, "%%2s %%u %%u %%u %%%u[^\x01-\x1f]", MAX_WORD);
while ((fields = fscanf(in, fmt, &type, &hits, &total_age, &last_age, kword))
while ((fields = fscanf(in, fmt, type, &hits, &total_age, &last_age, kword))
== 5) {
u8 class = KW_GEN_AUTO;
if (type[0] != 'e' && type[0] != 'w')
FATAL("Wordlist '%s': bad keyword type in line %u.\n", fname, lines + 1);
if (type[1] == 's') class = KW_SPECIFIC; else
if (type[1] == 'g') class = KW_GENERIC;
if (!purge_age || last_age < purge_age)
wordlist_confirm_single(kword, (type == 'e'), hits,
wordlist_confirm_single(kword, (type[0] == 'e'), class, read_only, hits,
total_age + 1, last_age + 1);
lines++;
fgetc(in); /* sink \n */
}
if (fields != -1 && fields != 5)
FATAL("Wordlist '%s': syntax error in line %u.\n", fname, lines + 1);
FATAL("Wordlist '%s': syntax error in line %u.\n", fname, lines);
if (!lines)
WARN("Wordlist '%s' contained no valid entries.", fname);
@ -1110,11 +1176,21 @@ void save_keywords(u8* fname) {
}
for (kh=0;kh<WORD_HASH;kh++)
for (i=0;i<keyword_cnt[kh];i++)
fprintf(out,"%c %u %u %u %s\n", keyword[kh][i].is_ext ? 'e' : 'w',
for (i=0;i<keyword_cnt[kh];i++) {
u8 class = '?';
if (keyword[kh][i].read_only) continue;
if (keyword[kh][i].class == KW_SPECIFIC) class = 's'; else
if (keyword[kh][i].class == KW_GENERIC) class = 'g';
fprintf(out,"%c%c %u %u %u %s\n", keyword[kh][i].is_ext ? 'e' : 'w',
class,
keyword[kh][i].hit_cnt, keyword[kh][i].total_age,
keyword[kh][i].last_age, keyword[kh][i].word);
}
SAY(cLGN "[+] " cNOR "Wordlist '%s' updated (%u new words added).\n",
fname, keyword_total_cnt - keyword_orig_cnt);
@ -1409,8 +1485,9 @@ void destroy_database() {
ck_free(keyword[kh]);
}
/* Extensions just referenced keyword[][].word entries. */
/* Extensions just referenced keyword[][] entries. */
ck_free(extension);
ck_free(sp_extension);
for (i=0;i<guess_cnt;i++) ck_free(guess[i]);
ck_free(guess);

View File

@ -375,19 +375,19 @@ void wordlist_confirm_word(u8* text);
/* Returns wordlist item at a specified offset (NULL if no more available). */
u8* wordlist_get_word(u32 offset);
u8* wordlist_get_word(u32 offset, u8* specific);
/* Returns keyword candidate at a specified offset (or NULL). */
u8* wordlist_get_guess(u32 offset);
u8* wordlist_get_guess(u32 offset, u8* specific);
/* Returns extension at a specified offset (or NULL). */
u8* wordlist_get_extension(u32 offset);
u8* wordlist_get_extension(u32 offset, u8 specific);
/* Loads keywords from file. */
void load_keywords(u8* fname, u32 purge_age);
void load_keywords(u8* fname, u8 read_only, u32 purge_age);
/* Saves all keywords to a file. */

View File

@ -3,11 +3,75 @@ This directory contains four alternative, hand-picked Skipfish dictionaries.
PLEASE READ THIS FILE CAREFULLY BEFORE PICKING ONE. This is *critical* to
getting good results in your work.
----------------
Dictionary modes
----------------
------------------------
Key command-line options
------------------------
The basic modes you should be aware of (in order of request cost):
The dictionary to be used by the tool can be specified with the -W option,
and must conform to the format outlined at the end of this document. If you
omit -W in the command line, 'skipfish.wl' is assumed. This file does not
exist by default. That part is by design: THE SCANNER WILL MODIFY THE
SUPPLIED FILE UNLESS SPECIFICALLY INSTRUCTED NOT TO.
That's because the scanner automatically learns new keywords and extensions
based on any links discovered during the scan, and on random sampling of
site contents. The information is consequently stored in the dictionary
for future reuse, along with other bookkeeping information useful for
determining which keywords perform well, and which ones don't.
All this means that it is very important to maintain a separate dictionary
for every separate set of unrelated target sites. Otherwise, undesirable
interference will occur.
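For example, before scanning a new group of related targets, you could seed
a fresh per-target dictionary from one of the supplied files and point -W at
the copy (the file names below are purely illustrative):

cp dictionaries/minimal.wl example-com.wl
./skipfish -W example-com.wl -o output_dir http://www.example.com/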
With this out of the way, let's quickly review the options that may be used
to fine-tune various aspects of dictionary handling:
-L - do not automatically learn new keywords based on site content.
This option should normally not be used in most scanning
modes; if supplied, the scanner will not be able to discover
and leverage technology-specific terms and file extensions
unique to the architecture of the targeted site.
-G num - change jar size for keyword candidates.
Up to <num> candidates are randomly selected from site
content, and periodically retried during brute-force checks;
when one of them results in a unique non-404 response, it is
promoted to the dictionary proper. Unsuccessful candidates are
gradually replaced with new picks, and then discarded at the
end of the scan. The default jar size is 256.
-V - prevent the scanner from updating the dictionary file.
Normally, the primary read-write dictionary specified with the
-W option is updated at the end of the scan to add any newly
discovered keywords, and to update keyword usage stats. Using
this option eliminates this step.
-R num - purge all dictionary entries that had no non-404 hits for
the last <num> scans.
This option prevents dictionary creep in repeated assessments,
but needs to be used with care: it will permanently nuke a
part of the dictionary!
-Y - inhibit full ${filename}.${extension} brute-force.
In this mode, the scanner will only brute-force one component
at a time, trying all possible keywords without any extension,
and then trying to append extensions to any otherwise discovered
content.
This greatly improves scan times, but reduces coverage. Scan modes
2 and 3 shown in the next section make use of this flag.
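As an illustration, several of these switches can be combined in a single
run; the specific values and paths below are made up for the example:

./skipfish -W target_specific.wl -G 512 -R 10 -Y -o output_dir \
  http://www.example.com/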
--------------
Scanning modes
--------------
The basic dictionary-dependent modes you should be aware of (in order of the
associated request cost):
1) Orderly crawl with no DirBuster-like brute-force at all. In this mode, the
scanner will not discover non-linked resources such as /admin,
@ -20,14 +84,15 @@ The basic modes you should be aware of (in order of request cost):
2) Orderly scan with minimal extension brute-force. In this mode, the scanner
will not discover resources such as /admin, but will discover cases such as
/index.php.old:
/index.php.old (once index.php itself is spotted during an orderly crawl):
cp dictionaries/extensions-only.wl dictionary.wl
./skipfish -W dictionary.wl -Y [...other options...]
This method is only slightly more request-intensive than #1, and therefore,
generally recommended in cases where time is of essence. The cost is about
100 requests per fuzzed location.
is a marginally better alternative in cases where time is of essence. It's
still not recommended for most uses. The cost is about 100 requests per
fuzzed location.
3) Directory OR extension brute-force only. In this mode, the scanner will only
try fuzzing the file name, or the extension, at any given time - but will
@ -61,39 +126,27 @@ The basic modes you should be aware of (in order of request cost):
reasonably responsive servers; but it may be prohibitively expensive
when dealing with very large or very slow sites.
As should be obvious, the -W option points to a dictionary to be used; the
scanner updates the file based on scan results, so please always make a
target-specific copy - do not use the master file directly, or it may be
polluted with keywords not relevant to other targets.
----------------------------------
Using separate master dictionaries
----------------------------------
Additional options supported by the aforementioned modes:
A recently introduced feature allows you to load any number of read-only
supplementary dictionaries in addition to the "main" read-write one (-W
dictionary.wl).
-L - do not automatically learn new keywords based on site content.
This option should not be normally used in most scanning
modes; *not* using it significantly improves the coverage of
minimal.wl.
This is a convenient way to isolate (and be able to continually update) your
customized top-level wordlist, whilst still acquiring site-specific data in
a separate file. The following syntax may be used to accomplish this:
-G num - specifies jar size for keyword candidates selected from the
content; up to <num> candidates are kept and tried during
brute-force checks; when one of them results in a unique
non-404 response, it is promoted to the dictionary proper.
./skipfish -W initially_empty_site_specific_dict.wl -W +supplementary_dict1.wl \
-W +supplementary_dict2.wl [...other options...]
-V - prevents the scanner from updating the dictionary file with
newly discovered keywords and keyword usage stats (i.e., all
new findings are discarded on exit).
Only the main dictionary will be modified as a result of the scan, and only
newly discovered site-specific keywords will be appended there.
-Y - inhibits full ${filename}.${extension} brute-force: the scanner
will only brute-force one component at a time. This greatly
improves scan times, but reduces coverage. Modes 2 and 3
shown above make use of this flag.
-R num - purges all dictionary entries that had no non-404 hits for
the last <num> scans. Prevents dictionary creep in repeated
assessments, but use with care!
-----------------------------
More about dictionary design:
-----------------------------
----------------------------
More about dictionary design
----------------------------
Each dictionary may consist of a number of extensions, and a number of
"regular" keywords. Extensions are considered just a special subset of the
@ -103,29 +156,74 @@ You can create custom dictionaries, conforming to this format:
type hits total_age last_age keyword
...where 'type' is either 'e' or 'w' (extension or wordlist); 'hits' is the
total number of times this keyword resulted in a non-404 hit in all previous
scans; 'total_age' is the number of scan cycles this word is in the dictionary;
'last_age' is the number of scan cycles since the last 'hit'; and 'keyword' is
the actual keyword.
...where 'type' is either 'e' or 'w' (extension or keyword), followed by a
qualifier (explained below); 'hits' is the total number of times this keyword
resulted in a non-404 hit in all previous scans; 'total_age' is the number of scan
cycles this word has been in the dictionary; 'last_age' is the number of scan cycles
since the last 'hit'; and 'keyword' is the actual keyword.
Do not duplicate extensions as keywords - if you already have 'html' as an 'e'
entry, there is no need to also create a 'w' one.
Qualifiers alter the meaning of an entry in the following way:
There must be no empty or malformed lines, comments in the wordlist file.
Extension keywords must have no leading dot (e.g., 'exe', not '.exe'), and all
keywords should be NOT url-encoded (e.g., 'Program Files', not
'Program%20Files'). No keyword should exceed 64 characters.
wg - generic keyword that is not associated with any specific server-side
technology. Examples include 'backup', 'accounting', or 'logs'. These
will be indiscriminately combined with every known extension (e.g.,
'backup.php') during the fuzzing process.
If you omit -W in the command line, 'skipfish.wl' is assumed. This file does
not exist by default; this is by design.
ws - technology-specific keyword that is unlikely to have a random
extension; for example, with 'cgi-bin', testing for 'cgi-bin.php' is
usually a waste of time. Keywords tagged this way will be combined only
with a small set of technology-agnostic extensions - e.g., 'cgi-bin.old'.
The scanner will automatically learn new keywords and extensions based on any
links discovered during the scan; and will also analyze pages and extract
words to use as keyword candidates.
NOTE: Technology-specific keywords that, in the real world, are always
paired with a single, specific extension should be combined with that
extension in the 'ws' entry itself, rather than accommodated with
'wg' rules. For example, 'MANIFEST.MF' is OK.
Tread carefully; poor wordlists are one of the reasons why some web security
scanners perform worse than expected. You will almost always be better off
narrowing down or selectively extending the supplied set (and possibly
contributing back your changes upstream!), than importing a giant wordlist
scored elsewhere.
eg - generic extension that is not specific to any well-defined technology,
or may pop up in administrator- or developer-created auxiliary content.
Examples include 'bak', 'old', 'txt', or 'log'.
es - technology-specific extension, such as 'php' or 'cgi', that is
unlikely to spontaneously accompany random 'ws' keywords.
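Putting the qualifiers together, entries in a hand-written wordlist could
look like this (the keywords are the examples used above; hit counts and
ages are placeholders):

wg 1 1 1 backup
ws 1 1 1 cgi-bin
eg 1 1 1 bak
es 1 1 1 php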
Skipfish leverages this distinction by only trying the following brute-force
combinations:
/some/path/wg_keyword ('index')
/some/path/ws_keyword ('cgi-bin')
/some/path/wg_extension ('old')
/some/path/ws_extension ('php')
/some/path/wg_keyword.wg_extension ('index.old')
/some/path/wg_keyword.ws_extension ('index.php')
/some/path/ws_keyword.ws_extension ('cgi-bin.old')
To decide between 'wg' and 'ws', consider if you are likely to ever encounter
files such as ${this_word}.php or ${this_word}.class. If not, tag the keyword
as 'ws'.
Similarly, to decide between 'eg' and 'es', think about the possibility of
encountering cgi-bin.${this_ext} or formmail.${this_ext}. If it seems unlikely,
choose 'es'.
For your convenience, all legacy keywords and extensions, as well as any entries
detected automatically, will be stored in the dictionary with a '?' qualifier.
This is equivalent to 'g', and is meant to assist the user in reviewing and
triaging any automatically acquired dictionary data.
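For instance, an automatically learned or legacy extension might end up
recorded as 'e? 3 2 0 core' (the counts here are illustrative).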
Other notes about dictionaries:
- Do not duplicate extensions as keywords - if you already have 'html' as an
'e' entry, there is no need to also create a 'w' one.
- There must be no empty or malformed lines, or comments, in the wordlist
file. Extension keywords must have no leading dot (e.g., 'exe', not '.exe'),
and all keywords should NOT be url-encoded (e.g., 'Program Files', not
'Program%20Files'). No keyword should exceed 64 characters.
- Tread carefully; poor wordlists are one of the reasons why some web security
scanners perform worse than expected. You will almost always be better off
narrowing down or selectively extending the supplied set (and possibly
contributing back your changes upstream!), than importing a giant wordlist
scored elsewhere.

View File

@ -16,6 +16,7 @@ e 1 1 1 class
e 1 1 1 cnf
e 1 1 1 conf
e 1 1 1 config
e 1 1 1 core
e 1 1 1 cpp
e 1 1 1 cs
e 1 1 1 csproj
@ -587,7 +588,6 @@ w 1 1 1 cookies
w 1 1 1 copies
w 1 1 1 copy
w 1 1 1 copyright
w 1 1 1 core
w 1 1 1 corp
w 1 1 1 corpo
w 1 1 1 corporate

View File

@ -16,6 +16,7 @@ e 1 1 1 class
e 1 1 1 cnf
e 1 1 1 conf
e 1 1 1 config
e 1 1 1 core
e 1 1 1 cpp
e 1 1 1 cs
e 1 1 1 csproj

View File

@ -11,6 +11,7 @@ e 1 1 1 class
e 1 1 1 cnf
e 1 1 1 conf
e 1 1 1 config
e 1 1 1 core
e 1 1 1 cpp
e 1 1 1 csproj
e 1 1 1 csv
@ -556,7 +557,6 @@ w 1 1 1 cookies
w 1 1 1 copies
w 1 1 1 copy
w 1 1 1 copyright
w 1 1 1 core
w 1 1 1 corp
w 1 1 1 corpo
w 1 1 1 corporate

View File

@ -565,6 +565,22 @@ void tokenize_path(u8* str, struct http_request* req, u8 add_slash) {
value = url_decode_token(cur + !first_el, next_seg - !first_el, 0);
}
/* If the extracted segment is just '.' or '..', but is followed by
something else than '/', skip one separator. */
if (!name && cur[next_seg] && cur[next_seg] != '/' &&
(!strcmp((char*)value, ".") || !strcmp((char*)value, ".."))) {
next_seg = strcspn((char*)cur + next_seg + 1, "/;,!$?#") + next_seg + 1,
ck_free(name);
ck_free(value);
value = url_decode_token(cur + !first_el, next_seg - !first_el, 0);
}
switch (first_el ? '/' : *cur) {
case ';': set_value(PARAM_PATH_S, name, value, -1, &req->par); break;

View File

@ -97,12 +97,12 @@ struct http_request {
u16 port; /* Port number to connect to */
u8* orig_url; /* Copy of the original URL */
struct param_array par; /* Parameters, headers, cookies */
struct pivot_desc *pivot; /* Pivot descriptor */
u32 user_val; /* Can be used freely */
u8 with_ext; /* Extension-based probe? */
u8 (*callback)(struct http_request*, struct http_response*);
/* Callback to invoke when done */

View File

@ -303,7 +303,7 @@ static void compute_counts(struct pivot_desc* pv) {
/* Helper to JS-escape data. Static buffer, will be destroyed on
subsequent calls. */
static inline u8* js_escape(u8* str) {
static inline u8* js_escape(u8* str, u8 sp) {
u32 len;
static u8* ret;
u8* opos;
@ -316,7 +316,7 @@ static inline u8* js_escape(u8* str) {
opos = ret = __DFL_ck_alloc(len * 4 + 1);
while (len--) {
if (*str > 0x1f && *str < 0x80 && !strchr("<>\\'\"", *str)) {
if (*str > (sp ? 0x20 : 0x1f) && *str < 0x80 && !strchr("<>\\'\"", *str)) {
*(opos++) = *(str++);
} else {
sprintf((char*)opos, "\\x%02x", *(str++));
@ -343,7 +343,7 @@ static void output_scan_info(u64 scan_time, u32 seed) {
if (!f) PFATAL("Cannot open 'summary.js'");
fprintf(f, "var sf_version = '%s';\n", VERSION);
fprintf(f, "var scan_date = '%s';\n", js_escape(ct));
fprintf(f, "var scan_date = '%s';\n", js_escape(ct, 0));
fprintf(f, "var scan_seed = '0x%08x';\n", seed);
fprintf(f, "var scan_ms = %llu;\n", (long long)scan_time);
@ -370,12 +370,12 @@ static void describe_res(FILE* f, struct http_response* res) {
case STATE_OK:
fprintf(f, "'fetched': true, 'code': %u, 'len': %u, 'decl_mime': '%s', ",
res->code, res->pay_len,
js_escape(res->header_mime));
js_escape(res->header_mime, 0));
fprintf(f, "'sniff_mime': '%s', 'cset': '%s'",
res->sniffed_mime ? res->sniffed_mime : (u8*)"[none]",
js_escape(res->header_charset ? res->header_charset
: res->meta_charset));
: res->meta_charset, 0));
break;
case STATE_DNSERR:
@ -514,18 +514,18 @@ static void output_crawl_tree(struct pivot_desc* pv) {
fprintf(f, " { 'dupe': %s, 'type': %u, 'name': '%s%s",
pv->child[i]->dupe ? "true" : "false",
pv->child[i]->type, js_escape(pv->child[i]->name),
pv->child[i]->type, js_escape(pv->child[i]->name, 0),
(pv->child[i]->fuzz_par == -1 || pv->child[i]->type == PIVOT_VALUE)
? (u8*)"" : (u8*)"=");
fprintf(f, "%s', 'dir': '%s', 'linked': %d, ",
(pv->child[i]->fuzz_par == -1 || pv->child[i]->type == PIVOT_VALUE)
? (u8*)"" :
js_escape(pv->child[i]->req->par.v[pv->child[i]->fuzz_par]),
js_escape(pv->child[i]->req->par.v[pv->child[i]->fuzz_par], 0),
tmp, pv->child[i]->linked);
p = serialize_path(pv->child[i]->req, 1, 1);
fprintf(f, "'url': '%s', ", js_escape(p));
fprintf(f, "'url': '%s', ", js_escape(p, 0));
ck_free(p);
describe_res(f, pv->child[i]->res);
@ -557,7 +557,7 @@ static void output_crawl_tree(struct pivot_desc* pv) {
fprintf(f, " { 'severity': %u, 'type': %u, 'extra': '%s', ",
PSEV(pv->issue[i].type) - 1, pv->issue[i].type,
pv->issue[i].extra ? js_escape(pv->issue[i].extra) : (u8*)"");
pv->issue[i].extra ? js_escape(pv->issue[i].extra, 0) : (u8*)"");
describe_res(f, pv->issue[i].res);
@ -658,7 +658,7 @@ static void output_summary_views() {
save_req_res(m_samp[i].req[c], m_samp[i].res[c], 0);
if (chdir("..")) PFATAL("chdir unexpectedly fails!");
fprintf(f, " { 'url': '%s', 'dir': '%s/%s', 'linked': %d, 'len': %d"
" }%s\n", js_escape(p), tmp, tmp2,
" }%s\n", js_escape(p, 0), tmp, tmp2,
m_samp[i].req[c]->pivot->linked, m_samp[i].res[c]->pay_len,
(c == use_samp - 1) ? " ]" : ",");
ck_free(p);
@ -693,9 +693,9 @@ static void output_summary_views() {
if (chdir((char*)tmp2)) PFATAL("chdir unexpectedly fails!");
save_req_res(i_samp[i].i[c]->req, i_samp[i].i[c]->res, 0);
if (chdir("..")) PFATAL("chdir unexpectedly fails!");
fprintf(f, " { 'url': '%s', ", js_escape(p));
fprintf(f, " { 'url': '%s', ", js_escape(p, 0));
fprintf(f, "'extra': '%s', 'dir': '%s/%s' }%s\n",
i_samp[i].i[c]->extra ? js_escape(i_samp[i].i[c]->extra) :
i_samp[i].i[c]->extra ? js_escape(i_samp[i].i[c]->extra, 0) :
(u8*)"", tmp, tmp2,
(c == use_samp - 1) ? " ]" : ",");
ck_free(p);
@ -763,10 +763,12 @@ static void save_pivots(FILE* f, struct pivot_desc* cur) {
u8* url = serialize_path(cur->req, 1, 1);
fprintf(f, "%s %s ", cur->req->method ? cur->req->method : (u8*)"GET",
js_escape(url));
js_escape(url, 0));
ck_free(url);
fprintf(f, "name=%s ", js_escape(cur->name, 1));
switch (cur->type) {
case PIVOT_SERV: fprintf(f, "type=serv "); break;
case PIVOT_DIR: fprintf(f, "type=dir "); break;
@ -785,7 +787,8 @@ static void save_pivots(FILE* f, struct pivot_desc* cur) {
}
if (cur->res)
fprintf(f, "dup=%u %scode=%u len=%u notes=%u\n", cur->dupe,
fprintf(f, "dup=%u %s%scode=%u len=%u notes=%u\n", cur->dupe,
cur->bogus_par ? "bogus " : "",
cur->missing ? "returns_404 " : "",
cur->res->code, cur->res->pay_len, cur->issue_cnt);
else

View File

@ -83,10 +83,6 @@ do not parse HTML and other documents to find new links
.B \-o dir
write output to specified directory (required)
.TP
.B \-J
be less noisy about MIME / charset mismatches on probably
static content
.TP
.B \-M
log warnings about mixed content or non-SSL password forms
.TP
@ -147,6 +143,9 @@ timeout on idle HTTP connections (default: 10 s)
.TP
.B \-s s_limit
response size limit (default: 200000 B)
.TP
.B \-e
do not keep binary responses for reporting
.TP
.B \-h, \-\-help

View File

@ -239,7 +239,7 @@ int main(int argc, char** argv) {
u32 loop_cnt = 0, purge_age = 0, seed;
u8 dont_save_words = 0, show_once = 0, be_quiet = 0, display_mode = 0,
has_fake = 0;
u8 *wordlist = (u8*)DEF_WORDLIST, *output_dir = NULL;
u8 *wordlist = NULL, *output_dir = NULL;
struct termios term;
struct timeval tv;
@ -421,7 +421,12 @@ int main(int argc, char** argv) {
break;
case 'W':
if (optarg[0] == '+') load_keywords((u8*)optarg + 1, 1, 0);
else {
if (wordlist)
FATAL("Only one -W parameter permitted (unless '+' used).");
wordlist = (u8*)optarg;
}
break;
case 'b':
@ -526,7 +531,9 @@ int main(int argc, char** argv) {
if (max_connections < max_conn_host)
max_connections = max_conn_host;
load_keywords((u8*)wordlist, purge_age);
if (!wordlist) wordlist = (u8*)DEF_WORDLIST;
load_keywords(wordlist, 0, purge_age);
/* Schedule all URLs in the command line for scanning. */