From 6b2d33edca15d948d89f16ec49c1761a66395313 Mon Sep 17 00:00:00 2001 From: Steve Pinkham Date: Tue, 9 Aug 2011 16:04:52 -0400 Subject: [PATCH] Version 2.00b: Many improvements - Minor bug fix to path parsing to avoid problems with /.$foo/, - Improved PHP error detection (courtesy of Niels Heinen), - Improved dictionary logic (courtesy of Niels Heinen) and new documentation of the same, - Improved support for file.ext keywords in the dictionary, - Fixed missing content_checks() in unknown_check_callback()(courtesy of Niels Heinen), - Improved an oversight in dictionary case sensitivity, - Improved pivots.txt data, - Support for supplementary read-only dictionaries (-W +dict), - Change to directory detection to work around a certain sneaky server behavior. - TODO: Revise dictionaries!!! --- ChangeLog | 28 ++- Makefile | 2 +- README | 342 ++++++++++++++++---------------- analysis.c | 47 ++++- config.h | 4 +- crawler.c | 100 +++++++--- database.c | 135 ++++++++++--- database.h | 8 +- dictionaries/README-FIRST | 216 ++++++++++++++------ dictionaries/complete.wl | 2 +- dictionaries/extensions-only.wl | 1 + dictionaries/medium.wl | 2 +- http_client.c | 16 ++ http_client.h | 2 +- report.c | 31 +-- skipfish.1 | 7 +- skipfish.c | 13 +- 17 files changed, 630 insertions(+), 326 deletions(-) diff --git a/ChangeLog b/ChangeLog index 78d2294..5e7548b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,29 @@ +Version 2.00b: +-------------- + + - Minor bug fix to path parsing to avoid problems with /.$foo/, + + - Improved PHP error detection (courtesy of Niels Heinen), + + - Improved dictionary logic (courtesy of Niels Heinen) and new + documentation of the same, + + - Improved support for file.ext keywords in the dictionary, + + - Fixed missing content_checks() in unknown_check_callback() + (courtesy of Niels Heinen), + + - Improved an oversight in dictionary case sensitivity, + + - Improved pivots.txt data, + + - Support for supplementary read-only dictionaries (-W +dict), + + - Change to directory detection to work around a certain sneaky + server behavior. + + - TODO: Revise dictionaries!!! + Version 1.94b: -------------- @@ -9,7 +35,7 @@ Version 1.94b: Version 1.93b: -------------- - - Major fix to URL XSS detection logic. + - Major fix to URL XSS detection logic (courtesy of Niels Heinen). Version 1.92b: -------------- diff --git a/Makefile b/Makefile index 0544162..bdab0ee 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ # PROGNAME = skipfish -VERSION = 1.94b +VERSION = 2.00b OBJFILES = http_client.c database.c crawler.c analysis.c report.c INCFILES = alloc-inl.h string-inl.h debug.h types.h http_client.h \ diff --git a/README b/README index f2de531..8def8c9 100644 --- a/README +++ b/README @@ -12,30 +12,30 @@ skipfish - web application security scanner 1. What is skipfish? -------------------- -Skipfish is an active web application security reconnaissance tool. It -prepares an interactive sitemap for the targeted site by carrying out a -recursive crawl and dictionary-based probes. The resulting map is then -annotated with the output from a number of active (but hopefully -non-disruptive) security checks. The final report generated by the tool is -meant to serve as a foundation for professional web application security +Skipfish is an active web application security reconnaissance tool. It +prepares an interactive sitemap for the targeted site by carrying out a +recursive crawl and dictionary-based probes. 
The resulting map is then +annotated with the output from a number of active (but hopefully +non-disruptive) security checks. The final report generated by the tool is +meant to serve as a foundation for professional web application security assessments. ------------------------------------------------- 2. Why should I bother with this particular tool? ------------------------------------------------- -A number of commercial and open source tools with analogous functionality is -readily available (e.g., Nikto, Nessus); stick to the one that suits you -best. That said, skipfish tries to address some of the common problems +A number of commercial and open source tools with analogous functionality is +readily available (e.g., Nikto, Nessus); stick to the one that suits you +best. That said, skipfish tries to address some of the common problems associated with web security scanners. Specific advantages include: - * High performance: 500+ requests per second against responsive Internet + * High performance: 500+ requests per second against responsive Internet targets, 2000+ requests per second on LAN / MAN networks, and 7000+ requests against local instances have been observed, with a very modest CPU, network, and memory footprint. This can be attributed to: - * Multiplexing single-thread, fully asynchronous network I/O and data - processing model that eliminates memory management, scheduling, and IPC + * Multiplexing single-thread, fully asynchronous network I/O and data + processing model that eliminates memory management, scheduling, and IPC inefficiencies present in some multi-threaded clients. * Advanced HTTP/1.1 features such as range requests, content compression, @@ -45,8 +45,8 @@ associated with web security scanners. Specific advantages include: * Smart response caching and advanced server behavior heuristics are used to minimize unnecessary traffic. - * Performance-oriented, pure C implementation, including a custom - HTTP stack. + * Performance-oriented, pure C implementation, including a custom + HTTP stack. * Ease of use: skipfish is highly adaptive and reliable. The scanner features: @@ -60,34 +60,34 @@ associated with web security scanners. Specific advantages include: * Automatic wordlist construction based on site content analysis. * Probabilistic scanning features to allow periodic, time-bound assessments - of arbitrarily complex sites. + of arbitrarily complex sites. - * Well-designed security checks: the tool is meant to provide accurate + * Well-designed security checks: the tool is meant to provide accurate and meaningful results: - * Handcrafted dictionaries offer excellent coverage and permit thorough + * Handcrafted dictionaries offer excellent coverage and permit thorough $keyword.$extension testing in a reasonable timeframe. * Three-step differential probes are preferred to signature checks for detecting vulnerabilities. - * Ratproxy-style logic is used to spot subtle security problems: - cross-site request forgery, cross-site script inclusion, mixed content, + * Ratproxy-style logic is used to spot subtle security problems: + cross-site request forgery, cross-site script inclusion, mixed content, issues MIME- and charset mismatches, incorrect caching directives, etc. - * Bundled security checks are designed to handle tricky scenarios: - stored XSS (path, parameters, headers), blind SQL or XML injection, + * Bundled security checks are designed to handle tricky scenarios: + stored XSS (path, parameters, headers), blind SQL or XML injection, or blind shell injection. 
* Report post-processing drastically reduces the noise caused by any - remaining false positives or server gimmicks by identifying repetitive - patterns. + remaining false positives or server gimmicks by identifying repetitive + patterns. -That said, skipfish is not a silver bullet, and may be unsuitable for certain -purposes. For example, it does not satisfy most of the requirements outlined -in WASC Web Application Security Scanner Evaluation Criteria (some of them on -purpose, some out of necessity); and unlike most other projects of this type, -it does not come with an extensive database of known vulnerabilities for +That said, skipfish is not a silver bullet, and may be unsuitable for certain +purposes. For example, it does not satisfy most of the requirements outlined +in WASC Web Application Security Scanner Evaluation Criteria (some of them on +purpose, some out of necessity); and unlike most other projects of this type, +it does not come with an extensive database of known vulnerabilities for banner-type checks. ----------------------------------------------------- @@ -104,7 +104,7 @@ A rough list of the security checks offered by the tool is outlined below. * Server-side XML / XPath injection (including blind vectors). * Format string vulnerabilities. * Integer overflow vulnerabilities. - * Locations accepting HTTP PUT. + * Locations accepting HTTP PUT. * Medium risk flaws (potentially leading to data compromise): @@ -121,7 +121,7 @@ A rough list of the security checks offered by the tool is outlined below. * Generic MIME types on renderables. * Incorrect or missing charsets on renderables. * Conflicting MIME / charset info on renderables. - * Bad caching directives on cookie setting responses. + * Bad caching directives on cookie setting responses. * Low risk issues (limited impact or low specificity): @@ -135,7 +135,7 @@ A rough list of the security checks offered by the tool is outlined below. * HTML forms with no XSRF protection. * Self-signed SSL certificates. * SSL certificate host name mismatches. - * Bad caching directives on less sensitive content. + * Bad caching directives on less sensitive content. * Internal warnings: @@ -144,7 +144,7 @@ A rough list of the security checks offered by the tool is outlined below. * Failed 404 behavior checks. * IPS filtering detected. * Unexpected response variations. - * Seemingly misclassified crawl nodes. + * Seemingly misclassified crawl nodes. * Non-specific informational entries: @@ -170,14 +170,14 @@ A rough list of the security checks offered by the tool is outlined below. * Generic MIME type on less significant content. * Incorrect or missing charset on less significant content. * Conflicting MIME / charset information on less significant content. - * OGNL-like parameter passing conventions. + * OGNL-like parameter passing conventions. -Along with a list of identified issues, skipfish also provides summary -overviews of document types and issue types found; and an interactive -sitemap, with nodes discovered through brute-force denoted in a distinctive +Along with a list of identified issues, skipfish also provides summary +overviews of document types and issue types found; and an interactive +sitemap, with nodes discovered through brute-force denoted in a distinctive way. 
-NOTE: As a conscious design decision, skipfish will not redundantly complain +NOTE: As a conscious design decision, skipfish will not redundantly complain about highly non-specific issues, including but not limited to: * Non-httponly or non-secure cookies, @@ -186,51 +186,51 @@ about highly non-specific issues, including but not limited to: * Filesystem path disclosure in error messages, * Server of framework version disclosure, * Servers supporting TRACE or OPTIONS requests, - * Mere presence of certain technologies, such as WebDAV. + * Mere presence of certain technologies, such as WebDAV. -Most of these aspects are easy to inspect in a report if so desired - for -example, all the HTML forms are listed separately, so are new cookies or -interesting HTTP headers - and the expectation is that the auditor may opt to -make certain design recommendations based on this data where appropriate. +Most of these aspects are easy to inspect in a report if so desired - for +example, all the HTML forms are listed separately, so are new cookies or +interesting HTTP headers - and the expectation is that the auditor may opt to +make certain design recommendations based on this data where appropriate. That said, these occurrences are not highlighted as a specific security flaw. ----------------------------------------------------------- 4. All right, I want to try it out. What do I need to know? ----------------------------------------------------------- -First and foremost, please do not be evil. Use skipfish only against services +First and foremost, please do not be evil. Use skipfish only against services you own, or have a permission to test. -Keep in mind that all types of security testing can be disruptive. Although -the scanner is designed not to carry out malicious attacks, it may -accidentally interfere with the operations of the site. You must accept the -risk, and plan accordingly. Run the scanner against test instances where +Keep in mind that all types of security testing can be disruptive. Although +the scanner is designed not to carry out malicious attacks, it may +accidentally interfere with the operations of the site. You must accept the +risk, and plan accordingly. Run the scanner against test instances where feasible, and be prepared to deal with the consequences if things go wrong. -Also note that the tool is meant to be used by security professionals, and is -experimental in nature. It may return false positives or miss obvious -security problems - and even when it operates perfectly, it is simply not -meant to be a point-and-click application. Do not take its output at face +Also note that the tool is meant to be used by security professionals, and is +experimental in nature. It may return false positives or miss obvious +security problems - and even when it operates perfectly, it is simply not +meant to be a point-and-click application. Do not take its output at face value. -Running the tool against vendor-supplied demo sites is not a good way to -evaluate it, as they usually approximate vulnerabilities very imperfectly; we +Running the tool against vendor-supplied demo sites is not a good way to +evaluate it, as they usually approximate vulnerabilities very imperfectly; we made no effort to accommodate these cases. 
-Lastly, the scanner is simply not designed for dealing with rogue and -misbehaving HTTP servers - and offers no guarantees of safe (or sane) +Lastly, the scanner is simply not designed for dealing with rogue and +misbehaving HTTP servers - and offers no guarantees of safe (or sane) behavior there. -------------------------- 5. How to run the scanner? -------------------------- -To compile it, simply unpack the archive and try make. Chances are, you will +To compile it, simply unpack the archive and try make. Chances are, you will need to install libidn first. -Next, you need to copy the desired dictionary file from dictionaries/ to -skipfish.wl. Please read dictionaries/README-FIRST carefully to make the -right choice. This step has a profound impact on the quality of scan results +Next, you need to copy the desired dictionary file from dictionaries/ to +skipfish.wl. Please read dictionaries/README-FIRST carefully to make the +right choice. This step has a profound impact on the quality of scan results later on. Once you have the dictionary selected, you can try: @@ -243,16 +243,16 @@ the following syntax: $ ./skipfish -o output_dir @../path/to/url_list.txt -The tool will display some helpful stats while the scan is in progress. You +The tool will display some helpful stats while the scan is in progress. You can also switch to a list of in-flight HTTP requests by pressing return. -In the example above, skipfish will scan the entire www.example.com -(including services on other ports, if linked to from the main page), and -write a report to output_dir/index.html. You can then view this report with -your favorite browser (JavaScript must be enabled; and because of recent -file:/// security improvements in certain browsers, you might need to access -results over HTTP). The index.html file is static; actual results are stored -as a hierarchy of JSON files, suitable for machine processing or different +In the example above, skipfish will scan the entire www.example.com +(including services on other ports, if linked to from the main page), and +write a report to output_dir/index.html. You can then view this report with +your favorite browser (JavaScript must be enabled; and because of recent +file:/// security improvements in certain browsers, you might need to access +results over HTTP). The index.html file is static; actual results are stored +as a hierarchy of JSON files, suitable for machine processing or different presentation frontends if needs be. In addition, a list of all the discovered URLs will be saved to a single file, pivots.txt, for easy postprocessing. @@ -262,40 +262,40 @@ report will be non-destructively annotated by adding red background to all new or changed nodes; and blue background to all new or changed issues found. -Some sites may require authentication; for simple HTTP credentials, you can +Some sites may require authentication; for simple HTTP credentials, you can try: $ ./skipfish -A user:pass ...other parameters... -Alternatively, if the site relies on HTTP cookies instead, log in in your -browser or using a simple curl script, and then provide skipfish with a +Alternatively, if the site relies on HTTP cookies instead, log in in your +browser or using a simple curl script, and then provide skipfish with a session cookie: $ ./skipfish -C name=val ...other parameters... Other session cookies may be passed the same way, one per each -C option. 
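+
+For example, assuming a hypothetical login form at /login.php that sets a
+session cookie named SESSIONID (substitute the URL, field names, and cookie
+name actually used by your application), the login-then-scan sequence might
+look like this:
+
+$ curl -s -c cookies.txt -d 'user=admin&pass=secret' \
+    http://www.example.com/login.php >/dev/null
+$ grep -i sessionid cookies.txt
+$ ./skipfish -C "SESSIONID=0123456789abcdef" -o output_dir \
+    http://www.example.com/
+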
-Certain URLs on the site may log out your session; you can combat this in two -ways: by using the -N option, which causes the scanner to reject attempts to -set or delete cookies; or with the -X parameter, which prevents matching URLs +Certain URLs on the site may log out your session; you can combat this in two +ways: by using the -N option, which causes the scanner to reject attempts to +set or delete cookies; or with the -X parameter, which prevents matching URLs from being fetched: $ ./skipfish -X /logout/logout.aspx ...other parameters... -The -X option is also useful for speeding up your scans by excluding /icons/, -/doc/, /manuals/, and other standard, mundane locations along these lines. In +The -X option is also useful for speeding up your scans by excluding /icons/, +/doc/, /manuals/, and other standard, mundane locations along these lines. In general, you can use -X and -I (only spider URLs matching a substring) to -limit the scope of a scan any way you like - including restricting it only to +limit the scope of a scan any way you like - including restricting it only to a specific protocol and port: $ ./skipfish -I http://example.com:1234/ ...other parameters... -A related function, -K, allows you to specify parameter names not to fuzz +A related function, -K, allows you to specify parameter names not to fuzz (useful for applications that put session IDs in the URL, to minimize noise). -Another useful scoping option is -D - allowing you to specify additional -hosts or domains to consider in-scope for the test. By default, all hosts -appearing in the command-line URLs are added to the list - but you can use -D +Another useful scoping option is -D - allowing you to specify additional +hosts or domains to consider in-scope for the test. By default, all hosts +appearing in the command-line URLs are added to the list - but you can use -D to broaden these rules, for example: $ ./skipfish -D test2.example.com -o output-dir http://test1.example.com/ @@ -304,61 +304,61 @@ $ ./skipfish -D test2.example.com -o output-dir http://test1.example.com/ $ ./skipfish -D .example.com -o output-dir http://test1.example.com/ -In some cases, you do not want to actually crawl a third-party domain, but -you trust the owner of that domain enough not to worry about cross-domain -content inclusion from that location. To suppress warnings, you can use the +In some cases, you do not want to actually crawl a third-party domain, but +you trust the owner of that domain enough not to worry about cross-domain +content inclusion from that location. To suppress warnings, you can use the -B option, for example: -$ ./skipfish -B .google-analytics.com -B .googleapis.com ...other +$ ./skipfish -B .google-analytics.com -B .googleapis.com ...other parameters... -By default, skipfish sends minimalistic HTTP headers to reduce the amount of -data exchanged over the wire; some sites examine User-Agent strings or header -ordering to reject unsupported clients, however. In such a case, you can use +By default, skipfish sends minimalistic HTTP headers to reduce the amount of +data exchanged over the wire; some sites examine User-Agent strings or header +ordering to reject unsupported clients, however. In such a case, you can use -b ie, -b ffox, or -b phone to mimic one of the two popular browsers (or iPhone). -When it comes to customizing your HTTP requests, you can also use the -H -option to insert any additional, non-standard headers; or -F to define a -custom mapping between a host and an IP (bypassing the resolver). 
The latter +When it comes to customizing your HTTP requests, you can also use the -H +option to insert any additional, non-standard headers; or -F to define a +custom mapping between a host and an IP (bypassing the resolver). The latter feature is particularly useful for not-yet-launched or legacy services. -Some sites may be too big to scan in a reasonable timeframe. If the site -features well-defined tarpits - for example, 100,000 nearly identical user -profiles as a part of a social network - these specific locations can be -excluded with -X or -S. In other cases, you may need to resort to other -settings: -d limits crawl depth to a specified number of subdirectories; -c +Some sites may be too big to scan in a reasonable timeframe. If the site +features well-defined tarpits - for example, 100,000 nearly identical user +profiles as a part of a social network - these specific locations can be +excluded with -X or -S. In other cases, you may need to resort to other +settings: -d limits crawl depth to a specified number of subdirectories; -c limits the number of children per directory; -x limits the total number of descendants per crawl tree branch; and -r limits the total number of requests to send in a scan. -An interesting option is available for repeated assessments: -p. By -specifying a percentage between 1 and 100%, it is possible to tell the -crawler to follow fewer than 100% of all links, and try fewer than 100% of -all dictionary entries. This - naturally - limits the completeness of a scan, -but unlike most other settings, it does so in a balanced, non-deterministic -manner. It is extremely useful when you are setting up time-bound, but -periodic assessments of your infrastructure. Another related option is -q, -which sets the initial random seed for the crawler to a specified value. This -can be used to exactly reproduce a previous scan to compare results. -Randomness is relied upon most heavily in the -p mode, but also for making a +An interesting option is available for repeated assessments: -p. By +specifying a percentage between 1 and 100%, it is possible to tell the +crawler to follow fewer than 100% of all links, and try fewer than 100% of +all dictionary entries. This - naturally - limits the completeness of a scan, +but unlike most other settings, it does so in a balanced, non-deterministic +manner. It is extremely useful when you are setting up time-bound, but +periodic assessments of your infrastructure. Another related option is -q, +which sets the initial random seed for the crawler to a specified value. This +can be used to exactly reproduce a previous scan to compare results. +Randomness is relied upon most heavily in the -p mode, but also for making a couple of other scan management decisions elsewhere. -Some particularly complex (or broken) services may involve a very high number -of identical or nearly identical pages. Although these occurrences are by -default grayed out in the report, they still use up some screen estate and -take a while to process on JavaScript level. In such extreme cases, you may -use the -Q option to suppress reporting of duplicate nodes altogether, before -the report is written. This may give you a less comprehensive understanding +Some particularly complex (or broken) services may involve a very high number +of identical or nearly identical pages. Although these occurrences are by +default grayed out in the report, they still use up some screen estate and +take a while to process on JavaScript level. 
In such extreme cases, you may +use the -Q option to suppress reporting of duplicate nodes altogether, before +the report is written. This may give you a less comprehensive understanding of how the site is organized, but has no impact on test coverage. -In certain quick assessments, you might also have no interest in paying any -particular attention to the desired functionality of the site - hoping to -explore non-linked secrets only. In such a case, you may specify -P to -inhibit all HTML parsing. This limits the coverage and takes away the ability -for the scanner to learn new keywords by looking at the HTML, but speeds up -the test dramatically. Another similarly crippling option that reduces the -risk of persistent effects of a scan is -O, which inhibits all form parsing +In certain quick assessments, you might also have no interest in paying any +particular attention to the desired functionality of the site - hoping to +explore non-linked secrets only. In such a case, you may specify -P to +inhibit all HTML parsing. This limits the coverage and takes away the ability +for the scanner to learn new keywords by looking at the HTML, but speeds up +the test dramatically. Another similarly crippling option that reduces the +risk of persistent effects of a scan is -O, which inhibits all form parsing and submission steps. Some sites that handle sensitive user data care about SSL - and about getting @@ -368,45 +368,45 @@ this. The scanner will complain about situations such as http:// scripts being loaded on https:// pages - but will disregard non-risk scenarios such as images. -Likewise, certain pedantic sites may care about cases where caching is -restricted on HTTP/1.1 level, but no explicit HTTP/1.0 caching directive is -given on specifying -E in the command-line causes skipfish to log all such +Likewise, certain pedantic sites may care about cases where caching is +restricted on HTTP/1.1 level, but no explicit HTTP/1.0 caching directive is +given on specifying -E in the command-line causes skipfish to log all such cases carefully. -Lastly, in some assessments that involve self-contained sites without -extensive user content, the auditor may care about any external e-mails or -HTTP links seen, even if they have no immediate security impact. Use the -U +Lastly, in some assessments that involve self-contained sites without +extensive user content, the auditor may care about any external e-mails or +HTTP links seen, even if they have no immediate security impact. Use the -U option to have these logged. -Dictionary management is a special topic, and - as mentioned - is covered in -more detail in dictionaries/README-FIRST. Please read that file before -proceeding. Some of the relevant options include -W to specify a custom -wordlist, -L to suppress auto-learning, -V to suppress dictionary updates, -G -to limit the keyword guess jar size, -R to drop old dictionary entries, and +Dictionary management is a special topic, and - as mentioned - is covered in +more detail in dictionaries/README-FIRST. Please read that file before +proceeding. Some of the relevant options include -W to specify a custom +wordlist, -L to suppress auto-learning, -V to suppress dictionary updates, -G +to limit the keyword guess jar size, -R to drop old dictionary entries, and -Y to inhibit expensive $keyword.$extension fuzzing. -Skipfish also features a form auto-completion mechanism in order to maximize -scan coverage. 
The values should be non-malicious, as they are not meant to -implement security checks - but rather, to get past input validation logic. -You can define additional rules, or override existing ones, with the -T -option (-T form_field_name=field_value, e.g. -T login=test123 -T -password=test321 - although note that -C and -A are a much better method of +Skipfish also features a form auto-completion mechanism in order to maximize +scan coverage. The values should be non-malicious, as they are not meant to +implement security checks - but rather, to get past input validation logic. +You can define additional rules, or override existing ones, with the -T +option (-T form_field_name=field_value, e.g. -T login=test123 -T +password=test321 - although note that -C and -A are a much better method of logging in). -There is also a handful of performance-related options. Use -g to set the -maximum number of connections to maintain, globally, to all targets (it is -sensible to keep this under 50 or so to avoid overwhelming the TCP/IP stack -on your system or on the nearby NAT / firewall devices); and -m to set the -per-IP limit (experiment a bit: 2-4 is usually good for localhost, 4-8 for -local networks, 10-20 for external targets, 30+ for really lagged or -non-keep-alive hosts). You can also use -w to set the I/O timeout (i.e., -skipfish will wait only so long for an individual read or write), and -t to -set the total request timeout, to account for really slow or really fast +There is also a handful of performance-related options. Use -g to set the +maximum number of connections to maintain, globally, to all targets (it is +sensible to keep this under 50 or so to avoid overwhelming the TCP/IP stack +on your system or on the nearby NAT / firewall devices); and -m to set the +per-IP limit (experiment a bit: 2-4 is usually good for localhost, 4-8 for +local networks, 10-20 for external targets, 30+ for really lagged or +non-keep-alive hosts). You can also use -w to set the I/O timeout (i.e., +skipfish will wait only so long for an individual read or write), and -t to +set the total request timeout, to account for really slow or really fast sites. -Lastly, -f controls the maximum number of consecutive HTTP errors you are -willing to see before aborting the scan; and -s sets the maximum length of a -response to fetch and parse (longer responses will be truncated). +Lastly, -f controls the maximum number of consecutive HTTP errors you are +willing to see before aborting the scan; and -s sets the maximum length of a +response to fetch and parse (longer responses will be truncated). When scanning large, multimedia-heavy sites, you may also want to specify -e. This prevents binary documents from being kept in memory for reporting @@ -421,20 +421,20 @@ Oh, and real-time scan statistics can be suppressed with -u. 6. But seriously, how to run it? 
-------------------------------- -A standard, authenticated scan of a well-designed and self-contained site -(warns about all external links, e-mails, mixed content, and caching header +A standard, authenticated scan of a well-designed and self-contained site +(warns about all external links, e-mails, mixed content, and caching header issues): $ ./skipfish -MEU -C "AuthCookie=value" -X /logout.aspx -o output_dir \ http://www.example.com/ Five-connection crawl, but no brute-force; pretending to be MSIE and and -trusting example.com content): +trusting example.com content: $ ./skipfish -m 5 -LV -W /dev/null -o output_dir -b ie -B example.com \ http://www.example.com/ -Brute force only (no HTML link extraction), limited to a single directory and +Brute force only (no HTML link extraction), limited to a single directory and timing out after 5 seconds: $ ./skipfish -P -I http://www.example.com/dir1/ -o output_dir -t 5 -I \ @@ -471,15 +471,15 @@ applications. 8. Known limitations / feature wishlist --------------------------------------- -Below is a list of features currently missing in skipfish. If you wish to -improve the tool by contributing code in one of these areas, please let me +Below is a list of features currently missing in skipfish. If you wish to +improve the tool by contributing code in one of these areas, please let me know: * Buffer overflow checks: after careful consideration, I suspect there is no reliable way to test for buffer overflows remotely. Much like the actual fault condition we are looking for, proper buffer size checks may also result in uncaught exceptions, 500 messages, etc. I would love to be proved - wrong, though. + wrong, though. * Fully-fledged JavaScript XSS detection: several rudimentary checks are present in the code, but there is no proper script engine to evaluate @@ -490,15 +490,15 @@ know: they were much lower priority at the time of this writing. * Security checks and link extraction for third-party, plugin-based - content (Flash, Java, PDF, etc). + content (Flash, Java, PDF, etc). * Password brute-force and numerical filename brute-force probes. - * Search engine integration (vhosts, starting paths). + * Search engine integration (vhosts, starting paths). - * VIEWSTATE decoding. + * VIEWSTATE decoding. - * NTLM and digest authentication. + * NTLM and digest authentication. * More specific PHP tests (eval injection, RFI). @@ -506,7 +506,7 @@ know: a #define directive in config.h. Adding support for HTTPS proxying is more complicated, and still in the works. - * Scan resume option. + * Scan resume option, better runtime info. * Option to limit document sampling or save samples directly to disk. @@ -514,16 +514,20 @@ know: * Config file support. - * A database for banner / version checks? + * Scheduling and management web UI. + + * QPS throttling and maximum scan time limit. + + * A database for banner / version checks or other configurable rules? ------------------------------------- 9. Oy! Something went horribly wrong! ------------------------------------- -There is no web crawler so good that there wouldn't be a web framework to one -day set it on fire. If you encounter what appears to be bad behavior (e.g., a -scan that takes forever and generates too many requests, completely bogus -nodes in scan output, or outright crashes), please first check our known +There is no web crawler so good that there wouldn't be a web framework to one +day set it on fire. 
If you encounter what appears to be bad behavior (e.g., a +scan that takes forever and generates too many requests, completely bogus +nodes in scan output, or outright crashes), please first check our known issues page: http://code.google.com/p/skipfish/wiki/KnownIssues @@ -536,8 +540,8 @@ $ make clean debug $ ./skipfish [...previous options...] 2>logfile.txt -You can then inspect logfile.txt to get an idea what went wrong; if it looks -like a scanner problem, please scrub any sensitive information from the log +You can then inspect logfile.txt to get an idea what went wrong; if it looks +like a scanner problem, please scrub any sensitive information from the log file and send it to the author. If the scanner crashed, please recompile it as indicated above, and then type: @@ -552,8 +556,8 @@ $ gdb --batch -ex back ./skipfish core 10. Credits and feedback ------------------------ -Skipfish is made possible thanks to the contributions of, and valuable +Skipfish is made possible thanks to the contributions of, and valuable feedback from, Google's information security engineering team. -If you have any bug reports, questions, suggestions, or concerns regarding -the application, the author can be reached at lcamtuf@google.com. +If you have any bug reports, questions, suggestions, or concerns regarding +the application, the author can be reached at lcamtuf@google.com. diff --git a/analysis.c b/analysis.c index 882205f..3f9e632 100644 --- a/analysis.c +++ b/analysis.c @@ -930,7 +930,7 @@ add_link: i = 0; - while ((ext = wordlist_get_extension(i++))) { + while ((ext = wordlist_get_extension(i++, 0))) { u32 ext_len = strlen((char*)ext); if (clean_len > ext_len + 2 && @@ -2280,11 +2280,32 @@ static void check_for_stuff(struct http_request* req, return; } - if (strstr((char*)res->payload, "Fatal error:") || - strstr((char*)res->payload, "Parse error:") || - strstr((char*)res->payload, " on line ")) { - problem(PROB_ERROR_POI, req, res, (u8*)"PHP error", req->pivot, 0); - return; + if ((tmp = (u8*)strstr((char*)res->payload, " on line "))) { + u32 off = 512; + + while (tmp - 1 > res->payload && !strchr("\r\n", tmp[-1]) + && off--) tmp--; + + if (off && (!prefix(tmp, "Warning: ") || !prefix(tmp, "Notice: ") || + !prefix(tmp, "Fatal error: ") || !prefix(tmp, "Parse error: ") || + !prefix(tmp, "Deprecated: ") || + !prefix(tmp, "Strict Standards: ") || + !prefix(tmp, "Catchable fatal error: "))) { + problem(PROB_ERROR_POI, req, res, (u8*)"PHP error (text)", req->pivot, 0); + return; + } + + if (off && !prefix(tmp, "") && (!prefix(tmp + 3, "Warning: ") || + !prefix(tmp + 3, "Notice: ") || + !prefix(tmp + 3, "Fatal error: ") || + !prefix(tmp + 3, "Parse error: ") || + !prefix(tmp + 3, "Deprecated: ") || + !prefix(tmp + 3, "Strict Standards: ") || + !prefix(tmp + 3, "Catchable fatal error: "))) { + problem(PROB_ERROR_POI, req, res, (u8*)"PHP error (HTML)", req->pivot, 0); + return; + } + } if (strstr((char*)res->payload, "Warning: MySQL: ") || @@ -2326,12 +2347,26 @@ static void check_for_stuff(struct http_request* req, if (strstr((char*)sniffbuf, "")) { problem(PROB_FILE_POI, req, res, (u8*) "Flash cross-domain policy", req->pivot, 0); + + /* + if (strstr((char*)res->payload, "domain=\"*\"")) + problem(PROB_CROSS_WILD, req, res, (u8*) + "Cross-domain policy with wildcard rules", req->pivot, 0); + */ + return; } if (strstr((char*)sniffbuf, "")) { problem(PROB_FILE_POI, req, res, (u8*)"Silverlight cross-domain policy", req->pivot, 0); + + /* + if (strstr((char*)res->payload, "uri=\"*\"")) + 
problem(PROB_CROSS_WILD, req, res, (u8*) + "Cross-domain policy with wildcard rules", req->pivot, 0); + */ + return; } diff --git a/config.h b/config.h index 32f3aa5..06e2e93 100644 --- a/config.h +++ b/config.h @@ -27,9 +27,9 @@ #define SHOW_SPLASH 1 /* Annoy user with a splash screen */ -/* Define this to enable experimental HTTP proxy support, through the -J +/* Define this to enable experimental HTTP proxy support, through the -J option in the command line. This mode will not work as expected for - HTTPS requests at this point. */ + HTTPS requests at this time - sorry. */ // #define PROXY_SUPPORT 1 diff --git a/crawler.c b/crawler.c index 9ca8888..2cda295 100644 --- a/crawler.c +++ b/crawler.c @@ -354,7 +354,7 @@ static void secondary_ext_init(struct pivot_desc* pv, struct http_request* req, i = 0; - while ((ex = wordlist_get_extension(i))) { + while ((ex = wordlist_get_extension(i, 0))) { u8* tmp = ck_alloc(strlen((char*)base_name) + strlen((char*)ex) + 2); u32 c; @@ -382,6 +382,7 @@ static void secondary_ext_init(struct pivot_desc* pv, struct http_request* req, n->par.v[tpar] = tmp; n->user_val = 1; + n->with_ext = 1; memcpy(&n->same_sig, &res->sig, sizeof(struct http_sig)); @@ -1814,6 +1815,7 @@ static void crawl_par_dict_init(struct pivot_desc* pv) { struct http_request* n; u8 *kw, *ex; u32 i, c; + u8 specific; /* Too many requests still pending, or already done? */ @@ -1832,7 +1834,7 @@ restart_dict: i = 0; kw = (pv->pdic_guess ? wordlist_get_guess : wordlist_get_word) - (pv->pdic_cur_key); + (pv->pdic_cur_key, &specific); if (!kw) { @@ -1878,10 +1880,11 @@ restart_dict: /* Schedule probes for all extensions for the current word, but only if the original parameter contained '.' somewhere, - and only if string is not on the try list. */ + and only if string is not on the try list. Special handling + for specific keywords with '.' inside. */ - if (strchr((char*)TPAR(pv->req), '.')) - while (!no_fuzz_ext && (ex = wordlist_get_extension(i))) { + if (!no_fuzz_ext && strchr((char*)TPAR(pv->req), '.')) + while ((ex = wordlist_get_extension(i, specific))) { u8* tmp = ck_alloc(strlen((char*)kw) + strlen((char*)ex) + 2); @@ -1901,6 +1904,7 @@ restart_dict: ck_free(TPAR(n)); TPAR(n) = tmp; n->callback = par_dict_callback; + n->with_ext = 1; pv->pdic_pending++; in_dict_init = 1; async_request(n); @@ -2333,6 +2337,7 @@ static u8 dir_404_callback(struct http_request* req, } memcpy(&req->pivot->r404[i], &res->sig, sizeof(struct http_sig)); + req->pivot->r404_cnt++; /* Is this a new signature not seen on parent? Notify if so, @@ -2379,7 +2384,7 @@ schedule_next: /* Aaand schedule all the remaining probes. */ - while ((nk = wordlist_get_extension(cur_ext++))) { + while ((nk = wordlist_get_extension(cur_ext++, 0))) { u8* tmp = ck_alloc(strlen(BOGUS_FILE) + strlen((char*)nk) + 2); n = req_copy(RPREQ(req), req->pivot, 1); @@ -2388,6 +2393,7 @@ schedule_next: replace_slash(n, tmp); ck_free(tmp); n->callback = dir_404_callback; + n->with_ext = 1; n->user_val = 1; /* r404_pending is at least 1 to begin with, so this is safe @@ -2655,6 +2661,7 @@ static void crawl_dir_dict_init(struct pivot_desc* pv) { struct http_request* n; u8 *kw, *ex; u32 i, c; + u8 specific; /* Too many requests still pending, or already moved on to parametric tests? */ @@ -2682,7 +2689,8 @@ static void crawl_dir_dict_init(struct pivot_desc* pv) { restart_dict: - kw = (pv->guess ? wordlist_get_guess : wordlist_get_word)(pv->cur_key); + kw = (pv->guess ? 
wordlist_get_guess : wordlist_get_word) + (pv->cur_key, &specific); if (!kw) { @@ -2739,39 +2747,42 @@ restart_dict: } /* Schedule probes for all extensions for the current word, - likewise. */ + likewise. Make an exception for specific keywords that + already contain a period. */ i = 0; - while (!no_fuzz_ext && (ex = wordlist_get_extension(i))) { + if (!no_fuzz_ext) + while ((ex = wordlist_get_extension(i, specific))) { - u8* tmp = ck_alloc(strlen((char*)kw) + strlen((char*)ex) + 2); + u8* tmp = ck_alloc(strlen((char*)kw) + strlen((char*)ex) + 2); - sprintf((char*)tmp, "%s.%s", kw, ex); + sprintf((char*)tmp, "%s.%s", kw, ex); - for (c=0;cchild_cnt;c++) - if (!((is_c_sens(pv) ? strcmp : strcasecmp)((char*)tmp, - (char*)pv->child[c]->name))) break; + for (c=0;cchild_cnt;c++) + if (!((is_c_sens(pv) ? strcmp : strcasecmp)((char*)tmp, + (char*)pv->child[c]->name))) break; - if (pv->fuzz_par != -1 && - !((is_c_sens(pv) ? strcmp : strcasecmp)((char*)tmp, - (char*)pv->req->par.v[pv->fuzz_par]))) c = pv->child_cnt; + if (pv->fuzz_par != -1 && + !((is_c_sens(pv) ? strcmp : strcasecmp)((char*)tmp, + (char*)pv->req->par.v[pv->fuzz_par]))) c = pv->child_cnt; - if (c == pv->child_cnt) { - n = req_copy(pv->req, pv, 1); - replace_slash(n, tmp); - n->callback = dir_dict_callback; - pv->pending++; - in_dict_init = 1; - async_request(n); - in_dict_init = 0; + if (c == pv->child_cnt) { + n = req_copy(pv->req, pv, 1); + replace_slash(n, tmp); + n->callback = dir_dict_callback; + n->with_ext = 1; + pv->pending++; + in_dict_init = 1; + async_request(n); + in_dict_init = 0; + } + + ck_free(tmp); + + i++; } - ck_free(tmp); - - i++; - } - } pv->cur_key++; @@ -2917,6 +2928,7 @@ u8 fetch_unknown_callback(struct http_request* req, struct http_response* res) { n = req_copy(req, req->pivot, 1); set_value(PARAM_PATH, NULL, (u8*)"", -1, &n->par); n->callback = unknown_check_callback; + n->with_ext = req->with_ext; async_request(n); /* This is the initial callback, keep the response. */ @@ -2974,13 +2986,34 @@ static u8 unknown_check_callback(struct http_request* req, } - if (par) + if (par) { for (i=0;ir404_cnt;i++) if (same_page(&res->sig, &par->r404[i])) break; + /* Do not use extension-originating signatures for settling non-extension + cases. */ + + if (i && !req->with_ext) i = par->r404_cnt; + + } + if ((!par && res->code == 404) || (par && i != par->r404_cnt) || (RPRES(req)->code < 300 && res->code >= 300 && RPRES(req)->pay_len)) { +DEBUG("REASON X\n"); +if (par) DEBUG("same_404 = %d\n", i != par->r404_cnt); +DEBUG("par = %p\n", par); +if (par) DEBUG("par->r404_cnt = %d\n", par->r404_cnt); +DEBUG("res->code = %d\n", res->code); +DEBUG("parent code = %d\n", RPRES(req)->code); +DEBUG("parent len = %d\n", RPRES(req)->pay_len); + +// (!par && res->code == 404) || - NIE +// (par && i != par->r404_cnt) || - TAK +// (RPRES(req)->code < 300 && res->code >= 300 && RPRES(req)->pay_len)) + + + req->pivot->type = PIVOT_FILE; } else { @@ -2999,6 +3032,11 @@ assume_dir: req->pivot->type = PIVOT_DIR; + /* Perform content checks before discarding the old payload. */ + + if (!same_page(&RPRES(req)->sig, &res->sig)) + content_checks(RPREQ(req), RPRES(req)); + /* Replace original request, response with new data. 
*/ destroy_request(RPREQ(req)); diff --git a/database.c b/database.c index 63e84be..4f261b5 100644 --- a/database.c +++ b/database.c @@ -57,11 +57,17 @@ u32 max_depth = MAX_DEPTH, u8 dont_add_words; /* No auto dictionary building */ +#define KW_SPECIFIC 0 +#define KW_GENERIC 1 +#define KW_GEN_AUTO 2 + struct kw_entry { u8* word; /* Keyword itself */ u32 hit_cnt; /* Number of confirmed sightings */ u8 is_ext; /* Is an extension? */ u8 hit_already; /* Had its hit count bumped up? */ + u8 read_only; /* Read-only dictionary? */ + u8 class; /* KW_* */ u32 total_age; /* Total age (in scan cycles) */ u32 last_age; /* Age since last hit */ }; @@ -71,11 +77,19 @@ static struct kw_entry* static u32 keyword_cnt[WORD_HASH]; /* Per-bucket keyword counts */ -static u8 **extension, /* Extension list */ - **guess; /* Keyword candidate list */ +struct ext_entry { + u32 bucket; + u32 index; +}; + +static struct ext_entry *extension, /* Extension list */ + *sp_extension; + +static u8 **guess; /* Keyword candidate list */ u32 guess_cnt, /* Number of keyword candidates */ extension_cnt, /* Number of extensions */ + sp_extension_cnt, /* Number of specific extensions */ keyword_total_cnt, /* Current keyword count */ keyword_orig_cnt; /* At-boot keyword count */ @@ -818,7 +832,7 @@ static inline u32 hash_word(u8* str) { /* Adds a new keyword candidate to the global "guess" list. This - list is always case-insensitive. */ + list is case-sensitive. */ void wordlist_add_guess(u8* text) { u32 target, i, kh; @@ -830,7 +844,7 @@ void wordlist_add_guess(u8* text) { if (!text || !text[0] || strlen((char*)text) > MAX_WORD) return; for (i=0;i MAX_WORD) return; @@ -866,7 +880,7 @@ static void wordlist_confirm_single(u8* text, u8 is_ext, u32 add_hits, kh = hash_word(text); for (i=0;i 12) return; if (ppos && ppos != tlen - 1 && !isdigit(text[ppos] + 1)) { - wordlist_confirm_single(text + ppos + 1, 1, 1, 0, 0); + wordlist_confirm_single(text + ppos + 1, 1, KW_GEN_AUTO, 0, 1, 0, 0); text[ppos] = 0; - wordlist_confirm_single(text, 0, 1, 0, 0); + wordlist_confirm_single(text, 0, KW_GEN_AUTO, 0, 1, 0, 0); text[ppos] = '.'; return; } } - wordlist_confirm_single(text, 0, 1, 0, 0); + wordlist_confirm_single(text, 0, KW_GEN_AUTO, 0, 1, 0, 0); } /* Returns wordlist item at a specified offset (NULL if no more available). */ -u8* wordlist_get_word(u32 offset) { +u8* wordlist_get_word(u32 offset, u8* specific) { u32 cur_off = 0, kh; for (kh=0;kh= guess_cnt) return NULL; + *specific = 0; return guess[offset]; } /* Returns extension at a specified offset (or NULL). */ -u8* wordlist_get_extension(u32 offset) { - if (offset >= extension_cnt) return NULL; - return extension[offset]; +u8* wordlist_get_extension(u32 offset, u8 specific) { + + if (!specific) { + if (offset >= extension_cnt) return NULL; + return keyword[extension[offset].bucket][extension[offset].index].word; + } + + if (offset >= sp_extension_cnt) return NULL; + return keyword[sp_extension[offset].bucket][sp_extension[offset].index].word; } /* Loads keywords from file. 
*/ -void load_keywords(u8* fname, u32 purge_age) { +void load_keywords(u8* fname, u8 read_only, u32 purge_age) { FILE* in; u32 hits, total_age, last_age, lines = 0; - u8 type; + u8 type[3]; s32 fields; u8 kword[MAX_WORD + 1]; char fmt[32]; @@ -1036,19 +1093,28 @@ void load_keywords(u8* fname, u32 purge_age) { return; } - sprintf(fmt, "%%c %%u %%u %%u %%%u[^\x01-\x1f]", MAX_WORD); + sprintf(fmt, "%%2s %%u %%u %%u %%%u[^\x01-\x1f]", MAX_WORD); - while ((fields = fscanf(in, fmt, &type, &hits, &total_age, &last_age, kword)) + while ((fields = fscanf(in, fmt, type, &hits, &total_age, &last_age, kword)) == 5) { + + u8 class = KW_GEN_AUTO; + + if (type[0] != 'e' && type[0] != 'w') + FATAL("Wordlist '%s': bad keyword type in line %u.\n", fname, lines + 1); + + if (type[1] == 's') class = KW_SPECIFIC; else + if (type[1] == 'g') class = KW_GENERIC; + if (!purge_age || last_age < purge_age) - wordlist_confirm_single(kword, (type == 'e'), hits, + wordlist_confirm_single(kword, (type[0] == 'e'), class, read_only, hits, total_age + 1, last_age + 1); lines++; fgetc(in); /* sink \n */ } if (fields != -1 && fields != 5) - FATAL("Wordlist '%s': syntax error in line %u.\n", fname, lines + 1); + FATAL("Wordlist '%s': syntax error in line %u.\n", fname, lines); if (!lines) WARN("Wordlist '%s' contained no valid entries.", fname); @@ -1110,11 +1176,21 @@ void save_keywords(u8* fname) { } for (kh=0;kh candidates are randomly selected from site + content, and periodically retried during brute-force checks; + when one of them results in a unique non-404 response, it is + promoted to the dictionary proper. Unsuccessful candidates are + gradually replaced with new picks, and then discarded at the + end of the scan. The default jar size is 256. + + -V - prevent the scanner from updating the dictionary file. + + Normally, the primary read-write dictionary specified with the + -W option is updated at the end of the scan to add any newly + discovered keywords, and to update keyword usage stats. Using + this option eliminates this step. + + -R num - purge all dictionary entries that had no non-404 hits for + the last scans. + + This option prevents dictionary creep in repeated assessments, + but needs to be used with care: it will permanently nuke a + part of the dictionary! + + -Y - inhibit full ${filename}.${extension} brute-force. + + In this mode, the scanner will only brute-force one component + at a time, trying all possible keywords without any extension, + and then trying to append extensions to any otherwise discovered + content. + + This greatly improves scan times, but reduces coverage. Scan modes + 2 and 3 shown in the next section make use of this flag. + +-------------- +Scanning modes +-------------- + +The basic dictionary-dependent modes you should be aware of (in order of the +associated request cost): 1) Orderly crawl with no DirBuster-like brute-force at all. In this mode, the scanner will not discover non-linked resources such as /admin, @@ -20,24 +84,25 @@ The basic modes you should be aware of (in order of request cost): 2) Orderly scan with minimal extension brute-force. In this mode, the scanner will not discover resources such as /admin, but will discover cases such as - /index.php.old: + /index.php.old (once index.php itself is spotted during an orderly crawl): cp dictionaries/extensions-only.wl dictionary.wl ./skipfish -W dictionary.wl -Y [...other options...] This method is only slightly more request-intensive than #1, and therefore, - generally recommended in cases where time is of essence. 
The cost is about - 100 requests per fuzzed location. + is a marginally better alternative in cases where time is of essence. It's + still not recommended for most uses. The cost is about 100 requests per + fuzzed location. 3) Directory OR extension brute-force only. In this mode, the scanner will only - try fuzzing the file name, or the extension, at any given time - but will + try fuzzing the file name, or the extension, at any given time - but will not try every possible ${filename}.${extension} pair from the dictionary. cp dictionaries/complete.wl dictionary.wl ./skipfish -W dictionary.wl -Y [...other options...] This method has a cost of about 2,000 requests per fuzzed location, and is - recommended for rapid assessments, especially when working with slow + recommended for rapid assessments, especially when working with slow servers or very large services. 4) Normal dictionary fuzzing. In this mode, every ${filename}.${extension} @@ -61,41 +126,29 @@ The basic modes you should be aware of (in order of request cost): reasonably responsive servers; but it may be prohibitively expensive when dealing with very large or very slow sites. -As should be obvious, the -W option points to a dictionary to be used; the -scanner updates the file based on scan results, so please always make a -target-specific copy - do not use the master file directly, or it may be -polluted with keywords not relevant to other targets. +---------------------------------- +Using separate master dictionaries +---------------------------------- -Additional options supported by the aforementioned modes: +A recently introduced feature allows you to load any number of read-only +supplementary dictionaries in addition to the "main" read-write one (-W +dictionary.wl). - -L - do not automatically learn new keywords based on site content. - This option should not be normally used in most scanning - modes; *not* using it significantly improves the coverage of - minimal.wl. +This is a convenient way to isolate (and be able to continually update) your +customized top-level wordlist, whilst still acquiring site-specific data in +a separate file. The following syntax may be used to accomplish this: - -G num - specifies jar size for keyword candidates selected from the - content; up to candidates are kept and tried during - brute-force checks; when one of them results in a unique - non-404 response, it is promoted to the dictionary proper. +./skipfish -W initially_empty_site_specific_dict.wl -W +supplementary_dict1.wl \ + -W +supplementary_dict2.wl [...other options...] - -V - prevents the scanner from updating the dictionary file with - newly discovered keywords and keyword usage stats (i.e., all - new findings are discarded on exit). +Only the main dictionary will be modified as a result of the scan, and only +newly discovered site-specific keywords will be appended there. - -Y - inhibits full ${filename}.${extension} brute-force: the scanner - will only brute-force one component at a time. This greatly - improves scan times, but reduces coverage. Modes 2 and 3 - shown above make use of this flag. +---------------------------- +More about dictionary design +---------------------------- - -R num - purges all dictionary entries that had no non-404 hits for - the last scans. Prevents dictionary creep in repeated - assessments, but use with care! 
- ------------------------------ -More about dictionary design: ------------------------------ - -Each dictionary may consist of a number of extensions, and a number of +Each dictionary may consist of a number of extensions, and a number of "regular" keywords. Extensions are considered just a special subset of the keyword list. @@ -103,29 +156,74 @@ You can create custom dictionaries, conforming to this format: type hits total_age last_age keyword -...where 'type' is either 'e' or 'w' (extension or wordlist); 'hits' is the -total number of times this keyword resulted in a non-404 hit in all previous -scans; 'total_age' is the number of scan cycles this word is in the dictionary; -'last_age' is the number of scan cycles since the last 'hit'; and 'keyword' is -the actual keyword. +...where 'type' is either 'e' or 'w' (extension or keyword), followed by a +qualifier (explained below); 'hits' is the total number of times this keyword +resulted in a non-404 hit in all previous scans; 'total_age' is the number of scan +cycles this word is in the dictionary; 'last_age' is the number of scan cycles +since the last 'hit'; and 'keyword' is the actual keyword. -Do not duplicate extensions as keywords - if you already have 'html' as an 'e' -entry, there is no need to also create a 'w' one. +Qualifiers alter the meaning of an entry in the following way: -There must be no empty or malformed lines, comments in the wordlist file. -Extension keywords must have no leading dot (e.g., 'exe', not '.exe'), and all -keywords should be NOT url-encoded (e.g., 'Program Files', not -'Program%20Files'). No keyword should exceed 64 characters. + wg - generic keyword that is not associated with any specific server-side + technology. Examples include 'backup', 'accounting', or 'logs'. These + will be indiscriminately combined with every known extension (e.g., + 'backup.php') during the fuzzing process. -If you omit -W in the command line, 'skipfish.wl' is assumed. This file does -not exist by default; this is by design. + ws - technology-specific keyword that are unlikely to have a random + extension; for example, with 'cgi-bin', testing for 'cgi-bin.php' is + usually a waste of time. Keywords tagged this way will be combined only + with a small set of technology-agnostic extensions - e.g., 'cgi-bin.old'. -The scanner will automatically learn new keywords and extensions based on any -links discovered during the scan; and will also analyze pages and extract -words to use as keyword candidates. + NOTE: Technology-specific keywords that in the real world, are always + paired with a single, specific extension, should be combined with said + extension in the 'ws' entry itself, rather than trying to accommodate + them with 'wg' rules. For example, 'MANIFEST.MF' is OK. -Tread carefully; poor wordlists are one of the reasons why some web security -scanners perform worse than expected. You will almost always be better off -narrowing down or selectively extending the supplied set (and possibly -contributing back your changes upstream!), than importing a giant wordlist -scored elsewhere. + eg - generic extension that is not specific to any well-defined technology, + or may pop-up in administrator- or developer-created auxiliary content. + Examples include 'bak', 'old', 'txt', or 'log'. + + es - technology-specific extension, such as 'php', or 'cgi', that are + unlikely to spontaneously accompany random 'ws' keywords. 
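+
+To make the qualifiers concrete, a few hand-written entries built from the
+keywords mentioned above might look like this (the hit and age counters are
+simply initialized to 1, as in the bundled dictionaries):
+
+  wg 1 1 1 backup
+  ws 1 1 1 cgi-bin
+  ws 1 1 1 MANIFEST.MF
+  eg 1 1 1 bak
+  es 1 1 1 php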
+
+Skipfish leverages this distinction by only trying the following brute-force
+combinations:
+
+  /some/path/wg_keyword ('index')
+  /some/path/ws_keyword ('cgi-bin')
+  /some/path/wg_extension ('old')
+  /some/path/ws_extension ('php')
+
+  /some/path/wg_keyword.wg_extension ('index.old')
+  /some/path/wg_keyword.ws_extension ('index.php')
+
+  /some/path/ws_keyword.wg_extension ('cgi-bin.old')
+
+To decide between 'wg' and 'ws', consider if you are likely to ever encounter
+files such as ${this_word}.php or ${this_word}.class. If not, tag the keyword
+as 'ws'.
+
+Similarly, to decide between 'eg' and 'es', think about the possibility of
+encountering cgi-bin.${this_ext} or formmail.${this_ext}. If it seems unlikely,
+choose 'es'.
+
+For your convenience, all legacy keywords and extensions, as well as any entries
+detected automatically, will be stored in the dictionary with a '?' qualifier.
+This is equivalent to 'g', and is meant to assist the user in reviewing and
+triaging any automatically acquired dictionary data.
+
+Other notes about dictionaries:
+
+  - Do not duplicate extensions as keywords - if you already have 'html' as an
+    'e' entry, there is no need to also create a 'w' one.
+
+  - There must be no empty or malformed lines, or comments, in the wordlist
+    file. Extension keywords must have no leading dot (e.g., 'exe', not '.exe'),
+    and all keywords should NOT be url-encoded (e.g., 'Program Files', not
+    'Program%20Files'). No keyword should exceed 64 characters.
+
+  - Tread carefully; poor wordlists are one of the reasons why some web security
+    scanners perform worse than expected. You will almost always be better off
+    narrowing down or selectively extending the supplied set (and possibly
+    contributing back your changes upstream!), than importing a giant wordlist
+    scored elsewhere.
diff --git a/dictionaries/complete.wl b/dictionaries/complete.wl
index 4c9a4a9..42e753f 100644
--- a/dictionaries/complete.wl
+++ b/dictionaries/complete.wl
@@ -16,6 +16,7 @@ e 1 1 1 class
 e 1 1 1 cnf
 e 1 1 1 conf
 e 1 1 1 config
+e 1 1 1 core
 e 1 1 1 cpp
 e 1 1 1 cs
 e 1 1 1 csproj
@@ -587,7 +588,6 @@ w 1 1 1 cookies
 w 1 1 1 copies
 w 1 1 1 copy
 w 1 1 1 copyright
-w 1 1 1 core
 w 1 1 1 corp
 w 1 1 1 corpo
 w 1 1 1 corporate
diff --git a/dictionaries/extensions-only.wl b/dictionaries/extensions-only.wl
index 335f566..732f894 100644
--- a/dictionaries/extensions-only.wl
+++ b/dictionaries/extensions-only.wl
@@ -16,6 +16,7 @@ e 1 1 1 class
 e 1 1 1 cnf
 e 1 1 1 conf
 e 1 1 1 config
+e 1 1 1 core
 e 1 1 1 cpp
 e 1 1 1 cs
 e 1 1 1 csproj
diff --git a/dictionaries/medium.wl b/dictionaries/medium.wl
index 94a530b..db94966 100644
--- a/dictionaries/medium.wl
+++ b/dictionaries/medium.wl
@@ -11,6 +11,7 @@ e 1 1 1 class
 e 1 1 1 cnf
 e 1 1 1 conf
 e 1 1 1 config
+e 1 1 1 core
 e 1 1 1 cpp
 e 1 1 1 csproj
 e 1 1 1 csv
@@ -556,7 +557,6 @@ w 1 1 1 cookies
 w 1 1 1 copies
 w 1 1 1 copy
 w 1 1 1 copyright
-w 1 1 1 core
 w 1 1 1 corp
 w 1 1 1 corpo
 w 1 1 1 corporate
diff --git a/http_client.c b/http_client.c
index fcebb4e..a79d964 100644
--- a/http_client.c
+++ b/http_client.c
@@ -565,6 +565,22 @@ void tokenize_path(u8* str, struct http_request* req, u8 add_slash) {
       value = url_decode_token(cur + !first_el, next_seg - !first_el, 0);
     }
 
+    /* If the extracted segment is just '.' or '..', but is followed by
+       something other than '/', skip one separator.
*/ + + if (!name && cur[next_seg] && cur[next_seg] != '/' && + (!strcmp((char*)value, ".") || !strcmp((char*)value, ".."))) { + + next_seg = strcspn((char*)cur + next_seg + 1, "/;,!$?#") + next_seg + 1, + + ck_free(name); + ck_free(value); + + value = url_decode_token(cur + !first_el, next_seg - !first_el, 0); + + } + + switch (first_el ? '/' : *cur) { case ';': set_value(PARAM_PATH_S, name, value, -1, &req->par); break; diff --git a/http_client.h b/http_client.h index 9e0ceca..2efce7a 100644 --- a/http_client.h +++ b/http_client.h @@ -97,12 +97,12 @@ struct http_request { u16 port; /* Port number to connect to */ u8* orig_url; /* Copy of the original URL */ - struct param_array par; /* Parameters, headers, cookies */ struct pivot_desc *pivot; /* Pivot descriptor */ u32 user_val; /* Can be used freely */ + u8 with_ext; /* Extension-based probe? */ u8 (*callback)(struct http_request*, struct http_response*); /* Callback to invoke when done */ diff --git a/report.c b/report.c index c4c0bb7..093c2ef 100644 --- a/report.c +++ b/report.c @@ -303,7 +303,7 @@ static void compute_counts(struct pivot_desc* pv) { /* Helper to JS-escape data. Static buffer, will be destroyed on subsequent calls. */ -static inline u8* js_escape(u8* str) { +static inline u8* js_escape(u8* str, u8 sp) { u32 len; static u8* ret; u8* opos; @@ -316,7 +316,7 @@ static inline u8* js_escape(u8* str) { opos = ret = __DFL_ck_alloc(len * 4 + 1); while (len--) { - if (*str > 0x1f && *str < 0x80 && !strchr("<>\\'\"", *str)) { + if (*str > (sp ? 0x20 : 0x1f) && *str < 0x80 && !strchr("<>\\'\"", *str)) { *(opos++) = *(str++); } else { sprintf((char*)opos, "\\x%02x", *(str++)); @@ -343,7 +343,7 @@ static void output_scan_info(u64 scan_time, u32 seed) { if (!f) PFATAL("Cannot open 'summary.js'"); fprintf(f, "var sf_version = '%s';\n", VERSION); - fprintf(f, "var scan_date = '%s';\n", js_escape(ct)); + fprintf(f, "var scan_date = '%s';\n", js_escape(ct, 0)); fprintf(f, "var scan_seed = '0x%08x';\n", seed); fprintf(f, "var scan_ms = %llu;\n", (long long)scan_time); @@ -370,12 +370,12 @@ static void describe_res(FILE* f, struct http_response* res) { case STATE_OK: fprintf(f, "'fetched': true, 'code': %u, 'len': %u, 'decl_mime': '%s', ", res->code, res->pay_len, - js_escape(res->header_mime)); + js_escape(res->header_mime, 0)); fprintf(f, "'sniff_mime': '%s', 'cset': '%s'", res->sniffed_mime ? res->sniffed_mime : (u8*)"[none]", js_escape(res->header_charset ? res->header_charset - : res->meta_charset)); + : res->meta_charset, 0)); break; case STATE_DNSERR: @@ -514,18 +514,18 @@ static void output_crawl_tree(struct pivot_desc* pv) { fprintf(f, " { 'dupe': %s, 'type': %u, 'name': '%s%s", pv->child[i]->dupe ? "true" : "false", - pv->child[i]->type, js_escape(pv->child[i]->name), + pv->child[i]->type, js_escape(pv->child[i]->name, 0), (pv->child[i]->fuzz_par == -1 || pv->child[i]->type == PIVOT_VALUE) ? (u8*)"" : (u8*)"="); fprintf(f, "%s', 'dir': '%s', 'linked': %d, ", (pv->child[i]->fuzz_par == -1 || pv->child[i]->type == PIVOT_VALUE) ? 
(u8*)"" : - js_escape(pv->child[i]->req->par.v[pv->child[i]->fuzz_par]), + js_escape(pv->child[i]->req->par.v[pv->child[i]->fuzz_par], 0), tmp, pv->child[i]->linked); p = serialize_path(pv->child[i]->req, 1, 1); - fprintf(f, "'url': '%s', ", js_escape(p)); + fprintf(f, "'url': '%s', ", js_escape(p, 0)); ck_free(p); describe_res(f, pv->child[i]->res); @@ -557,7 +557,7 @@ static void output_crawl_tree(struct pivot_desc* pv) { fprintf(f, " { 'severity': %u, 'type': %u, 'extra': '%s', ", PSEV(pv->issue[i].type) - 1, pv->issue[i].type, - pv->issue[i].extra ? js_escape(pv->issue[i].extra) : (u8*)""); + pv->issue[i].extra ? js_escape(pv->issue[i].extra, 0) : (u8*)""); describe_res(f, pv->issue[i].res); @@ -658,7 +658,7 @@ static void output_summary_views() { save_req_res(m_samp[i].req[c], m_samp[i].res[c], 0); if (chdir("..")) PFATAL("chdir unexpectedly fails!"); fprintf(f, " { 'url': '%s', 'dir': '%s/%s', 'linked': %d, 'len': %d" - " }%s\n", js_escape(p), tmp, tmp2, + " }%s\n", js_escape(p, 0), tmp, tmp2, m_samp[i].req[c]->pivot->linked, m_samp[i].res[c]->pay_len, (c == use_samp - 1) ? " ]" : ","); ck_free(p); @@ -693,9 +693,9 @@ static void output_summary_views() { if (chdir((char*)tmp2)) PFATAL("chdir unexpectedly fails!"); save_req_res(i_samp[i].i[c]->req, i_samp[i].i[c]->res, 0); if (chdir("..")) PFATAL("chdir unexpectedly fails!"); - fprintf(f, " { 'url': '%s', ", js_escape(p)); + fprintf(f, " { 'url': '%s', ", js_escape(p, 0)); fprintf(f, "'extra': '%s', 'dir': '%s/%s' }%s\n", - i_samp[i].i[c]->extra ? js_escape(i_samp[i].i[c]->extra) : + i_samp[i].i[c]->extra ? js_escape(i_samp[i].i[c]->extra, 0) : (u8*)"", tmp, tmp2, (c == use_samp - 1) ? " ]" : ","); ck_free(p); @@ -763,10 +763,12 @@ static void save_pivots(FILE* f, struct pivot_desc* cur) { u8* url = serialize_path(cur->req, 1, 1); fprintf(f, "%s %s ", cur->req->method ? cur->req->method : (u8*)"GET", - js_escape(url)); + js_escape(url, 0)); ck_free(url); + fprintf(f, "name=%s ", js_escape(cur->name, 1)); + switch (cur->type) { case PIVOT_SERV: fprintf(f, "type=serv "); break; case PIVOT_DIR: fprintf(f, "type=dir "); break; @@ -785,7 +787,8 @@ static void save_pivots(FILE* f, struct pivot_desc* cur) { } if (cur->res) - fprintf(f, "dup=%u %scode=%u len=%u notes=%u\n", cur->dupe, + fprintf(f, "dup=%u %s%scode=%u len=%u notes=%u\n", cur->dupe, + cur->bogus_par ? "bogus " : "", cur->missing ? 
"returns_404 " : "", cur->res->code, cur->res->pay_len, cur->issue_cnt); else diff --git a/skipfish.1 b/skipfish.1 index 9cf2e31..e047838 100644 --- a/skipfish.1 +++ b/skipfish.1 @@ -83,10 +83,6 @@ do not parse HTML and other documents to find new links .B \-o dir write output to specified directory (required) .TP -.B \-J -be less noisy about MIME / charset mismatches on probably -static content -.TP .B \-M log warnings about mixed content or non-SSL password forms .TP @@ -147,6 +143,9 @@ timeout on idle HTTP connections (default: 10 s) .TP .B \-s s_limit response size limit (default: 200000 B) +.TP +.B \-e +do not keep binary responses for reporting .TP .B \-h, \-\-help diff --git a/skipfish.c b/skipfish.c index 496bd74..30cf589 100644 --- a/skipfish.c +++ b/skipfish.c @@ -239,7 +239,7 @@ int main(int argc, char** argv) { u32 loop_cnt = 0, purge_age = 0, seed; u8 dont_save_words = 0, show_once = 0, be_quiet = 0, display_mode = 0, has_fake = 0; - u8 *wordlist = (u8*)DEF_WORDLIST, *output_dir = NULL; + u8 *wordlist = NULL, *output_dir = NULL; struct termios term; struct timeval tv; @@ -421,7 +421,12 @@ int main(int argc, char** argv) { break; case 'W': - wordlist = (u8*)optarg; + if (optarg[0] == '+') load_keywords((u8*)optarg + 1, 1, 0); + else { + if (wordlist) + FATAL("Only one -W parameter permitted (unless '+' used)."); + wordlist = (u8*)optarg; + } break; case 'b': @@ -526,7 +531,9 @@ int main(int argc, char** argv) { if (max_connections < max_conn_host) max_connections = max_conn_host; - load_keywords((u8*)wordlist, purge_age); + if (!wordlist) wordlist = (u8*)DEF_WORDLIST; + + load_keywords(wordlist, 0, purge_age); /* Schedule all URLs in the command line for scanning. */