diff --git a/.codespellignore b/.codespellignore index 909a080..b75613e 100644 --- a/.codespellignore +++ b/.codespellignore @@ -1 +1,2 @@ homogenous +ccompiler diff --git a/src/demystify/demystify.py b/src/demystify/demystify.py index 508fdc4..18a26ec 100644 --- a/src/demystify/demystify.py +++ b/src/demystify/demystify.py @@ -38,9 +38,11 @@ from .denylist_template import denylist_template from .libs import version +from .libs.AnalysisResultsClass import AnalysisResults from .libs.DemystifyAnalysisClass import AnalysisError, DemystifyAnalysis from .libs.HandleDenylistClass import HandleDenylist from .libs.IdentifyDatabase import IdentifyDB +from .libs.outputhandlers import noclasshtml as nc # Custom output handlers from .libs.outputhandlers.htmloutputclass import FormatAnalysisHTMLOutput @@ -109,7 +111,12 @@ def _handle_denylist_config() -> tuple: def handle_output( - analysis_results, txtout=False, rogues=False, heroes=False, rogueconfig=None + analysis_results: AnalysisResults, + txtout: bool = False, + legacy: bool = False, + rogues: bool = False, + heroes: bool = False, + rogueconfig: ConfigParser = None, ): """Handle output from the analysis. @@ -132,19 +139,29 @@ def handle_output( logger.info("outputting text report") textoutput = FormatAnalysisTextOutput(analysis_results) print(textoutput.printTextResults()) - elif rogues is True: + return + + if rogues is True: logger.info(ROGUES_TEXT) rogueoutput = rogueoutputclass(analysis_results, rogueconfig) rogueoutput.printTextResults() - elif heroes is True: + return + + if heroes is True: logger.info(ROGUES_TEXT) rogueoutput = rogueoutputclass(analysis_results, rogueconfig, heroes) rogueoutput.printTextResults() - else: + return + + if legacy: logger.info("Outputting HTML report") htmloutput = FormatAnalysisHTMLOutput(analysis_results) print(htmloutput.printHTMLResults()) + else: + htm = nc.html(analysis_results) + print(htm) + def analysis_from_database( database_connection=sqlite3.Connection, @@ -216,9 +233,14 @@ def analysis_from_csv( database_connection = sqlitefid.identify_and_process_input(format_report) if not database_connection: logger.error("no database result: %s", database_connection) - return "ensure that the input file is one of the supported DROID CSV, or Siegfried YAML types." + logger.error( + "ensure that the input file is one of the supported DROID CSV, or Siegfried YAML types." + ) + sys.exit(1) if not analyze: - logger.error("analysis flag is not set: %s", analyze) + logger.warning( + "analysis flag is not set: '%s' only a database will be created", analyze + ) return if not label: label = get_report_label(format_report) @@ -262,10 +284,21 @@ def main(): default=False, ) parser.add_argument( - "--txt", "--text", help="Output text instead of HTML", action="store_true" + "--txt", + "--text", + help="Output text instead of HTML", + action="store_true", ) parser.add_argument( - "--denylist", help="Use configured denylist", action="store_true" + "--legacy", + "-l", + help="Output legacy HTML", + action="store_true", + ) + parser.add_argument( + "--denylist", + help="Use configured denylist", + action="store_true", ) parser.add_argument( "--rogues", @@ -341,7 +374,12 @@ def main(): ) if analysis: handle_output( - analysis.analysis_results, args.txt, args.rogues, args.heroes, rogueconfig + analysis_results=analysis.analysis_results, + txtout=args.txt, + legacy=args.legacy, + rogues=args.rogues, + heroes=args.heroes, + rogueconfig=rogueconfig, ) output_time(start_time) logger.info("demystify: ...analysis complete") diff --git a/src/demystify/i18n/internationalstrings.py b/src/demystify/i18n/internationalstrings.py index c7bf2a4..6b9a9f6 100644 --- a/src/demystify/i18n/internationalstrings.py +++ b/src/demystify/i18n/internationalstrings.py @@ -12,7 +12,7 @@ class AnalysisStringsEN: REPORT_TOOL = "Analysis Tool" NAMESPACES = "Namespaces Used" - REPORT_MORE_INFORMATION = "More Detail:" + REPORT_MORE_INFORMATION = "More Detail" COUNT_TEXT = " Counts are shown for each entry in round () brackets." @@ -315,8 +315,8 @@ class AnalysisStringsEN: ) SUMMARY_DESC_EXTENSION_ID = ( - 'Files that can only be identified by their extension (e.g. ".DOC" might be a Microsoft Word ' - 'file, ".MP3" might be an audio file) This is a sub-set of the "Total unidentified files".' + "Files that can only be identified by their extension (e.g. '.DOC' might be a Microsoft Word " + "file, '.MP3' might be an audio file) This is a sub-set of the 'Total unidentified files'." ) SUMMARY_DESC_EXTENSION_MISMATCH = "This is the total number of cases where the extension used does not match with the file signature." diff --git a/src/demystify/libs/DemystifyAnalysisClass.py b/src/demystify/libs/DemystifyAnalysisClass.py index 50b7248..b54f10d 100644 --- a/src/demystify/libs/DemystifyAnalysisClass.py +++ b/src/demystify/libs/DemystifyAnalysisClass.py @@ -232,7 +232,7 @@ def list_duplicate_files_from_hash(self): # There is a potential or empty checksums in the database, # either through incorrectly writing the data or rogue # datasets, avoid processing and outputting those here. - if checksum == "": + if checksum in ("", None): continue self.analysis_results.totalHASHduplicates = ( diff --git a/src/demystify/libs/outputhandlers/noclasshtml.py b/src/demystify/libs/outputhandlers/noclasshtml.py new file mode 100644 index 0000000..54b7039 --- /dev/null +++ b/src/demystify/libs/outputhandlers/noclasshtml.py @@ -0,0 +1,1302 @@ +# -*- coding: utf-8 -*- + +import html as html_lib +import logging +import re +from typing import Final + +try: + from src.demystify.libs import DemystifyAnalysisClass + from src.demystify.libs.AnalysisResultsClass import AnalysisResults +except ModuleNotFoundError: + # Needed to run from root dir. + from demystify.libs import DemystifyAnalysisClass + from demystify.libs.AnalysisResultsClass import AnalysisResults + + +try: + from i18n.internationalstrings import AnalysisStringsEN as strings +except ModuleNotFoundError: + try: + from src.demystify.i18n.internationalstrings import AnalysisStringsEN as strings + except ModuleNotFoundError: + from demystify.i18n.internationalstrings import AnalysisStringsEN as strings + +logger = logging.getLogger(__name__) + + +NONE_REPLACE_DEBUG = "Replacing 'None': A field in the database is null because there is no data, replacing at the presentation later..." + +contrast_switch: Final[ + str +] = """ + + +

+ +

+""" + + +def html_header(htm_string: str, analysis_results: AnalysisResults): + """Output the HTML header.""" + + htm_string = make_text(htm_string=htm_string, text="") + htm_string = make_text(htm_string=htm_string, text="") + htm_string = make_text(htm_string=htm_string, text="") + if analysis_results.tooltype != "droid": + htm_string = make_text( + htm_string=htm_string, text=f"{strings.REPORT_TITLE_SF}" + ) + else: + htm_string = make_text( + htm_string=htm_string, text=f"{strings.REPORT_TITLE_DR}" + ) + + htm_string = make_text( + htm_string=htm_string, + text="", + ) + htm_string = make_text( + htm_string=htm_string, + text="", + ) + + style = """ + + + + """ + htm_string = make_text(htm_string=htm_string, text=style) + htm_string = make_text(htm_string=htm_string, text="") + return htm_string + + +def remove_none(old_list: list, format_unknown: bool = False) -> list: + """Remove `None` from the given list of tuples.""" + new_list = [] + for item in old_list: + if item[1] == "None": + if not format_unknown: + new_list.append((item[0], "")) + continue + new_list.append((item[0], "Format name is unknown")) + else: + new_list.append((item[0], item[1])) + return new_list + + +def make_text(htm_string: str, text: str): + res = f"{htm_string}" + if isinstance(text, list): + for txt in text: + res = f"{res}{txt}
\n" + res = f"{res}\n" + return res + res = f"{res}{text}\n" + return res + + +def make_summary(text: str): + """Todo...""" + return f"
\n\n{strings.REPORT_MORE_INFORMATION}\n\n

\n{text}\n

\n
\n" + + +def make_list_item(title, content, value): + """Todo...""" + return f'
  • {content}: {value}
  • \n' + + +def printHTMLResults(self): + self.generateHTML() + return self.htmloutput + + +def split_id_results(puid): + identifier = puid[0].rsplit("(", 1)[0] + namespace = puid[0].split(" ", 1)[0] + patt = re.compile("(x-)?fmt\\/[0-9]+") # noqa + p = re.search(patt, identifier) + if p is not None: + p = p.span() + identifier = identifier[p[0] : p[1]] + else: + identifier = identifier.replace(namespace, "").strip() + identifier = identifier.split(",", 1)[0] + count = puid[0].rsplit("(", 1)[1].replace(")", "") + if ", None" in puid[0]: + logging.debug(NONE_REPLACE_DEBUG) + formatname = ( + puid[0] + .replace(namespace, "") + .replace("({})".format(count), "") + .replace("{}, ".format(identifier), "") + .replace(", None", "") + .strip(", ") + ) + if formatname == "": + formatname = identifier + return namespace, identifier, formatname, count + + +def output_meter(value: int, min: int, max: int) -> str: + """Todo...""" + return f" METER VISUALISATION AVAILABLE IN GOOGLE CHROME " + + +def make_offset_text(title: str, statistic: list): + """Generate offset text. + + Data input should look as follows: + + * ['id','basis','filename','filesize','offset'] + """ + + example = f"

    {statistic[0]}, {statistic[1]} e.g. {statistic[2]} filesize: {statistic[3]}. {statistic[4]} bytes
    \n" + return f"{title}: {example}" + + +def identifier_chart( + analysis_results: AnalysisResults, count_list: list, reverse_list=True +): + """Todo...""" + count_list.sort(key=lambda tup: tup[1], reverse=reverse_list) + htm_string = f"{output_heading(strings.HEADING_FREQUENCY_PUIDS_IDENTIFIED, strings.HEADING_DESC_FREQUENCY_PUIDS_IDENTIFIED)}" + htm_string = ( + f"{htm_string}" + f"" + f"" + f"" + ) + for sig in count_list: + htm_string = f"{htm_string}" + meter = output_meter(sig[1], 0, analysis_results.filecount) + htm_string = f"{htm_string}{meter}" + htm_string = f"{htm_string}" + htm_string = f"{htm_string}
    {strings.COLUMN_HEADER_VALUES_ID}{strings.COLUMN_HEADER_VALUES_COUNT}{strings.COLUMN_HEADER_VALUES_YEAR}
    " + if "fmt/" in sig[0]: + url = f"{sig[0]}" + htm_string = f"\n{htm_string}{url}\n" + else: + htm_string = make_text(htm_string, sig[0]) + htm_string = f"{htm_string}{str(sig[1]).strip()}
    " + return htm_string + + +def namespace_stats(analysis_results: AnalysisResults): + """Todo...""" + # e.g.{'binary method count': '57', 'text method count': '37', 'namespace title': 'freedesktop.org', + # 'filename method count': '45', 'namespace details': 'freedesktop.org.xml'} + + nsdatalist = analysis_results.nsdatalist + signaturefrequency = analysis_results.signatureidentifiedfrequency + + try: + demystify = DemystifyAnalysisClass.DemystifyBase() + except DemystifyAnalysisClass.AnalysisError: + logging.error( + "There shouldn't be a new DemystifyAnalysis object here: not performing NS work..." + ) + return + + htm_string = "" + + for ns in nsdatalist: + signatureids = signaturefrequency + nstitle = ns[demystify.NS_CONST_TITLE] + identified = ns[demystify.NS_CONST_BINARY_COUNT] + xmlid = ns[demystify.NS_CONST_XML_COUNT] + text = ns[demystify.NS_CONST_TEXT_COUNT] + filename = ns[demystify.NS_CONST_FILENAME_COUNT] + ext = ns[demystify.NS_CONST_EXTENSION_COUNT] + unidentified = analysis_results.filecount - identified + percent_not = demystify.calculatePercent( + analysis_results.filecount, unidentified + ) + percent_ok = demystify.calculatePercent(analysis_results.filecount, identified) + + list_val = make_list_item( + strings.HEADING_DESC_NAMESPACE, + f"{strings.HEADING_NAMESPACE}", + f"{nstitle} ({ns[demystify.NS_CONST_DETAILS]})", + ) + + htm_string = f"{htm_string}" + + nslist = [] + for idrow in signatureids: + if idrow[0] == nstitle: + nslist.append(idrow[1:]) + table = output_table( + listing=nslist, + heading=None, + description=None, + count=True, + maxcolumns=2, + ) + htm_string = f"{htm_string}{table}" + return htm_string + + +def remove_namespace_id(old_list: str): + """Todo...""" + new_list = [] + for item in old_list: + new_list.append(str(item[0])) + return new_list + + +def output_heading(heading, description): + """Output an analysis heading and its detailed description.""" + htm_string = f"

    {heading}

    " + htm_string = f"{htm_string}{make_summary(description)}\n" + return htm_string + + +def output_table( + listing: list, + heading: str, + description: str, + count: bool = True, + maxcolumns: int = 5, +): + htm_string = "" + if heading is not None and description is not None: + htm_string = output_heading(heading, description) + list_len = len(listing) + rows = int(list_len / 5) + if list_len % 5 > 0: + rows = rows + 1 + rowno = 0 + colno = 0 + htm_string = f"{htm_string}" + for item in listing: + value = "" + if ", None" in item: + logging.debug( + "replacing `None` a field in the database is null", NONE_REPLACE_DEBUG + ) + item = item.replace(", None", "") + if isinstance(item, str): + value = item + elif len(item) == 1: + value = str(item[0]) + elif count is False: + if item[1] == "": + value = f"{item[0]}, Format name is unknown" + else: + value = f"{item[0]}, {item[1]}" + else: + value = f"{item[0]} ({item[1]})" + if colno < maxcolumns: + htm_string = f"{htm_string}" + colno = colno + 1 + else: + rowno = rowno + 1 + htm_string = f"{htm_string}" + htm_string = f"{htm_string}" + colno = 1 + + if not colno < maxcolumns: + htm_string = f"{htm_string}
    {value}
    {value}
    " + return htm_string + for td in range(maxcolumns - colno): + htm_string = f"{htm_string} " + htm_string = f"{htm_string}" + return htm_string + + +def report_metadata(analysis_results: AnalysisResults): + """Output report metadata.""" + htm_string = "" + if analysis_results.tooltype != "droid": + htm_string = make_text( + htm_string, + f"

    {strings.REPORT_TITLE_SF}

    \n", + ) + else: + htm_string = make_text( + htm_string, + f"

    {strings.REPORT_TITLE_DR}

    \n", + ) + htm_string = make_text( + htm_string, + f"{strings.REPORT_VERSION}: {analysis_results.__version__()}
    \n", + ) + + htm_string = make_text( + htm_string, + f"{strings.REPORT_FILE}: {analysis_results.filename}
    \n", + ) + + htm_string = make_text( + htm_string, + f"{strings.REPORT_TOOL}: {analysis_results.tooltype}
    \n", + ) + + htm_string = make_text( + htm_string, + f"{strings.NAMESPACES}: {analysis_results.namespacecount}
    \n", + ) + + return htm_string + + +def report_distance_scanned(analysis_results: AnalysisResults) -> str: + """For Siegfried and to understand how much data we're scanning + when identifying files, we can output the number of bytes + read. This can be useful for signature development.""" + if not analysis_results.bof_distance: + return "" + htm_string = make_offset_text( + title=strings.SUMMARY_DISTANCE_BOF, statistic=analysis_results.bof_distance + ) + if not analysis_results.eof_distance: + return htm_string + offset_text = make_offset_text( + title=strings.SUMMARY_DISTANCE_EOF, statistic=analysis_results.eof_distance + ) + htm_string = f"{htm_string}
    {offset_text}" + return htm_string + + +def non_droid_id_type_summary(analysis_results: AnalysisResults): + """Todo...""" + htm_string = "" + htm_string = make_text( + htm_string, + make_list_item( + strings.SUMMARY_DESC_XML_ID, + strings.SUMMARY_XML_ID, + analysis_results.xmlidfilecount, + ), + ) + htm_string = make_text( + htm_string, + make_list_item( + strings.SUMMARY_DESC_TEXT_ID, + strings.SUMMARY_TEXT_ID, + analysis_results.textidfilecount, + ), + ) + htm_string = make_text( + htm_string, + make_list_item( + strings.SUMMARY_DESC_FILENAME_ID, + strings.SUMMARY_FILENAME_ID, + analysis_results.filenameidfilecount, + ), + ) + return htm_string + + +def non_droid_id_type_detailed_summary(analysis_results: AnalysisResults): + """Todo...""" + htm_string = "" + htm_string = make_text( + htm_string, + make_list_item( + strings.SUMMARY_DESC_OTHER_ID_COUNT, + strings.SUMMARY_OTHER_ID_COUNT, + analysis_results.distinctOtherIdentifiers, + ), + ) + htm_string = make_text( + htm_string, + make_list_item( + strings.SUMMARY_DESC_XML_ID_COUNT, + strings.SUMMARY_XML_ID_COUNT, + analysis_results.distinctXMLIdentifiers, + ), + ) + htm_string = make_text( + htm_string, + make_list_item( + strings.SUMMARY_DESC_TEXT_ID_COUNT, + strings.SUMMARY_TEXT_ID_COUNT, + analysis_results.distinctTextIdentifiers, + ), + ) + htm_string = make_text( + htm_string, + make_list_item( + strings.SUMMARY_DESC_FILENAME_ID_COUNT, + strings.SUMMARY_FILENAME_ID_COUNT, + analysis_results.distinctFilenameIdentifiers, + ), + ) + return htm_string + + +def report_summary(analysis_results: AnalysisResults) -> str: + """Todo...""" + + htm_string = "" + + htm_string = make_text( + htm_string, + "

    {}

    ".format(strings.REPORT_SUMMARY), + ) + + htm_string = make_text( + htm_string, + "", + ) + + return htm_string + + +def report_size(analysis_results: AnalysisResults) -> str: + """Output the size of the collection we have profiled. Size is + calculated as: MiB/MB = (2^1024)*2. + """ + htm_string = "" + htm_string = output_heading(strings.HEADING_SIZE, strings.HEADING_DESC_SIZE) + htm_string = make_text( + htm_string, + f"{float(analysis_results.collectionsize)} bytes | {round(float(float(analysis_results.collectionsize) / (1048576)), 1)} MiB/MB (Megabytes)", + ) + return htm_string + + +def report_identifiers(analysis_results: AnalysisResults) -> str: + """Todo...""" + if analysis_results.signatureidentifiers is None: + return "" + signature_id_list = [] + count_list = [] + for puid in analysis_results.signatureidentifiers: + namespace, identifier, formatname, count = split_id_results(puid) + count_list.append((identifier, int(count))) + signature_id_list.append((namespace, identifier, formatname, int(count))) + logger.info(len(count_list)) + chart = identifier_chart(analysis_results, count_list) + return chart + + +def report_format_classification(analysis_results: AnalysisResults) -> str: + """TOdo....""" + if not analysis_results.classifications_count > 0: + return "" + htm_string = output_heading( + strings.HEADING_CLASSIFICATION, strings.HEADING_DESC_CLASSIFICATION + ) + htm_string = ( + f"{htm_string}" + f"" + f"" + ) + for format_classification in analysis_results.classifications: + try: + classification = format_classification[0] + except IndexError as err: + logger.error("cannot access format classification: %s", err) + return "" + if classification.lower() == "none": + classification = "No format type classification" + htm_string = f"{htm_string}" + meter = output_meter(format_classification[1], 0, analysis_results.filecount) + htm_string = f"{htm_string}{meter}" + htm_string = f"{htm_string}
    {strings.COLUMN_HEADER_VALUES_CLASSIFICATION}{strings.COLUMN_HEADER_VALUES_COUNT}
    " + htm_string = f"{htm_string}{classification}" + htm_string = f"{htm_string}{format_classification[1]}
    \n" + return htm_string + + +def report_date_range(analysis_results: AnalysisResults) -> str: + """Todo...""" + if analysis_results.dateFrequency is None: + return "" + # Date Ranges + htm_string = output_heading( + strings.HEADING_DATE_RANGE, strings.HEADING_DESC_DATE_RANGE + ) + htm_string = f"{htm_string}" + + for dates in analysis_results.dateFrequency: + htm_string = f"{htm_string}" + meter = output_meter(dates[1], 0, analysis_results.filecount) + htm_string = f"{htm_string}{meter}" + htm_string = f"{htm_string}
    {strings.COLUMN_HEADER_VALUES_YEAR}{strings.COLUMN_HEADER_VALUES_COUNT} 
    " + htm_string = f"{htm_string}{dates[0]}" + htm_string = f"{htm_string}{dates[1]}
    " + return htm_string + + +def report_aggregated_ff_sigs(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if analysis_results.signatureidentifiers is None: + return "" + signature_id_list = [] + for puid in analysis_results.signatureidentifiers: + namespace, identifier, formatname, count = split_id_results(puid) + signature_id_list.append((namespace, identifier, formatname, int(count))) + if not signature_id_list: + return "" + # Signature identified PUIDs in collection (signature and container) + htm_string = output_heading( + strings.HEADING_AGGREGATE_BINARY_IDENTIFIED, strings.HEADING_DESC_IDENTIFIED + ) + htm_string = ( + f"{htm_string}" + f"" + f"" + f"" + f"" + ) + signature_id_list.sort(key=lambda keys: int(keys[3]), reverse=True) + # Tuple object: (namespace, identifier, format name, int(count)) + # + # For example: ('ns:pronom fmt/19, Acrobat PDF 1.5 - Portable Document Format, 1.5 (6)', 1) + # + for id_ in signature_id_list: + if "fmt/" in id_[1]: + markup = ( + f"\n" + ) + else: + markup = f"" + markup = f"{markup}" + htm_string = f"{htm_string}{markup}" + htm_string = f"{htm_string}
    {strings.COLUMN_HEADER_VALUES_ID}{strings.COLUMN_HEADER_VALUES_NAMESPACE}{strings.COLUMN_HEADER_VALUES_FORMAT}{strings.COLUMN_HEADER_VALUES_COUNT}
    \n" + f"{id_[1]}
    {id_[1]}{id_[0]}{id_[2]}{id_[3]}
    " + return htm_string + + +def report_aggregate_binary(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.binaryidentifiers: + return "" + new_list = remove_namespace_id(analysis_results.binaryidentifiers) + return output_table( + listing=new_list, + heading=strings.HEADING_BINARY_ID, + description=strings.HEADING_DESC_BINARY_ID, + count=True, + maxcolumns=1, + ) + + +def report_aggregate_xml(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.xmlidentifiers: + return "" + new_list = remove_namespace_id(analysis_results.xmlidentifiers) + return output_table( + listing=new_list, + heading=strings.HEADING_XML_ID, + description=strings.HEADING_DESC_XML_ID, + count=True, + maxcolumns=1, + ) + + +def report_aggregate_text(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.textidentifiers: + return "" + new_list = remove_namespace_id(analysis_results.textidentifiers) + return output_table( + listing=new_list, + heading=strings.HEADING_TEXT_ID, + description=strings.HEADING_DESC_TEXT_ID, + count=True, + maxcolumns=1, + ) + + +def report_aggregate_filename(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.filenameidentifiers: + return "" + new_list = remove_namespace_id(analysis_results.filenameidentifiers) + return output_table( + listing=new_list, + heading=strings.HEADING_FILENAME_ID, + description=strings.HEADING_DESC_FILENAME_ID, + count=True, + maxcolumns=1, + ) + + +def report_id_method_frequency(analysis_results: AnalysisResults) -> str: + """TODO...""" + if not analysis_results.idmethodFrequency: + return "" + return output_table( + listing=analysis_results.idmethodFrequency, + heading=strings.HEADING_ID_METHOD, + description=strings.HEADING_DESC_ID_METHOD, + ) + + +def report_extension_only(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.extensionOnlyIDList: + return "" + new_list = remove_none(analysis_results.extensionOnlyIDList) + return output_table( + listing=new_list, + heading=strings.HEADING_EXTENSION_ONLY, + description=strings.HEADING_DESC_EXTENSION_ONLY, + count=False, + maxcolumns=2, + ) + + +def report_frequency_extension_only(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.extensionOnlyIDList: + return "" + if not len(analysis_results.extensionOnlyIDList) > 0: + return "" + # Extension Only Identification + extlist = analysis_results.extensionOnlyIDList + for item in list(extlist): + if "unknown" in item[0].lower(): + extlist.remove(item) + continue + extlist = remove_none(old_list=extlist, format_unknown=True) + if analysis_results.tooltype != "droid": + # we have basis information so need a bigger table... + return output_table( + listing=extlist, + heading=strings.HEADING_FREQUENCY_EXTENSION_ONLY, + description=strings.HEADING_DESC_FREQUENCY_EXTENSION_ONLY, + count=True, + maxcolumns=2, + ) + return output_table( + listing=extlist, + heading=strings.HEADING_FREQUENCY_EXTENSION_ONLY, + description=strings.HEADING_DESC_FREQUENCY_EXTENSION_ONLY, + count=True, + maxcolumns=3, + ) + + +def report_frequency_all_extensions(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.frequencyOfAllExtensions: + return "" + return output_table( + listing=analysis_results.frequencyOfAllExtensions, + heading=strings.HEADING_FREQUENCY_EXTENSIONS_ALL, + description=strings.HEADING_DESC_FREQUENCY_EXTENSIONS_ALL, + ) + + +def report_all_unique_extensions(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.uniqueExtensionsInCollectionList: + return + return output_table( + listing=analysis_results.uniqueExtensionsInCollectionList, + heading=strings.HEADING_UNIQUE_EXTENSIONS, + description=strings.HEADING_DESC_UNIQUE_EXTENSIONS, + count=False, + ) + + +def report_multiple_identification(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.rogue_multiple_identification_list: + return "" + if not len(analysis_results.rogue_multiple_identification_list) > 0: + return "" + return output_table( + listing=analysis_results.rogue_multiple_identification_list, + heading=strings.HEADING_LIST_MULTIPLE, + description=strings.HEADING_DESC_LIST_MULTIPLE, + count=False, + maxcolumns=1, + ) + + +def report_mimetypes(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.mimetypeFrequency: + return "" + mimes = analysis_results.mimetypeFrequency + for mime in list(mimes): + if mime[0] != "": + continue + mimes.remove(mime) + return output_table( + listing=mimes, + heading=strings.HEADING_FREQUENCY_MIME, + description=strings.HEADING_DESC_FREQUENCY_MIME, + count=True, + maxcolumns=2, + ) + + +def report_results_per_identifier(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.signatureidentifiedfrequency: + return "" + if not analysis_results.nsdatalist: + return "" + header = output_heading( + strings.HEADING_NAMESPACE_SPECIFIC_STATISTICS, + strings.HEADING_DESC_NAMESPACE_SPECIFIC_STATISTICS, + ) + namespace_details = namespace_stats( + analysis_results=analysis_results, + ) + return f"{header}\n{namespace_details}\n" + + +def report_all_xml_identifiers(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.xml_identifiers: + return "" + if not len(analysis_results.xml_identifiers) > 0: + return "" + return output_table( + listing=analysis_results.xml_identifiers, + heading=strings.HEADING_XML_ID_COMPLETE, + description=strings.HEADING_DESC_XML_ID_COMPLETE, + count=True, + maxcolumns=1, + ) + + +def report_all_text_identifiers(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.text_identifiers: + return "" + if not analysis_results.text_identifiers: + return "" + return output_table( + listing=analysis_results.text_identifiers, + heading=strings.HEADING_TEXT_ID_COMPLETE, + description=strings.HEADING_DESC_TEXT_ID_COMPLETE, + count=True, + maxcolumns=1, + ) + + +def report_all_filename_identifiers(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.filename_identifiers: + return "" + if not len(analysis_results.filename_identifiers) > 0: + return "" + return output_table( + listing=analysis_results.filename_identifiers, + heading=strings.HEADING_FILENAME_ID_COMPLETE, + description=strings.HEADING_DESC_FILENAME_ID_COMPLETE, + count=True, + maxcolumns=1, + ) + + +def report_zero_byte_files(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.zerobytelist: + return "" + if not len(analysis_results.zerobytelist): + return "" + return output_table( + listing=analysis_results.zerobytelist, + heading=strings.HEADING_LIST_ZERO_BYTES, + description=strings.HEADING_DESC_LIST_ZERO_BYTES, + count=False, + maxcolumns=1, + ) + + +def report_aggregate_file_types(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.containertypeslist: + return "" + if not len(analysis_results.containertypeslist) > 0: + return "" + return output_table( + listing=analysis_results.containertypeslist, + heading=strings.HEADING_ARCHIVE_FORMATS, + description=strings.HEADING_DESC_ARCHIVE_FORMATS, + count=False, + ) + + +def report_duplicates(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.hashused: + return "" + if not analysis_results.duplicateHASHlisting: + return "" + htm_string = output_heading( + heading=f"{strings.HEADING_IDENTICAL_CONTENT} ({analysis_results.totalHASHduplicates})", + description=f"{strings.HEADING_DESC_IDENTICAL_CONTENT} ({analysis_results.totalHASHduplicates})", + ) + for dupes in analysis_results.duplicateHASHlisting: + htm_string = ( + f"{htm_string}{dupes['checksum']} Count: {dupes['count']}

    \n" + ) + htm_string = f"{htm_string}
    "
    +        for example in dupes["examples"]:
    +            try:
    +                text = f"{example.decode('UTF-8')}"
    +            except (AttributeError, UnicodeEncodeError):
    +                text = f"{example}"
    +            htm_string = f"{htm_string}{html_lib.escape(text)}\n"
    +        htm_string = f"{htm_string}

    \n" + return htm_string + + +def report_non_ascii_file_names(analysis_results: AnalysisResults) -> str: + """Todo...""" + + if not analysis_results.badFileNames: + return "" + if not len(analysis_results.badFileNames) > 0: + return "" + htm_string = output_heading( + strings.HEADING_TROUBLESOME_FILENAMES, + strings.HEADING_DESC_TROUBLESOME_FILENAMES, + ) + for fname in analysis_results.badFileNames: + fname = html_lib.escape(fname) + fname = fname.replace("File:", "File:") + htm_string = f"{htm_string}{fname}
    \n" + return htm_string + + +def report_non_ascii_directory_names(analysis_results: AnalysisResults) -> str: + """Todo....""" + if not analysis_results.badDirNames: + return "" + if not len(analysis_results.badDirNames) > 0: + return "" + htm_string = output_heading( + strings.HEADING_TROUBLESOME_DIRNAMES, + strings.HEADING_DESC_TROUBLESOME_DIRNAMES, + ) + for fname in analysis_results.badDirNames: + fname = html_lib.escape(fname) + fname = fname.replace("Directory:", "Directory:") + htm_string = f"{htm_string}{fname}
    \n" + return htm_string + + +def report_denylist_ids(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.denylist: + return "" + if not analysis_results.denylist_ids: + return "" + return output_table( + listing=analysis_results.denylist_ids, + heading=strings.HEADING_DENYLIST_IDS, + description=strings.HEADING_DESC_DENYLIST, + count=True, + maxcolumns=1, + ) + + +def report_denylist_extensions(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.denylist: + return "" + if not analysis_results.denylist_exts: + return "" + return output_table( + listing=analysis_results.denylist_exts, + heading=strings.HEADING_DENYLIST_EXTS, + description=strings.HEADING_DESC_DENYLIST, + count=True, + maxcolumns=3, + ) + + +def report_denylist_filenames(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.denylist: + return "" + if not analysis_results.denylist_filenames: + return "" + return output_table( + listing=analysis_results.denylist_filenames, + heading=strings.HEADING_DENYLIST_FILENAMES, + description=strings.HEADING_DESC_DENYLIST, + count=True, + maxcolumns=1, + ) + + +def report_denylist_directories(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.denylist: + return "" + if not analysis_results.denylist_directories: + return "" + return output_table( + listing=analysis_results.denylist_directories, + heading=strings.HEADING_DENYLIST_DIRS, + description=strings.HEADING_DESC_DENYLIST, + count=True, + maxcolumns=1, + ) + + +def report_error_list(analysis_results: AnalysisResults) -> str: + """Todo...""" + if not analysis_results.errorlist: + return "" + return output_table( + listing=analysis_results.errorlist, + heading=strings.HEADING_ERRORS, + description=strings.HEADING_DESC_ERRORS, + count=True, + maxcolumns=1, + ) + + +SECTIONS = [ + report_metadata, + report_distance_scanned, + report_summary, + report_size, + report_identifiers, + report_format_classification, + report_date_range, + report_aggregated_ff_sigs, + report_aggregate_binary, + report_aggregate_xml, + report_aggregate_text, + report_aggregate_filename, + report_id_method_frequency, + report_extension_only, + report_frequency_extension_only, + report_frequency_all_extensions, + report_all_unique_extensions, + report_multiple_identification, + report_mimetypes, + report_results_per_identifier, + report_all_xml_identifiers, + report_all_text_identifiers, + report_all_filename_identifiers, + report_zero_byte_files, + report_aggregate_file_types, + report_duplicates, + report_non_ascii_file_names, + report_non_ascii_directory_names, + report_denylist_ids, + report_denylist_extensions, + report_denylist_filenames, + report_denylist_directories, + report_error_list, +] + + +def html_body(htm_string: str, analysis_results: AnalysisResults): + """Output the HTML body.""" + + htm_string = make_text( + htm_string, + "\n
    \n", + ) + + for section in SECTIONS: + analysis_htm = section(analysis_results) + if analysis_htm == "": + continue + htm_string = f"{htm_string}{analysis_htm}" + htm_string = f"{htm_string}\n
    \n" + + htm_string = make_text( + htm_string, + "
    \n\n", + ) + + return htm_string + + +def html(analysis_results: AnalysisResults): + """Output a HTML report.""" + + logger.info("outputting clean html") + + htm_string = "" + + htm_string = html_header(htm_string, analysis_results) + htm_string = html_body(htm_string, analysis_results) + + return htm_string diff --git a/tox.ini b/tox.ini index db29427..f2740fb 100644 --- a/tox.ini +++ b/tox.ini @@ -20,7 +20,7 @@ commands = pre-commit run --all-files --show-diff-on-failure [testenv:coverage] deps = -r requirements/local.txt -commands = pytest --cov=src.jsonid tests/ +commands = pytest --cov=src.demystify tests/ [flake8] exclude =