diff -Nru remnux-oletools-0.51a/debian/changelog remnux-oletools-0.51a/debian/changelog --- remnux-oletools-0.51a/debian/changelog 2016-11-04 21:30:43.000000000 +0000 +++ remnux-oletools-0.51a/debian/changelog 2016-11-04 21:51:33.000000000 +0000 @@ -1,4 +1,4 @@ -remnux-oletools (0.51a) trusty; urgency=medium +remnux-oletools (0.51a-2) trusty; urgency=medium * Updated to latest upstream version. diff -Nru remnux-oletools-0.51a/oletools/doc/Contribute.html remnux-oletools-0.51a/oletools/doc/Contribute.html --- remnux-oletools-0.51a/oletools/doc/Contribute.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/Contribute.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ - - - - - - - - - - -

How to Suggest Improvements, Report Issues or Contribute

-

This is a personal open-source project, developed in my spare time. Any contribution, suggestion, feedback or bug report is welcome.

-

To suggest improvements, report a bug or any other issue, please use the issue reporting page, and provide all the information and files needed to reproduce the problem.

-

You may also contact the author directly to send feedback.

-

The code is available in a repository on GitHub. You may use it to submit enhancements using forks and pull requests.

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/Contribute.md remnux-oletools-0.51a/oletools/doc/Contribute.md --- remnux-oletools-0.51a/oletools/doc/Contribute.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/Contribute.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,37 +0,0 @@ -How to Suggest Improvements, Report Issues or Contribute -======================================================== - -This is a personal open-source project, developed on my spare time. -Any contribution, suggestion, feedback or bug report is welcome. - -To **suggest improvements, report a bug or any issue**, -please use the [issue reporting page](https://github.com/decalage2/oletools/issues), -and provide all the information and files to reproduce the problem. - -You may also [contact the author](http://decalage.info/contact) directly -to **send feedback**. - -The code is available in [a repository on GitHub](https://github.com/decalage2/oletools). -You may use it to **submit enhancements** using forks and pull requests. - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/Home.html remnux-oletools-0.51a/oletools/doc/Home.html --- remnux-oletools-0.51a/oletools/doc/Home.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/Home.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ - - - - - - - - - - -

python-oletools v0.50 documentation

-

This is the home page of the documentation for python-oletools. The latest version can be found online, otherwise a copy is provided in the doc subfolder of the package.

-

python-oletools is a package of python tools to analyze Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office documents or Outlook messages, mainly for malware analysis, forensics and debugging. It is based on the olefile parser. See http://www.decalage.info/python/oletools for more info.

-

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the Author - Repository - Updates on Twitter

-

Note: python-oletools is not related to OLETools published by BeCubed Software.

-

Tools in python-oletools:

- -
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/Home.md remnux-oletools-0.51a/oletools/doc/Home.md --- remnux-oletools-0.51a/oletools/doc/Home.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/Home.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,64 +0,0 @@ -python-oletools v0.50 documentation -=================================== - -This is the home page of the documentation for python-oletools. The latest version can be found -[online](https://github.com/decalage2/oletools/wiki), otherwise a copy is provided in the doc subfolder of the package. - -[python-oletools](http://www.decalage.info/python/oletools) is a package of python tools to analyze -[Microsoft OLE2 files](http://en.wikipedia.org/wiki/Compound_File_Binary_Format) -(also called Structured Storage, Compound File Binary Format or Compound Document File Format), -such as Microsoft Office documents or Outlook messages, mainly for malware analysis, forensics and debugging. -It is based on the [olefile](http://www.decalage.info/olefile) parser. -See [http://www.decalage.info/python/oletools](http://www.decalage.info/python/oletools) for more info. - -**Quick links:** -[Home page](http://www.decalage.info/python/oletools) - -[Download/Install](https://github.com/decalage2/oletools/wiki/Install) - -[Documentation](https://github.com/decalage2/oletools/wiki) - -[Report Issues/Suggestions/Questions](https://github.com/decalage2/oletools/issues) - -[Contact the Author](http://decalage.info/contact) - -[Repository](https://github.com/decalage2/oletools) - -[Updates on Twitter](https://twitter.com/decalage2) - -Note: python-oletools is not related to OLETools published by BeCubed Software. - -Tools in python-oletools: -------------------------- - -- **[[olebrowse]]**: A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to - view and extract individual data streams. 
-- **[[oleid]]**: to analyze OLE files to detect specific characteristics usually found in malicious files. -- **[[olemeta]]**: to extract all standard properties (metadata) from OLE files. -- **[[oletimes]]**: to extract creation and modification timestamps of all streams and storages. -- **[[oledir]]**: to display all the directory entries of an OLE file, including free and orphaned entries. -- **[[olemap]]**: to display a map of all the sectors in an OLE file. -- **[[olevba]]**: to extract and analyze VBA Macro source code from MS Office documents (OLE and OpenXML). -- **[[mraptor]]**: to detect malicious VBA Macros -- **[[pyxswf]]**: to detect, extract and analyze Flash objects (SWF) that may - be embedded in files such as MS Office documents (e.g. Word, Excel) and RTF, - which is especially useful for malware analysis. -- **[[oleobj]]**: to extract embedded objects from OLE files. -- **[[rtfobj]]**: to extract embedded objects from RTF files. -- and a few others (coming soon) - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/Install.html remnux-oletools-0.51a/oletools/doc/Install.html --- remnux-oletools-0.51a/oletools/doc/Install.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/Install.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,69 +0,0 @@ - - - - - - - - - - -

How to Download and Install python-oletools

-

Pre-requisites

-

The recommended Python version to run oletools is Python 2.7. Python 2.6 is also supported, but as it is not tested as often as 2.7, some features might not work as expected.

-

Since oletools v0.50, thanks to contributions by [@Sebdraven](https://twitter.com/Sebdraven), most tools can also run with Python 3.x. As this is quite new, please report any issue you may encounter.

- -

Pip is included with Python since versions 2.7.9 and 3.4. If it is not installed on your system, either upgrade Python or see https://pip.pypa.io/en/stable/installing/

-

Linux, Mac OSX, Unix

-

To download and install/update the latest release version of oletools, run the following command in a shell:

-
sudo -H pip install -U oletools
-

Important: Since version 0.50, pip will automatically create convenient command-line scripts in /usr/local/bin to run all the oletools from any directory.

-

Windows

-

To download and install/update the latest release version of oletools, run the following command in a cmd window:

-
pip install -U oletools
-

Important: Since version 0.50, pip will automatically create convenient command-line scripts to run all the oletools from any directory: olevba, mraptor, oleid, rtfobj, etc.

-

How to install the latest development version

-

If you want to benefit from the latest improvements in the development version, you may also use pip:

-

Linux, Mac OSX, Unix

-
sudo -H pip install -U https://github.com/decalage2/oletools/archive/master.zip
-

Windows

-
pip install -U https://github.com/decalage2/oletools/archive/master.zip
-

How to install offline - Computer without Internet access

-

First, download the oletools archive on a computer with Internet access:

* Latest stable version: from https://github.com/decalage2/oletools/releases
* Development version: https://github.com/decalage2/oletools/archive/master.zip

-

Copy the archive file to the target computer.

-

On Linux, Mac OSX, Unix, run the following command using the filename of the archive that you downloaded:

-
sudo -H pip install -U oletools.zip
-

On Windows:

-
pip install -U oletools.zip
-

Old school install using setup.py

-

If you cannot use pip, it is still possible to run the setup.py script directly. However, this method will not create the command-line scripts automatically.

-

First, download the oletools archive:

* Latest stable version: from https://github.com/decalage2/oletools/releases
* Development version: https://github.com/decalage2/oletools/archive/master.zip

-

Then extract the archive, open a shell and go to the oletools directory.

-

Linux, Mac OSX, Unix

-
sudo -H python setup.py install
-

Windows:

-
python setup.py install
-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/Install.md remnux-oletools-0.51a/oletools/doc/Install.md --- remnux-oletools-0.51a/oletools/doc/Install.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/Install.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,135 +0,0 @@ -How to Download and Install python-oletools -=========================================== - -Pre-requisites --------------- - -The recommended Python version to run oletools is **Python 2.7**. -Python 2.6 is also supported, but as it is not tested as often as 2.7, some features -might not work as expected. - -Since oletools v0.50, thanks to contributions by [@Sebdraven](https://twitter.com/Sebdraven), -most tools can also run with **Python 3.x**. As this is quite new, please -[report any issue]((https://github.com/decalage2/oletools/issues)) you may encounter. - - - -Recommended way to Download+Install/Update oletools: pip --------------------------------------------------------- - -Pip is included with Python since version 2.7.9 and 3.4. If it is not installed on your -system, either upgrade Python or see https://pip.pypa.io/en/stable/installing/ - -### Linux, Mac OSX, Unix - -To download and install/update the latest release version of oletools, -run the following command in a shell: - -```text -sudo -H pip install -U oletools -``` - -**Important**: Since version 0.50, pip will automatically create convenient command-line scripts -in /usr/local/bin to run all the oletools from any directory. - -### Windows - -To download and install/update the latest release version of oletools, -run the following command in a cmd window: - -```text -pip install -U oletools -``` - -**Important**: Since version 0.50, pip will automatically create convenient command-line scripts -to run all the oletools from any directory: olevba, mraptor, oleid, rtfobj, etc. 
- - -How to install the latest development version ---------------------------------------------- - -If you want to benefit from the latest improvements in the development version, -you may also use pip: - -### Linux, Mac OSX, Unix - -```text -sudo -H pip install -U https://github.com/decalage2/oletools/archive/master.zip -``` - -### Windows - -```text -pip install -U https://github.com/decalage2/oletools/archive/master.zip -``` - -How to install offline - Computer without Internet access ---------------------------------------------------------- - -First, download the oletools archive on a computer with Internet access: -* Latest stable version: from https://github.com/decalage2/oletools/releases -* Development version: https://github.com/decalage2/oletools/archive/master.zip - -Copy the archive file to the target computer. - -On Linux, Mac OSX, Unix, run the following command using the filename of the -archive that you downloaded: - -```text -sudo -H pip install -U oletools.zip -``` - -On Windows: - -```text -pip install -U oletools.zip -``` - - -Old school install using setup.py ---------------------------------- - -If you cannot use pip, it is still possible to run the setup.py script -directly. However, this method will not create the command-line scripts -automatically. - -First, download the oletools archive: -* Latest stable version: from https://github.com/decalage2/oletools/releases -* Development version: https://github.com/decalage2/oletools/archive/master.zip - -Then extract the archive, open a shell and go to the oletools directory. 
- -### Linux, Mac OSX, Unix - -```text -sudo -H python setup.py install -``` - -### Windows: - -```text -python setup.py install -``` - - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/License.html remnux-oletools-0.51a/oletools/doc/License.html --- remnux-oletools-0.51a/oletools/doc/License.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/License.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,56 +0,0 @@ - - - - - - - - - - -

License for python-oletools

-

This license applies to the python-oletools package, apart from the thirdparty folder which contains third-party files published with their own license.

-

The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec (http://www.decalage.info)

-

All rights reserved.

-

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

- -

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

- - - - - - -
License for officeparser
-

olevba contains modified source code from the officeparser project, published under the following MIT License (MIT):

-

officeparser is copyright (c) 2014 John William Davison

-

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

-

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

-

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/License.md remnux-oletools-0.51a/oletools/doc/License.md --- remnux-oletools-0.51a/oletools/doc/License.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/License.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,79 +0,0 @@ -License for python-oletools -=========================== - -This license applies to the [python-oletools](http://www.decalage.info/python/oletools) package, apart from the -thirdparty folder which contains third-party files published with their own license. - -The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec ([http://www.decalage.info](http://www.decalage.info)) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - ----------- -License for officeparser ------------------------- - -olevba contains modified source code from the [officeparser](https://github.com/unixfreak0037/officeparser) project, published -under the following MIT License (MIT): - -officeparser is copyright (c) 2014 John William Davison - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/mraptor1.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/mraptor1.png differ diff -Nru remnux-oletools-0.51a/oletools/doc/mraptor.html remnux-oletools-0.51a/oletools/doc/mraptor.html --- remnux-oletools-0.51a/oletools/doc/mraptor.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/mraptor.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,81 +0,0 @@ - - - - - - - - - - -

mraptor (MacroRaptor)

-

mraptor is a tool designed to detect most malicious VBA Macros using generic heuristics. Unlike antivirus engines, it does not rely on signatures.

-

In a nutshell, mraptor detects keywords corresponding to the following three types of behaviour, which are present in clear text in almost any macro malware:

- A: Auto-execution trigger
- W: Write to the file system or memory
- X: Execute a file or any payload outside the VBA context

-

mraptor considers that a macro is suspicious when A and (W or X) is true.
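The A and (W or X) decision rule can be illustrated with a short, self-contained sketch. This is not mraptor's actual implementation: the keyword lists below are hypothetical, drastically simplified examples chosen for illustration, while the real tool uses far more complete regular expressions.

```python
import re

# Hypothetical, simplified keyword sets -- for illustration only.
# The real mraptor uses much more complete pattern lists.
RE_AUTOEXEC = re.compile(r'\b(AutoOpen|AutoExec|Document_Open|Workbook_Open)\b', re.IGNORECASE)
RE_WRITE = re.compile(r'\b(FileCopy|CreateTextFile|SaveToFile)\b', re.IGNORECASE)
RE_EXEC = re.compile(r'\b(Shell|ShellExecute|CreateObject)\b', re.IGNORECASE)

def is_suspicious(vba_code):
    """Apply mraptor's decision rule: suspicious when A and (W or X)."""
    a = bool(RE_AUTOEXEC.search(vba_code))   # A: auto-execution trigger
    w = bool(RE_WRITE.search(vba_code))      # W: write to file system or memory
    x = bool(RE_EXEC.search(vba_code))       # X: execute a payload outside VBA
    return a and (w or x)
```

For real scans, use mraptor itself; this sketch only demonstrates the decision logic.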

-

For more information about mraptor's detection algorithm, see the article How to detect most malicious macros without an antivirus.

-

mraptor can be used either as a command-line tool, or as a python module from your own applications.

-

It is part of the python-oletools package.

-

Usage

-
Usage: mraptor.py [options] <filename> [filename2 ...]
-
-Options:
-  -h, --help            show this help message and exit
-  -r                    find files recursively in subdirectories.
-  -z ZIP_PASSWORD, --zip=ZIP_PASSWORD
-                        if the file is a zip archive, open all files from it,
-                        using the provided password (requires Python 2.6+)
-  -f ZIP_FNAME, --zipfname=ZIP_FNAME
-                        if the file is a zip archive, file(s) to be opened
-                        within the zip. Wildcards * and ? are supported.
-                        (default:*)
-  -l LOGLEVEL, --loglevel=LOGLEVEL
-                        logging level debug/info/warning/error/critical
-                        (default=warning)
-  -m, --matches         Show matched strings.
-
-An exit code is returned based on the analysis result:
- - 0: No Macro
- - 1: Not MS Office
- - 2: Macro OK
- - 10: ERROR
- - 20: SUSPICIOUS
-
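The exit codes listed above lend themselves to scripted triage. A sketch of how a wrapper might translate them (assuming the `mraptor` command-line script that pip creates since v0.50 is on the PATH):

```python
import subprocess

# Exit-code meanings as documented by mraptor:
MRAPTOR_EXIT_CODES = {
    0: "No Macro",
    1: "Not MS Office",
    2: "Macro OK",
    10: "ERROR",
    20: "SUSPICIOUS",
}

def classify(filename):
    """Run mraptor on a file and translate its exit code to a label.

    Assumes a 'mraptor' command is available on the PATH.
    """
    result = subprocess.run(["mraptor", filename])
    return MRAPTOR_EXIT_CODES.get(result.returncode, "unknown")
```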

Examples

-

Scan a single file:

-
mraptor.py file.doc
-

Scan a single file, stored in a Zip archive with password "infected":

-
mraptor.py malicious_file.xls.zip -z infected
-

Scan a collection of files stored in a folder:

-
mraptor.py "MalwareZoo/VBA/*"
-

Important: on Linux/MacOSX, always add double quotes around a file name when you use wildcards such as * and ?. Otherwise, the shell may replace the argument with the actual list of files matching the wildcards before starting the script.

-
- -
-

Python 3 support - mraptor3

-

As of v0.50, mraptor has been ported to Python 3 thanks to @sebdraven. However, the differences between Python 2 and 3 are significant and for now there is a separate version of mraptor named mraptor3 to be used with Python 3.

-
-

How to use mraptor in Python applications

-

TODO

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/mraptor.md remnux-oletools-0.51a/oletools/doc/mraptor.md --- remnux-oletools-0.51a/oletools/doc/mraptor.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/mraptor.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,113 +0,0 @@ -mraptor (MacroRaptor) -===================== - -mraptor is a tool designed to detect most malicious VBA Macros using -generic heuristics. Unlike antivirus engines, it does not rely on signatures. - -In a nutshell, mraptor detects keywords corresponding to the three -following types of behaviour that are present in clear text in almost -any macro malware: -- A: Auto-execution trigger -- W: Write to the file system or memory -- X: Execute a file or any payload outside the VBA context - -mraptor considers that a macro is suspicious when A and (W or X) is true. - -For more information about mraptor's detection algorithm, see the article -[How to detect most malicious macros without an antivirus](http://www.decalage.info/mraptor). - -mraptor can be used either as a command-line tool, or as a python module -from your own applications. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Usage - -```text -Usage: mraptor.py [options] [filename2 ...] - -Options: - -h, --help show this help message and exit - -r find files recursively in subdirectories. - -z ZIP_PASSWORD, --zip=ZIP_PASSWORD - if the file is a zip archive, open all files from it, - using the provided password (requires Python 2.6+) - -f ZIP_FNAME, --zipfname=ZIP_FNAME - if the file is a zip archive, file(s) to be opened - within the zip. Wildcards * and ? are supported. - (default:*) - -l LOGLEVEL, --loglevel=LOGLEVEL - logging level debug/info/warning/error/critical - (default=warning) - -m, --matches Show matched strings. 
- -An exit code is returned based on the analysis result: - - 0: No Macro - - 1: Not MS Office - - 2: Macro OK - - 10: ERROR - - 20: SUSPICIOUS -``` - -### Examples - -Scan a single file: - -```text -mraptor.py file.doc -``` - -Scan a single file, stored in a Zip archive with password "infected": - -```text -mraptor.py malicious_file.xls.zip -z infected -``` - -Scan a collection of files stored in a folder: - -```text -mraptor.py "MalwareZoo/VBA/*" -``` - -**Important**: on Linux/MacOSX, always add double quotes around a file name when you use -wildcards such as `*` and `?`. Otherwise, the shell may replace the argument with the actual -list of files matching the wildcards before starting the script. - -![](mraptor1.png) - -## Python 3 support - mraptor3 - -As of v0.50, mraptor has been ported to Python 3 thanks to @sebdraven. -However, the differences between Python 2 and 3 are significant and for now -there is a separate version of mraptor named mraptor3 to be used with -Python 3. - - --------------------------------------------------------------------------- - -## How to use mraptor in Python applications - -TODO - - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/olebrowse1_menu.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/olebrowse1_menu.png differ Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/olebrowse2_stream.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/olebrowse2_stream.png differ Binary files 
/tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/olebrowse3_hexview.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/olebrowse3_hexview.png differ diff -Nru remnux-oletools-0.51a/oletools/doc/olebrowse.html remnux-oletools-0.51a/oletools/doc/olebrowse.html --- remnux-oletools-0.51a/oletools/doc/olebrowse.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olebrowse.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,59 +0,0 @@ - - - - - - - - - - -

olebrowse

-

olebrowse is a simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to view and extract individual data streams.

-

It is part of the python-oletools package.

-

Dependencies

-

olebrowse requires Tkinter. On Windows and MacOSX, it should be installed with Python, and olebrowse should work out of the box.

-

However, on Linux it might be necessary to install the tkinter package for Python separately. For example, on Ubuntu this is done with the following command:

-
sudo apt-get install python-tk
-

And for Python 3:

-
sudo apt-get install python3-tk
-

Usage

-
olebrowse.py [file]
-

If you provide a file, it will be opened; otherwise a dialog will let you browse folders to open one. If it is a valid OLE file, the list of data streams will be displayed. You can then select a stream and either view its content in a builtin hexadecimal viewer, or save it to a file for further analysis.
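The builtin hexadecimal viewer renders stream bytes in the classic offset/hex/ASCII layout. A minimal stdlib sketch of such a hex dump (an illustration of the display format, not olebrowse's actual code):

```python
def hexdump(data, width=16):
    """Return a hex dump of bytes: offset, hex bytes, then ASCII column."""
    lines = []
    for offset in range(0, len(data), width):
        chunk = data[offset:offset + width]
        hexpart = " ".join("%02X" % b for b in chunk)
        # Show printable ASCII characters, replace the rest with '.'
        asciipart = "".join(chr(b) if 32 <= b < 127 else "." for b in chunk)
        lines.append("%08X  %-*s  %s" % (offset, width * 3 - 1, hexpart, asciipart))
    return "\n".join(lines)
```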

-

Screenshots

-

Main menu, showing all streams in the OLE file:

-
- -
-

Menu with actions for a stream:

-
- -
-

Hex view for a stream:

-
- -
-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/olebrowse.md remnux-oletools-0.51a/oletools/doc/olebrowse.md --- remnux-oletools-0.51a/oletools/doc/olebrowse.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olebrowse.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,76 +0,0 @@ -olebrowse -========= - -olebrowse is a simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to -view and extract individual data streams. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -Dependencies ------------- - -olebrowse requires [Tkinter](https://en.wikipedia.org/wiki/Tkinter). -On Windows and MacOSX, it should be installed with Python, and -olebrowse should work out of the box. - -However, on Linux it might be necessary to install the tkinter -package for Python separately. For example, on Ubuntu this is done with the -following command: - -``` -sudo apt-get install python-tk -``` - -And for Python 3: - -``` -sudo apt-get install python3-tk -``` - - -Usage ------ - - olebrowse.py [file] - -If you provide a file it will be opened, else a dialog will allow you to browse -folders to open a file. Then if it is a valid OLE file, the list of data streams -will be displayed. You can select a stream, and then either view its content -in a builtin hexadecimal viewer, or save it to a file for further analysis. 
- -Screenshots ------------ - -Main menu, showing all streams in the OLE file: - -![](olebrowse1_menu.png) - -Menu with actions for a stream: - -![](olebrowse2_stream.png) - -Hex view for a stream: - -![](olebrowse3_hexview.png) - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/oledir.html remnux-oletools-0.51a/oletools/doc/oledir.html --- remnux-oletools-0.51a/oletools/doc/oledir.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oledir.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ - - - - - - - - - - -

oledir

-

oledir is a script to display all the directory entries of an OLE file, including free and orphaned entries.

-

It can be used either as a command-line tool or as a Python module from your own applications.

-

It is part of the python-oletools package.

-

Usage

-
Usage: oledir.py <filename>
-

Examples

-

Scan a single file:

-
oledir.py file.doc
-
- -
-
-

How to use oledir in Python applications

-

TODO
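While the Python API section is still TODO, the directory entries that oledir lists follow the fixed OLE/CFB on-disk layout. As background only (an illustration of the format, not oledir's API), the type of one raw 128-byte directory entry can be read directly:

```python
DIRENTRY_SIZE = 128  # each OLE directory entry is exactly 128 bytes

# Object type byte at offset 66 of a directory entry (per the CFB format);
# "Empty" entries are the free/unused ones that oledir reports.
ENTRY_TYPES = {0: "Empty", 1: "Storage", 2: "Stream", 5: "Root"}

def entry_type(direntry: bytes) -> str:
    """Return the object type name of one 128-byte directory entry."""
    return ENTRY_TYPES.get(direntry[66], "Unknown")
```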

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/oledir.md remnux-oletools-0.51a/oletools/doc/oledir.md --- remnux-oletools-0.51a/oletools/doc/oledir.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oledir.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -oledir -====== - -oledir is a script to display all the directory entries of an OLE file, -including free and orphaned entries. - -It can be used either as a command-line tool, or as a python module from your own applications. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Usage - -```text -Usage: oledir.py -``` - -### Examples - -Scan a single file: - -```text -oledir.py file.doc -``` - -![](oledir.png) - - --------------------------------------------------------------------------- - -## How to use oledir in Python applications - -TODO - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/oledir.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/oledir.png differ diff -Nru remnux-oletools-0.51a/oletools/doc/oleid.html remnux-oletools-0.51a/oletools/doc/oleid.html --- remnux-oletools-0.51a/oletools/doc/oleid.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oleid.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,122 +0,0 @@ - - - - - - - - - - - -

oleid

-

oleid is a script to analyze OLE files such as MS Office documents (e.g. Word, Excel), to detect specific characteristics usually found in malicious files (e.g. malware). For example it can detect VBA macros and embedded Flash objects.

-

It is part of the python-oletools package.

-

Main Features

- -

Planned improvements:

- -

Usage

-
oleid.py <file>
-

Example

-

Analyzing a Word document containing a Flash object and VBA macros:

-
C:\oletools>oleid.py word_flash_vba.doc
-
-Filename: word_flash_vba.doc
-+-------------------------------+-----------------------+
-| Indicator                     | Value                 |
-+-------------------------------+-----------------------+
-| OLE format                    | True                  |
-| Has SummaryInformation stream | True                  |
-| Application name              | Microsoft Office Word |
-| Encrypted                     | False                 |
-| Word Document                 | True                  |
-| VBA Macros                    | True                  |
-| Excel Workbook                | False                 |
-| PowerPoint Presentation       | False                 |
-| Visio Drawing                 | False                 |
-| ObjectPool                    | True                  |
-| Flash objects                 | 1                     |
-+-------------------------------+-----------------------+
-
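The first indicator, "OLE format", reflects whether the file begins with the OLE compound-file signature. As background (not oleid's actual implementation), the check can be sketched as:

```python
# OLE compound files (MS Office 97-2003) start with this 8-byte signature.
OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"

def looks_like_ole(data: bytes) -> bool:
    """Return True if the data starts with the OLE compound-file magic."""
    return data[:8] == OLE_MAGIC
```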

How to use oleid in your Python applications

-

First, import oletools.oleid, and create an OleID object to scan a file:

-
import oletools.oleid
-
-oid = oletools.oleid.OleID(filename)
-

Note: filename can be a filename, a file-like object, or a bytes string containing the file to be analyzed.

-

Second, call the check() method. It returns a list of Indicator objects.

-

Each Indicator object has the following attributes:

- -

For example, the following code displays all the indicators:

-
indicators = oid.check()
-for i in indicators:
-    print 'Indicator id=%s name="%s" type=%s value=%s' % (i.id, i.name, i.type, repr(i.value))
-    print 'description:', i.description
-    print ''
-

See the source code of oleid.py for more details.

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/oleid.md remnux-oletools-0.51a/oletools/doc/oleid.md --- remnux-oletools-0.51a/oletools/doc/oleid.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oleid.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,117 +0,0 @@ -oleid -===== - -oleid is a script to analyze OLE files such as MS Office documents (e.g. Word, -Excel), to detect specific characteristics usually found in malicious files (e.g. malware). -For example it can detect VBA macros and embedded Flash objects. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Main Features - -- Detect OLE file type from its internal structure (e.g. MS Word, Excel, PowerPoint, ...) -- Detect VBA Macros -- Detect embedded Flash objects -- Detect embedded OLE objects -- Detect MS Office encryption -- Can be used as a command-line tool -- Python API to integrate it in your applications - -Planned improvements: - -- Extract the most important metadata fields -- Support for OpenXML files and embedded OLE files -- Generic VBA macros detection -- Detect auto-executable VBA macros -- Extended OLE file types detection -- Detect unusual OLE structures (fragmentation, unused sectors, etc) -- Options to scan multiple files -- Options to scan files from encrypted zip archives -- CSV output - -## Usage - -```text -oleid.py -``` - -### Example - -Analyzing a Word document containing a Flash object and VBA macros: - -```text -C:\oletools>oleid.py word_flash_vba.doc - -Filename: word_flash_vba.doc -+-------------------------------+-----------------------+ -| Indicator | Value | -+-------------------------------+-----------------------+ -| OLE format | True | -| Has SummaryInformation stream | True | -| Application name | Microsoft Office Word | -| Encrypted | False | -| Word Document | True | -| VBA Macros | True | -| Excel Workbook | False | -| PowerPoint Presentation | False | -| Visio Drawing | False | -| ObjectPool | True 
| -| Flash objects | 1 | -+-------------------------------+-----------------------+ -``` - -## How to use oleid in your Python applications - -First, import oletools.oleid, and create an **OleID** object to scan a file: - -```python -import oletools.oleid - -oid = oletools.oleid.OleID(filename) -``` - -Note: filename can be a filename, a file-like object, or a bytes string containing the file to be analyzed. - -Second, call the **check()** method. It returns a list of **Indicator** objects. - -Each Indicator object has the following attributes: - -- **id**: str, identifier for the indicator -- **name**: str, name to display the indicator -- **description**: str, long description of the indicator -- **type**: class of the indicator (e.g. bool, str, int) -- **value**: value of the indicator - -For example, the following code displays all the indicators: - -```python -indicators = oid.check() -for i in indicators: - print 'Indicator id=%s name="%s" type=%s value=%s' % (i.id, i.name, i.type, repr(i.value)) - print 'description:', i.description - print '' -``` - -See the source code of oleid.py for more details. 
- --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/olemap1.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/olemap1.png differ Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/olemap2.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/olemap2.png differ diff -Nru remnux-oletools-0.51a/oletools/doc/olemap.html remnux-oletools-0.51a/oletools/doc/olemap.html --- remnux-oletools-0.51a/oletools/doc/olemap.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olemap.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,52 +0,0 @@ - - - - - - - - - - -

olemap

-

olemap is a script to display a map of all the sectors in an OLE file.

-

It can be used either as a command-line tool or as a Python module from your own applications.

-

It is part of the python-oletools package.

-

Usage

-
Usage: olemap.py <filename>
-

Examples

-

Scan a single file:

-
olemap.py file.doc
-
- -
-
- -
-
-

How to use olemap in Python applications

-

TODO
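While the Python API is still marked TODO, the sector size underlying the map that olemap displays comes from the sector-shift field of the OLE header. As a rough illustration (not olemap's API, which relies on olefile), it can be read like this, assuming a well-formed header:

```python
import struct

def sector_size(header: bytes) -> int:
    """Read the sector size from a raw OLE header.

    The sector-shift field is a little-endian uint16 at offset 30; the
    sector size is 2**shift (usually 2**9 = 512 bytes).
    """
    (sector_shift,) = struct.unpack_from("<H", header, 30)
    return 1 << sector_shift
```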

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/olemap.md remnux-oletools-0.51a/oletools/doc/olemap.md --- remnux-oletools-0.51a/oletools/doc/olemap.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olemap.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -olemap -====== - -olemap is a script to display a map of all the sectors in an OLE file. - -It can be used either as a command-line tool, or as a python module from your own applications. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Usage - -```text -Usage: olemap.py -``` - -### Examples - -Scan a single file: - -```text -olemap.py file.doc -``` - -![](olemap1.png) - -![](olemap2.png) - --------------------------------------------------------------------------- - -## How to use olemap in Python applications - -TODO - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/doc/olemeta1.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/doc/olemeta1.png differ diff -Nru remnux-oletools-0.51a/oletools/doc/olemeta.html remnux-oletools-0.51a/oletools/doc/olemeta.html --- remnux-oletools-0.51a/oletools/doc/olemeta.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olemeta.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,45 +0,0 @@ - - - - - - - - - - -

olemeta

-

olemeta is a script to parse OLE files such as MS Office documents (e.g. Word, Excel), to extract all standard properties present in the OLE file.

-

It is part of the python-oletools package.

-

Usage

-
olemeta.py <file>
-

Example

-
- -
-

How to use olemeta in Python applications

-

TODO

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/olemeta.md remnux-oletools-0.51a/oletools/doc/olemeta.md --- remnux-oletools-0.51a/oletools/doc/olemeta.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olemeta.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,43 +0,0 @@ -olemeta -======= - -olemeta is a script to parse OLE files such as MS Office documents (e.g. Word, -Excel), to extract all standard properties present in the OLE file. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Usage - -```text -olemeta.py -``` - -### Example - -![](olemeta1.png) - -## How to use olemeta in Python applications - -TODO - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/oleobj.html remnux-oletools-0.51a/oletools/doc/oleobj.html --- remnux-oletools-0.51a/oletools/doc/oleobj.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oleobj.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ - - - - - - - - - - -

oleobj

-

oleobj is a script to extract embedded objects from OLE files.

-

It can be used either as a command-line tool or as a Python module from your own applications.

-

It is part of the python-oletools package.

-

Usage

-
TODO
-
-

How to use oleobj in Python applications

-

See rtfobj.py source code.

-

TODO

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/oleobj.md remnux-oletools-0.51a/oletools/doc/oleobj.md --- remnux-oletools-0.51a/oletools/doc/oleobj.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oleobj.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -oleobj -====== - -oleobj is a script to extract embedded objects from OLE files. - -It can be used either as a command-line tool, or as a python module from your own applications. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Usage - -```text -TODO -``` - --------------------------------------------------------------------------- - -## How to use oleobj in Python applications - -See rtfobj.py source code. - -TODO - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/oletimes.html remnux-oletools-0.51a/oletools/doc/oletimes.html --- remnux-oletools-0.51a/oletools/doc/oletimes.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oletimes.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,68 +0,0 @@ - - - - - - - - - - -

oletimes

-

oletimes is a script to parse OLE files such as MS Office documents (e.g. Word, Excel), to extract creation and modification times of all streams and storages in the OLE file.

-

It is part of the python-oletools package.

-

Usage

-
oletimes.py <file>
-

Example

-

Checking the malware sample DIAN_caso-5415.doc:

-
>oletimes.py DIAN_caso-5415.doc
-
-+----------------------------+---------------------+---------------------+
-| Stream/Storage name        | Modification Time   | Creation Time       |
-+----------------------------+---------------------+---------------------+
-| Root                       | 2014-05-14 12:45:24 | None                |
-| '\x01CompObj'              | None                | None                |
-| '\x05DocumentSummaryInform | None                | None                |
-| ation'                     |                     |                     |
-| '\x05SummaryInformation'   | None                | None                |
-| '1Table'                   | None                | None                |
-| 'Data'                     | None                | None                |
-| 'Macros'                   | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 |
-| 'Macros/PROJECT'           | None                | None                |
-| 'Macros/PROJECTwm'         | None                | None                |
-| 'Macros/VBA'               | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 |
-| 'Macros/VBA/ThisDocument'  | None                | None                |
-| 'Macros/VBA/_VBA_PROJECT'  | None                | None                |
-| 'Macros/VBA/__SRP_0'       | None                | None                |
-| 'Macros/VBA/__SRP_1'       | None                | None                |
-| 'Macros/VBA/__SRP_2'       | None                | None                |
-| 'Macros/VBA/__SRP_3'       | None                | None                |
-| 'Macros/VBA/dir'           | None                | None                |
-| 'WordDocument'             | None                | None                |
-+----------------------------+---------------------+---------------------+
-
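The timestamps shown above come from the OLE directory entries, which store them as 64-bit FILETIME values (100-nanosecond intervals since 1601-01-01); a value of zero means "no timestamp", displayed as None. A minimal conversion sketch (background only, not oletimes' API):

```python
from datetime import datetime, timedelta

def filetime_to_datetime(filetime: int):
    """Convert a 64-bit FILETIME value to a datetime, or None if unset (0)."""
    if filetime == 0:
        return None
    # FILETIME counts 100-ns intervals since 1601-01-01 00:00:00.
    return datetime(1601, 1, 1) + timedelta(microseconds=filetime // 10)
```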

How to use oletimes in Python applications

-

TODO

-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/oletimes.md remnux-oletools-0.51a/oletools/doc/oletimes.md --- remnux-oletools-0.51a/oletools/doc/oletimes.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/oletimes.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -oletimes -======== - -oletimes is a script to parse OLE files such as MS Office documents (e.g. Word, -Excel), to extract creation and modification times of all streams and storages -in the OLE file. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Usage - -```text -oletimes.py -``` - -### Example - -Checking the malware sample [DIAN_caso-5415.doc](https://malwr.com/analysis/M2I4YWRhM2IwY2QwNDljN2E3ZWFjYTg3ODk4NmZhYmE/): - -```text ->oletimes.py DIAN_caso-5415.doc - -+----------------------------+---------------------+---------------------+ -| Stream/Storage name | Modification Time | Creation Time | -+----------------------------+---------------------+---------------------+ -| Root | 2014-05-14 12:45:24 | None | -| '\x01CompObj' | None | None | -| '\x05DocumentSummaryInform | None | None | -| ation' | | | -| '\x05SummaryInformation' | None | None | -| '1Table' | None | None | -| 'Data' | None | None | -| 'Macros' | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 | -| 'Macros/PROJECT' | None | None | -| 'Macros/PROJECTwm' | None | None | -| 'Macros/VBA' | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 | -| 'Macros/VBA/ThisDocument' | None | None | -| 'Macros/VBA/_VBA_PROJECT' | None | None | -| 'Macros/VBA/__SRP_0' | None | None | -| 'Macros/VBA/__SRP_1' | None | None | -| 'Macros/VBA/__SRP_2' | None | None | -| 'Macros/VBA/__SRP_3' | None | None | -| 'Macros/VBA/dir' | None | None | -| 'WordDocument' | None | None | -+----------------------------+---------------------+---------------------+ -``` - -## How to use oletimes in Python applications - -TODO - --------------------------------------------------------------------------- - 
-python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/olevba.html remnux-oletools-0.51a/oletools/doc/olevba.html --- remnux-oletools-0.51a/oletools/doc/olevba.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olevba.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,371 +0,0 @@ - - - - - - - - - - - -

olevba

-

olevba is a script to parse OLE and OpenXML files such as MS Office documents (e.g. Word, Excel), to detect VBA Macros, extract their source code in clear text, and detect security-related patterns such as auto-executable macros, suspicious VBA keywords used by malware, anti-sandboxing and anti-virtualization techniques, and potential IOCs (IP addresses, URLs, executable filenames, etc). It also detects and decodes several common obfuscation methods including Hex encoding, StrReverse, Base64, Dridex, VBA expressions, and extracts IOCs from decoded strings.

-

It can be used either as a command-line tool or as a Python module from your own applications.

-

It is part of the python-oletools package.

-

olevba is based on source code from officeparser by John William Davison, with significant modifications.

-

Supported formats

-
    -
  • Word 97-2003 (.doc, .dot)
  • -
  • Word 2007+ (.docm, .dotm)
  • -
  • Word 2003 XML (.xml)
  • -
  • Word/Excel MHTML, aka Single File Web Page (.mht)
  • -
  • Excel 97-2003 (.xls)
  • -
  • Excel 2007+ (.xlsm, .xlsb)
  • -
  • PowerPoint 2007+ (.pptm, .ppsm)
  • -
  • Text file containing VBA or VBScript source code
  • -
  • Password-protected Zip archive containing any of the above
  • -
-

Main Features

-
    -
  • Detect VBA macros in MS Office 97-2003 and 2007+ files, XML, MHT
  • -
  • Extract VBA macro source code
  • -
  • Detect auto-executable macros
  • -
  • Detect suspicious VBA keywords often used by malware
  • -
  • Detect anti-sandboxing and anti-virtualization techniques
  • -
  • Detect and decode strings obfuscated with Hex/Base64/StrReverse/Dridex
  • -
  • Deobfuscate VBA expressions with any combination of Chr, Asc, Val, StrReverse, Environ, + and &, using a VBA parser built with pyparsing, including custom Hex and Base64 encodings
  • -
  • Extract IOCs/patterns of interest such as IP addresses, URLs, e-mail addresses and executable file names
  • -
  • Scan multiple files and sample collections (wildcards, recursive)
  • -
  • Triage mode for a summary view of multiple files
  • -
  • Scan malware samples in password-protected Zip archives
  • -
  • Python API to use olevba from your applications
  • -
-

MS Office files encrypted with a password are also supported, because VBA macro code is never encrypted, only the content of the document.

-

About VBA Macros

-

See this article for more information and technical details about VBA Macros and how they are stored in MS Office documents.

-

How it works

-
    -
  1. olevba checks the file type: if it is an OLE file (i.e. MS Office 97-2003), it is parsed right away.
  2. -
  3. If it is a zip file (i.e. MS Office 2007+), XML or MHTML, olevba looks for all OLE files stored in it (e.g. vbaProject.bin, editdata.mso), and opens them.
  4. -
  5. olevba identifies all the VBA projects stored in the OLE structure.
  6. -
  7. Each VBA project is parsed to find the corresponding OLE streams containing macro code.
  8. -
  9. In each of these OLE streams, the VBA macro source code is extracted and decompressed (RLE compression).
  10. -
  11. olevba looks for specific strings obfuscated with various algorithms (Hex, Base64, StrReverse, Dridex, VBA expressions).
  12. -
  13. olevba scans the macro source code and the deobfuscated strings to find suspicious keywords, auto-executable macros and potential IOCs (URLs, IP addresses, e-mail addresses, executable filenames, etc).
  14. -
-

Usage

-
Usage: olevba.py [options] <filename> [filename2 ...]
-    
-Options:
-  -h, --help            show this help message and exit
-  -r                    find files recursively in subdirectories.
-  -z ZIP_PASSWORD, --zip=ZIP_PASSWORD
-                        if the file is a zip archive, open all files from it,
-                        using the provided password (requires Python 2.6+)
-  -f ZIP_FNAME, --zipfname=ZIP_FNAME
-                        if the file is a zip archive, file(s) to be opened
-                        within the zip. Wildcards * and ? are supported.
-                        (default:*)
-  -t, --triage          triage mode, display results as a summary table
-                        (default for multiple files)
-  -d, --detailed        detailed mode, display full results (default for
-                        single file)
-  -a, --analysis        display only analysis results, not the macro source
-                        code
-  -c, --code            display only VBA source code, do not analyze it
-  -i INPUT, --input=INPUT
-                        input file containing VBA source code to be analyzed
-                        (no parsing)
-  --decode              display all the obfuscated strings with their decoded
-                        content (Hex, Base64, StrReverse, Dridex, VBA).
-  --attr                display the attribute lines at the beginning of VBA
-                        source code
-  --reveal              display the macro source code after replacing all the
-                        obfuscated strings by their decoded content.
-

Examples

-

Scan a single file:

-
olevba.py file.doc
-

Scan a single file, stored in a Zip archive with password "infected":

-
olevba.py malicious_file.xls.zip -z infected
-

Scan a single file, showing all obfuscated strings decoded:

-
olevba.py file.doc --decode
-
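The Hex, Base64 and StrReverse transforms that --decode reports are standard encodings. As an illustration (independent of olevba's internals), each one can be reversed like this:

```python
import base64
import binascii

def decode_hex(s: str) -> bytes:
    # "Hex" obfuscation: the string is the hex dump of the real payload.
    return binascii.unhexlify(s)

def decode_base64(s: str) -> bytes:
    # "Base64" obfuscation: the string is Base64-encoded.
    return base64.b64decode(s)

def str_reverse(s: str) -> str:
    # "StrReverse" obfuscation: VBA's StrReverse simply reverses the string.
    return s[::-1]
```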

Scan a single file, showing the macro source code with VBA strings deobfuscated:

-
olevba.py file.doc --reveal
-

Scan VBA source code extracted into a text file:

-
olevba.py source_code.vba
-

Scan a collection of files stored in a folder:

-
olevba.py "MalwareZoo/VBA/*"
-

NOTE: On Linux, Mac OS X and other Unix variants, wildcards must be enclosed in double quotes; otherwise they are expanded by the shell instead of by olevba.

-

Scan all .doc and .xls files, recursively in all subfolders:

-
olevba.py "MalwareZoo/VBA/*.doc" "MalwareZoo/VBA/*.xls" -r
-

Scan all .doc files within all .zip files with password, recursively:

-
olevba.py "MalwareZoo/VBA/*.zip" -r -z infected -f "*.doc"
-

Detailed analysis mode (default for single file)

-

When a single file is scanned, or when using the option -d, all details of the analysis are displayed.

-

For example, checking the malware sample DIAN_caso-5415.doc:

-
>olevba.py c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip -z infected
-===============================================================================
-FILE: DIAN_caso-5415.doc.malware in c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip
-Type: OLE
--------------------------------------------------------------------------------
-VBA MACRO ThisDocument.cls
-in file: DIAN_caso-5415.doc.malware - OLE stream: Macros/VBA/ThisDocument
-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Option Explicit
-Private Declare Function URLDownloadToFileA Lib "urlmon" (ByVal FVQGKS As Long,_
-ByVal WSGSGY As String, ByVal IFRRFV As String, ByVal NCVOLV As Long, _
-ByVal HQTLDG As Long) As Long
-Sub AutoOpen()
-    Auto_Open
-End Sub
-Sub Auto_Open()
-SNVJYQ
-End Sub
-Public Sub SNVJYQ()
-    [Malicious Code...]
-End Sub
-Function OGEXYR(XSTAHU As String, PHHWIV As String) As Boolean
-    [Malicious Code...]
-    Application.DisplayAlerts = False
-    Application.Quit
-End Function
-Sub Workbook_Open()
-    Auto_Open
-End Sub
-
-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-ANALYSIS:
-+------------+----------------------+-----------------------------------------+
-| Type       | Keyword              | Description                             |
-+------------+----------------------+-----------------------------------------+
-| AutoExec   | AutoOpen             | Runs when the Word document is opened   |
-| AutoExec   | Auto_Open            | Runs when the Excel Workbook is opened  |
-| AutoExec   | Workbook_Open        | Runs when the Excel Workbook is opened  |
-| Suspicious | Lib                  | May run code from a DLL                 |
-| Suspicious | Shell                | May run an executable file or a system  |
-|            |                      | command                                 |
-| Suspicious | Environ              | May read system environment variables   |
-| Suspicious | URLDownloadToFileA   | May download files from the Internet    |
-| IOC        | http://germanya.com. | URL                                     |
-|            | ec/logs/test.exe"    |                                         |
-| IOC        | http://germanya.com. | URL                                     |
-|            | ec/logs/counter.php" |                                         |
-| IOC        | germanya.com         | Executable file name                    |
-| IOC        | test.exe             | Executable file name                    |
-| IOC        | sfjozjero.exe        | Executable file name                    |
-+------------+----------------------+-----------------------------------------+
-

Triage mode (default for multiple files)

-

When several files are scanned, or when using the option -t, a summary of the analysis for each file is displayed. This is more convenient for quick triage of a collection of suspicious files.

-

The following flags show the results of the analysis:

-
    -
  • OLE: the file type is OLE, for example MS Office 97-2003
  • -
  • OpX: the file type is OpenXML, for example MS Office 2007+
  • -
  • XML: the file type is Word 2003 XML
  • -
  • MHT: the file type is Word MHTML, aka Single File Web Page (.mht)
  • -
  • ?: the file type is not supported
  • -
  • M: contains VBA Macros
  • -
  • A: auto-executable macros
  • -
  • S: suspicious VBA keywords
  • -
  • I: potential IOCs
  • -
  • H: hex-encoded strings (potential obfuscation)
  • -
  • B: Base64-encoded strings (potential obfuscation)
  • -
  • D: Dridex-encoded strings (potential obfuscation)
  • -
  • V: VBA string expressions (potential obfuscation)
  • -
-

Here is an example:

-
c:\>olevba.py \MalwareZoo\VBA\samples\*
-Flags       Filename
------------ -----------------------------------------------------------------
-OLE:MASI--- \MalwareZoo\VBA\samples\DIAN_caso-5415.doc.malware
-OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_1.doc.malware
-OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_2.doc.malware
-OLE:MASI--- \MalwareZoo\VBA\samples\DRIDEX_3.doc.malware
-OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_4.doc.malware
-OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_5.doc.malware
-OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_6.doc.malware
-OLE:MAS---- \MalwareZoo\VBA\samples\DRIDEX_7.doc.malware
-OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_8.doc.malware
-OLE:MASIHBD \MalwareZoo\VBA\samples\DRIDEX_9.xls.malware
-OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_A.doc.malware
-OLE:------- \MalwareZoo\VBA\samples\Normal_Document.doc
-OLE:M------ \MalwareZoo\VBA\samples\Normal_Document_Macro.doc
-OpX:MASI--- \MalwareZoo\VBA\samples\RottenKitten.xlsb.malware
-OLE:MASI-B- \MalwareZoo\VBA\samples\ROVNIX.doc.malware
-OLE:MA----- \MalwareZoo\VBA\samples\Word within Word macro auto.doc
-
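A small helper for reading such triage output can be sketched as follows. This is a hypothetical illustration, not part of olevba's API; it assumes the flag column uses one fixed position per flag in the order M, A, S, I, H, B, D (as in the sample above), with a dash meaning the flag is not set:

```python
FLAG_NAMES = {
    "M": "VBA macros",
    "A": "auto-executable macros",
    "S": "suspicious keywords",
    "I": "potential IOCs",
    "H": "hex-encoded strings",
    "B": "Base64-encoded strings",
    "D": "Dridex-encoded strings",
}

def parse_triage_flags(flags: str):
    """Parse e.g. 'OLE:MASIH--' into (file_type, set of flag descriptions)."""
    file_type, _, column = flags.partition(":")
    found = {FLAG_NAMES[c] for c in column if c in FLAG_NAMES}
    return file_type, found
```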

Python 3 support - olevba3

-

As of v0.50, olevba has been ported to Python 3 thanks to @sebdraven. However, the differences between Python 2 and 3 are significant and for now there is a separate version of olevba named olevba3 to be used with Python 3.

-
-

How to use olevba in Python applications

-

olevba may be used to open an MS Office file, detect whether it contains VBA macros, and extract and analyze the VBA source code from your own Python applications.

-

IMPORTANT: olevba is currently under active development; therefore, this API is likely to change.

-

Import olevba

-

First, import the oletools.olevba package, using at least the VBA_Parser and VBA_Scanner classes:

-
from oletools.olevba import VBA_Parser, TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML, TYPE_MHTML
-

Parse a MS Office file - VBA_Parser

-

To parse a file on disk, create an instance of the VBA_Parser class, providing the name of the file to open as a parameter. For example:

-
vbaparser = VBA_Parser('my_file_with_macros.doc')
-

The file may also be provided as a bytes string containing its data. In that case, a filename must still be provided for reference, and the file content is passed via the data parameter. For example:

-
myfile = 'my_file_with_macros.doc'
-filedata = open(myfile, 'rb').read()
-vbaparser = VBA_Parser(myfile, data=filedata)
-

VBA_Parser will raise an exception if the file is not a supported format, such as OLE (MS Office 97-2003), OpenXML (MS Office 2007+), MHTML or Word 2003 XML.

-

After parsing the file, the attribute VBA_Parser.type is a string indicating the file type. It can be TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML or TYPE_MHTML (constants defined in the olevba module).

-

Detect VBA macros

-

The method detect_vba_macros of a VBA_Parser object returns True if VBA macros have been found in the file, and False otherwise.

-
if vbaparser.detect_vba_macros():
-    print 'VBA Macros found'
-else:
-    print 'No VBA Macros found'
-

Note: The detection algorithm looks for streams and storages with specific names in the OLE structure, which works fine for all the supported formats listed above. However, for some formats such as PowerPoint 97-2003, this method always returns False, because VBA macros are stored in a different way which is not yet supported by olevba.

-

Moreover, if the file contains an embedded document (e.g. an Excel workbook inserted into a Word document), this method may return True if the embedded document contains VBA Macros, even if the main document does not.

-

Extract VBA Macro Source Code

-

The method extract_macros extracts and decompresses the source code of each VBA macro found in the file (possibly including embedded files). It is a generator yielding a tuple (filename, stream_path, vba_filename, vba_code) for each macro.

-
  • filename: If the file is OLE (MS Office 97-2003), filename is the path of the file. If the file is OpenXML (MS Office 2007+), filename is the path of the OLE subfile containing VBA macros within the zip archive, e.g. word/vbaProject.bin.
  • stream_path: path of the OLE stream containing the VBA macro source code
  • vba_filename: corresponding VBA filename
  • vba_code: string containing the VBA source code in clear text

Example:

-
for (filename, stream_path, vba_filename, vba_code) in vbaparser.extract_macros():
-    print '-'*79
-    print 'Filename    :', filename
-    print 'OLE stream  :', stream_path
-    print 'VBA filename:', vba_filename
-    print '- '*39
-    print vba_code
-

Alternatively, the VBA_Parser method extract_all_macros returns the same results as a list of tuples.
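The list form is convenient for quick post-processing. For example, this small helper (hypothetical, not part of olevba) totals the lines of macro code per OLE stream, working purely on the documented tuple shape:

```python
def macro_lines_per_stream(macros):
    """macros: list of (filename, stream_path, vba_filename, vba_code)
    tuples, as returned by VBA_Parser.extract_all_macros()."""
    totals = {}
    for _filename, stream_path, _vba_filename, vba_code in macros:
        totals[stream_path] = totals.get(stream_path, 0) + len(vba_code.splitlines())
    return totals
```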

-

Analyze VBA Source Code

-

Since version 0.40, the VBA_Parser class provides simpler methods than VBA_Scanner to analyze all the macros contained in a file.

-

The method analyze_macros from the class VBA_Parser can be used to scan the source code of all VBA modules to find obfuscated strings, suspicious keywords, IOCs, auto-executable macros, etc.

-

analyze_macros() takes an optional argument show_decoded_strings: if set to True, the results will contain all the encoded strings found in the code (Hex, Base64, Dridex) together with their decoded values. By default, only the strings which contain printable characters are included.

-

VBA_Parser.analyze_macros() returns a list of tuples (type, keyword, description), one for each item in the results.

-
  • type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String', 'Dridex String' or 'VBA obfuscated Strings'.
  • keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is the decoded value of the string.
  • description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string.

Example:

-
results = vbaparser.analyze_macros()
-for kw_type, keyword, description in results:
-    print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description)
-

After calling analyze_macros, the following VBA_Parser attributes also provide the number of items found for each category:

-
print 'AutoExec keywords: %d' % vbaparser.nb_autoexec
-print 'Suspicious keywords: %d' % vbaparser.nb_suspicious
-print 'IOCs: %d' % vbaparser.nb_iocs
-print 'Hex obfuscated strings: %d' % vbaparser.nb_hexstrings
-print 'Base64 obfuscated strings: %d' % vbaparser.nb_base64strings
-print 'Dridex obfuscated strings: %d' % vbaparser.nb_dridexstrings
-print 'VBA obfuscated strings: %d' % vbaparser.nb_vbastrings
-

Deobfuscate VBA Macro Source Code

-

The method reveal attempts to deobfuscate the macro source code by replacing all the obfuscated strings with their decoded content. It returns a single string.

-

Example:

-
print vbaparser.reveal()
-

Close the VBA_Parser

-

After usage, it is recommended to call the close method of the VBA_Parser object, to make sure the file is closed, especially if your application parses many files.

-
vbaparser.close()
-
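Since VBA_Parser exposes a close() method, the call can also be guaranteed with contextlib.closing, even if an exception is raised mid-analysis. A sketch of the pattern, using a stand-in class so it runs without oletools (any object with a close() method behaves the same way):

```python
from contextlib import closing

class _DummyParser(object):
    """Stand-in for VBA_Parser, used only to illustrate the pattern."""
    def __init__(self):
        self.closed = False
    def close(self):
        self.closed = True

parser = _DummyParser()
with closing(parser):
    pass  # ... detect_vba_macros(), extract_macros(), analyze_macros(), etc.
assert parser.closed  # close() was called automatically on exit
```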
-

Deprecated API

-

The following methods and functions are still functional, but their use is no longer recommended, since they have been replaced by better solutions.

-

VBA_Scanner (deprecated)

-

The class VBA_Scanner can be used to scan the source code of a VBA module to find obfuscated strings, suspicious keywords, IOCs, auto-executable macros, etc.

-

First, create a VBA_Scanner object with a string containing the VBA source code (for example returned by the extract_macros method). Then call the methods scan or scan_summary to get the results of the analysis.

-

scan() takes an optional argument include_decoded_strings: if set to True, the results will contain all the encoded strings found in the code (Hex, Base64, Dridex) with their decoded value.

-

scan returns a list of tuples (type, keyword, description), one for each item in the results.

-
  • type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String'.
  • keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is the decoded value of the string.
  • description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string.

Example:

-
vba_scanner = VBA_Scanner(vba_code)
-results = vba_scanner.scan(include_decoded_strings=True)
-for kw_type, keyword, description in results:
-    print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description)
-

The function scan_vba is a shortcut for VBA_Scanner(vba_code).scan():

-
results = scan_vba(vba_code, include_decoded_strings=True)
-for kw_type, keyword, description in results:
-    print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description)
-

scan_summary returns a tuple with the number of items found for each category: (autoexec, suspicious, IOCs, hex, base64, dridex).
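Positional 6-tuples are easy to mix up, so the counters can be labelled for readability. A sketch (the key order follows the sentence above; the helper itself is hypothetical):

```python
def summary_dict(summary):
    """Turn the scan_summary() 6-tuple into a labelled dict."""
    keys = ('autoexec', 'suspicious', 'iocs', 'hex', 'base64', 'dridex')
    return dict(zip(keys, summary))
```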

-

Detect auto-executable macros (deprecated)

-

Deprecated: It is preferable to use either scan_vba or VBA_Scanner to get all results at once.

-

The function detect_autoexec checks whether VBA macro code contains specific macro names that will be triggered when the document/workbook is opened, closed, changed, etc.

-

It returns a list of tuples, each containing two strings: the detected keyword and the description of the trigger (see the malware example above).

-

Sample usage:

-
from oletools.olevba import detect_autoexec
-autoexec_keywords = detect_autoexec(vba_code)
-if autoexec_keywords:
-    print 'Auto-executable macro keywords found:'
-    for keyword, description in autoexec_keywords:
-        print '%s: %s' % (keyword, description)
-else:
-    print 'Auto-executable macro keywords: None found'
-

Detect suspicious VBA keywords (deprecated)

-

Deprecated: It is preferable to use either scan_vba or VBA_Scanner to get all results at once.

-

The function detect_suspicious checks whether VBA macro code contains specific keywords often used by malware to act on the system (create files, run commands or applications, write to the registry, etc.).

-

It returns a list of tuples, each containing two strings: the detected keyword and a description of the corresponding malicious behaviour (see the malware example above).

-

Sample usage:

-
from oletools.olevba import detect_suspicious
-suspicious_keywords = detect_suspicious(vba_code)
-if suspicious_keywords:
-    print 'Suspicious VBA keywords found:'
-    for keyword, description in suspicious_keywords:
-        print '%s: %s' % (keyword, description)
-else:
-    print 'Suspicious VBA keywords: None found'
-

Extract potential IOCs (deprecated)

-

Deprecated: It is preferable to use either scan_vba or VBA_Scanner to get all results at once.

-

The function detect_patterns checks whether VBA macro code contains specific patterns of interest that may be useful for malware analysis and detection (potential Indicators of Compromise): IP addresses, e-mail addresses, URLs, and executable file names.

-

It returns a list of tuples, each containing two strings: the pattern type and the extracted value (see the malware example above).

-

Sample usage:

-
from oletools.olevba import detect_patterns
-patterns = detect_patterns(vba_code)
-if patterns:
-    print 'Patterns found:'
-    for pattern_type, value in patterns:
-        print '%s: %s' % (pattern_type, value)
-else:
-    print 'Patterns: None found'
-
-

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/olevba.md remnux-oletools-0.51a/oletools/doc/olevba.md --- remnux-oletools-0.51a/oletools/doc/olevba.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/olevba.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,544 +0,0 @@ -olevba -====== - -olevba is a script to parse OLE and OpenXML files such as MS Office documents -(e.g. Word, Excel), to **detect VBA Macros**, extract their **source code** in clear text, -and detect security-related patterns such as **auto-executable macros**, **suspicious -VBA keywords** used by malware, anti-sandboxing and anti-virtualization techniques, -and potential **IOCs** (IP addresses, URLs, executable filenames, etc). -It also detects and decodes several common **obfuscation methods including Hex encoding, -StrReverse, Base64, Dridex, VBA expressions**, and extracts IOCs from decoded strings. - -It can be used either as a command-line tool, or as a python module from your own applications. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -olevba is based on source code from [officeparser](https://github.com/unixfreak0037/officeparser) -by John William Davison, with significant modifications. 
- -## Supported formats - -- Word 97-2003 (.doc, .dot) -- Word 2007+ (.docm, .dotm) -- Word 2003 XML (.xml) -- Word/Excel MHTML, aka Single File Web Page (.mht) -- Excel 97-2003 (.xls) -- Excel 2007+ (.xlsm, .xlsb) -- PowerPoint 2007+ (.pptm, .ppsm) -- Text file containing VBA or VBScript source code -- Password-protected Zip archive containing any of the above - -## Main Features - -- Detect VBA macros in MS Office 97-2003 and 2007+ files, XML, MHT -- Extract VBA macro source code -- Detect auto-executable macros -- Detect suspicious VBA keywords often used by malware -- Detect anti-sandboxing and anti-virtualization techniques -- Detect and decodes strings obfuscated with Hex/Base64/StrReverse/Dridex -- Deobfuscates VBA expressions with any combination of Chr, Asc, Val, StrReverse, Environ, +, &, using a VBA parser built with -[pyparsing](http://pyparsing.wikispaces.com), including custom Hex and Base64 encodings -- Extract IOCs/patterns of interest such as IP addresses, URLs, e-mail addresses and executable file names -- Scan multiple files and sample collections (wildcards, recursive) -- Triage mode for a summary view of multiple files -- Scan malware samples in password-protected Zip archives -- Python API to use olevba from your applications - -MS Office files encrypted with a password are also supported, because VBA macro code is never -encrypted, only the content of the document. - -## About VBA Macros - -See [this article](http://www.decalage.info/en/vba_tools) for more information and technical details about VBA Macros -and how they are stored in MS Office documents. - -## How it works - -1. olevba checks the file type: If it is an OLE file (i.e MS Office 97-2003), it is parsed right away. -1. If it is a zip file (i.e. MS Office 2007+), XML or MHTML, olevba looks for all OLE files stored in it (e.g. vbaProject.bin, editdata.mso), and opens them. -1. olevba identifies all the VBA projects stored in the OLE structure. -1. 
Each VBA project is parsed to find the corresponding OLE streams containing macro code. -1. In each of these OLE streams, the VBA macro source code is extracted and decompressed (RLE compression). -1. olevba looks for specific strings obfuscated with various algorithms (Hex, Base64, StrReverse, Dridex, VBA expressions). -1. olevba scans the macro source code and the deobfuscated strings to find suspicious keywords, auto-executable macros -and potential IOCs (URLs, IP addresses, e-mail addresses, executable filenames, etc). - - -## Usage - -```text -Usage: olevba.py [options] [filename2 ...] - -Options: - -h, --help show this help message and exit - -r find files recursively in subdirectories. - -z ZIP_PASSWORD, --zip=ZIP_PASSWORD - if the file is a zip archive, open all files from it, - using the provided password (requires Python 2.6+) - -f ZIP_FNAME, --zipfname=ZIP_FNAME - if the file is a zip archive, file(s) to be opened - within the zip. Wildcards * and ? are supported. - (default:*) - -t, --triage triage mode, display results as a summary table - (default for multiple files) - -d, --detailed detailed mode, display full results (default for - single file) - -a, --analysis display only analysis results, not the macro source - code - -c, --code display only VBA source code, do not analyze it - -i INPUT, --input=INPUT - input file containing VBA source code to be analyzed - (no parsing) - --decode display all the obfuscated strings with their decoded - content (Hex, Base64, StrReverse, Dridex, VBA). - --attr display the attribute lines at the beginning of VBA - source code - --reveal display the macro source code after replacing all the - obfuscated strings by their decoded content. 
-``` - -### Examples - -Scan a single file: - -```text -olevba.py file.doc -``` - -Scan a single file, stored in a Zip archive with password "infected": - -```text -olevba.py malicious_file.xls.zip -z infected -``` - -Scan a single file, showing all obfuscated strings decoded: - -```text -olevba.py file.doc --decode -``` - -Scan a single file, showing the macro source code with VBA strings deobfuscated: - -```text -olevba.py file.doc --reveal -``` - -Scan VBA source code extracted into a text file: - -```text -olevba.py source_code.vba -``` - -Scan a collection of files stored in a folder: - -```text -olevba.py "MalwareZoo/VBA/*" -``` -NOTE: On Linux, MacOSX and other Unix variants, it is required to add double quotes around wildcards. Otherwise, they will be expanded by the shell instead of olevba. - -Scan all .doc and .xls files, recursively in all subfolders: - -```text -olevba.py "MalwareZoo/VBA/*.doc" "MalwareZoo/VBA/*.xls" -r -``` - -Scan all .doc files within all .zip files with password, recursively: - -```text -olevba.py "MalwareZoo/VBA/*.zip" -r -z infected -f "*.doc" -``` - - -### Detailed analysis mode (default for single file) - -When a single file is scanned, or when using the option -d, all details of the analysis are displayed. 
- -For example, checking the malware sample [DIAN_caso-5415.doc](https://malwr.com/analysis/M2I4YWRhM2IwY2QwNDljN2E3ZWFjYTg3ODk4NmZhYmE/): - -```text ->olevba.py c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip -z infected -=============================================================================== -FILE: DIAN_caso-5415.doc.malware in c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip -Type: OLE -------------------------------------------------------------------------------- -VBA MACRO ThisDocument.cls -in file: DIAN_caso-5415.doc.malware - OLE stream: Macros/VBA/ThisDocument -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Option Explicit -Private Declare Function URLDownloadToFileA Lib "urlmon" (ByVal FVQGKS As Long,_ -ByVal WSGSGY As String, ByVal IFRRFV As String, ByVal NCVOLV As Long, _ -ByVal HQTLDG As Long) As Long -Sub AutoOpen() - Auto_Open -End Sub -Sub Auto_Open() -SNVJYQ -End Sub -Public Sub SNVJYQ() - [Malicious Code...] -End Sub -Function OGEXYR(XSTAHU As String, PHHWIV As String) As Boolean - [Malicious Code...] - Application.DisplayAlerts = False - Application.Quit -End Function -Sub Workbook_Open() - Auto_Open -End Sub - -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -ANALYSIS: -+------------+----------------------+-----------------------------------------+ -| Type | Keyword | Description | -+------------+----------------------+-----------------------------------------+ -| AutoExec | AutoOpen | Runs when the Word document is opened | -| AutoExec | Auto_Open | Runs when the Excel Workbook is opened | -| AutoExec | Workbook_Open | Runs when the Excel Workbook is opened | -| Suspicious | Lib | May run code from a DLL | -| Suspicious | Shell | May run an executable file or a system | -| | | command | -| Suspicious | Environ | May read system environment variables | -| Suspicious | URLDownloadToFileA | May download files from the Internet | -| IOC | http://germanya.com. 
| URL | -| | ec/logs/test.exe" | | -| IOC | http://germanya.com. | URL | -| | ec/logs/counter.php" | | -| IOC | germanya.com | Executable file name | -| IOC | test.exe | Executable file name | -| IOC | sfjozjero.exe | Executable file name | -+------------+----------------------+-----------------------------------------+ -``` - -### Triage mode (default for multiple files) - -When several files are scanned, or when using the option -t, a summary of the analysis for each file is displayed. -This is more convenient for quick triage of a collection of suspicious files. - -The following flags show the results of the analysis: - -- **OLE**: the file type is OLE, for example MS Office 97-2003 -- **OpX**: the file type is OpenXML, for example MS Office 2007+ -- **XML**: the file type is Word 2003 XML -- **MHT**: the file type is Word MHTML, aka Single File Web Page (.mht) -- **?**: the file type is not supported -- **M**: contains VBA Macros -- **A**: auto-executable macros -- **S**: suspicious VBA keywords -- **I**: potential IOCs -- **H**: hex-encoded strings (potential obfuscation) -- **B**: Base64-encoded strings (potential obfuscation) -- **D**: Dridex-encoded strings (potential obfuscation) -- **V**: VBA string expressions (potential obfuscation) - -Here is an example: - -```text -c:\>olevba.py \MalwareZoo\VBA\samples\* -Flags Filename ------------ ----------------------------------------------------------------- -OLE:MASI--- \MalwareZoo\VBA\samples\DIAN_caso-5415.doc.malware -OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_1.doc.malware -OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_2.doc.malware -OLE:MASI--- \MalwareZoo\VBA\samples\DRIDEX_3.doc.malware -OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_4.doc.malware -OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_5.doc.malware -OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_6.doc.malware -OLE:MAS---- \MalwareZoo\VBA\samples\DRIDEX_7.doc.malware -OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_8.doc.malware -OLE:MASIHBD 
\MalwareZoo\VBA\samples\DRIDEX_9.xls.malware -OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_A.doc.malware -OLE:------- \MalwareZoo\VBA\samples\Normal_Document.doc -OLE:M------ \MalwareZoo\VBA\samples\Normal_Document_Macro.doc -OpX:MASI--- \MalwareZoo\VBA\samples\RottenKitten.xlsb.malware -OLE:MASI-B- \MalwareZoo\VBA\samples\ROVNIX.doc.malware -OLE:MA----- \MalwareZoo\VBA\samples\Word within Word macro auto.doc -``` - -## Python 3 support - olevba3 - -As of v0.50, olevba has been ported to Python 3 thanks to @sebdraven. -However, the differences between Python 2 and 3 are significant and for now -there is a separate version of olevba named olevba3 to be used with -Python 3. - --------------------------------------------------------------------------- - -## How to use olevba in Python applications - -olevba may be used to open a MS Office file, detect if it contains VBA macros, extract and analyze the VBA source code -from your own python applications. - -IMPORTANT: olevba is currently under active development, therefore this API is likely to change. - -### Import olevba - -First, import the **oletools.olevba** package, using at least the VBA_Parser and VBA_Scanner classes: - -```python -from oletools.olevba import VBA_Parser, TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML, TYPE_MHTML -``` - -### Parse a MS Office file - VBA_Parser - -To parse a file on disk, create an instance of the **VBA_Parser** class, providing the name of the file to open as parameter. -For example: - -```python -vbaparser = VBA_Parser('my_file_with_macros.doc') -``` - -The file may also be provided as a bytes string containing its data. In that case, the actual -filename must be provided for reference, and the file content with the data parameter. 
For example: - -```python -myfile = 'my_file_with_macros.doc' -filedata = open(myfile, 'rb').read() -vbaparser = VBA_Parser(myfile, data=filedata) -``` -VBA_Parser will raise an exception if the file is not a supported format, such as OLE (MS Office 97-2003), OpenXML -(MS Office 2007+), MHTML or Word 2003 XML. - -After parsing the file, the attribute **VBA_Parser.type** is a string indicating the file type. -It can be either TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML or TYPE_MHTML. (constants defined in the olevba module) - -### Detect VBA macros - -The method **detect_vba_macros** of a VBA_Parser object returns True if VBA macros have been found in the file, -False otherwise. - -```python -if vbaparser.detect_vba_macros(): - print 'VBA Macros found' -else: - print 'No VBA Macros found' -``` -Note: The detection algorithm looks for streams and storage with specific names in the OLE structure, which works fine -for all the supported formats listed above. However, for some formats such as PowerPoint 97-2003, this method will -always return False because VBA Macros are stored in a different way which is not yet supported by olevba. - -Moreover, if the file contains an embedded document (e.g. an Excel workbook inserted into a Word document), this method -may return True if the embedded document contains VBA Macros, even if the main document does not. - -### Extract VBA Macro Source Code - -The method **extract_macros** extracts and decompresses source code for each VBA macro found in the file (possibly -including embedded files). It is a generator yielding a tuple (filename, stream_path, vba_filename, vba_code) -for each VBA macro found. - -- filename: If the file is OLE (MS Office 97-2003), filename is the path of the file. - If the file is OpenXML (MS Office 2007+), filename is the path of the OLE subfile containing VBA macros within the zip archive, - e.g. word/vbaProject.bin. 
-- stream_path: path of the OLE stream containing the VBA macro source code -- vba_filename: corresponding VBA filename -- vba_code: string containing the VBA source code in clear text - -Example: - -```python -for (filename, stream_path, vba_filename, vba_code) in vbaparser.extract_macros(): - print '-'*79 - print 'Filename :', filename - print 'OLE stream :', stream_path - print 'VBA filename:', vba_filename - print '- '*39 - print vba_code -``` -Alternatively, the VBA_Parser method **extract_all_macros** returns the same results as a list of tuples. - -### Analyze VBA Source Code - -Since version 0.40, the VBA_Parser class provides simpler methods than VBA_Scanner to analyze all macros contained -in a file: - -The method **analyze_macros** from the class **VBA_Parser** can be used to scan the source code of all -VBA modules to find obfuscated strings, suspicious keywords, IOCs, auto-executable macros, etc. - -analyze_macros() takes an optional argument show_decoded_strings: if set to True, the results will contain all the encoded -strings found in the code (Hex, Base64, Dridex) with their decoded value. -By default, it will only include the strings which contain printable characters. - -**VBA_Parser.analyze_macros()** returns a list of tuples (type, keyword, description), one for each item in the results. - -- type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String', 'Dridex String' or - 'VBA obfuscated Strings'. -- keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is - the decoded value of the string. -- description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string. 
- -Example: - -```python -results = vbaparser.analyze_macros() -for kw_type, keyword, description in results: - print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description) -``` -After calling analyze_macros, the following VBA_Parser attributes also provide the number -of items found for each category: - -```python -print 'AutoExec keywords: %d' % vbaparser.nb_autoexec -print 'Suspicious keywords: %d' % vbaparser.nb_suspicious -print 'IOCs: %d' % vbaparser.nb_iocs -print 'Hex obfuscated strings: %d' % vbaparser.nb_hexstrings -print 'Base64 obfuscated strings: %d' % vbaparser.nb_base64strings -print 'Dridex obfuscated strings: %d' % vbaparser.nb_dridexstrings -print 'VBA obfuscated strings: %d' % vbaparser.nb_vbastrings -``` - -### Deobfuscate VBA Macro Source Code - -The method **reveal** attempts to deobfuscate the macro source code by replacing all -the obfuscated strings by their decoded content. Returns a single string. - -Example: - -```python -print vbaparser.reveal() -``` - -### Close the VBA_Parser - -After usage, it is better to call the **close** method of the VBA_Parser object, to make sure the file is closed, -especially if your application is parsing many files. - -```python -vbaparser.close() -``` - --------------------------------------------------------------------------- - -## Deprecated API - -The following methods and functions are still functional, but their usage is not recommended -since they have been replaced by better solutions. - -### VBA_Scanner (deprecated) - -The class **VBA_Scanner** can be used to scan the source code of a VBA module to find obfuscated strings, -suspicious keywords, IOCs, auto-executable macros, etc. - -First, create a VBA_Scanner object with a string containing the VBA source code (for example returned by the -extract_macros method). Then call the methods **scan** or **scan_summary** to get the results of the analysis. 
- -scan() takes an optional argument include_decoded_strings: if set to True, the results will contain all the encoded -strings found in the code (Hex, Base64, Dridex) with their decoded value. - -**scan** returns a list of tuples (type, keyword, description), one for each item in the results. - -- type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String'. -- keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is - the decoded value of the string. -- description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string. - -Example: - -```python -vba_scanner = VBA_Scanner(vba_code) -results = vba_scanner.scan(include_decoded_strings=True) -for kw_type, keyword, description in results: - print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description) -``` -The function **scan_vba** is a shortcut for VBA_Scanner(vba_code).scan(): - -```python -results = scan_vba(vba_code, include_decoded_strings=True) -for kw_type, keyword, description in results: - print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description) -``` -**scan_summary** returns a tuple with the number of items found for each category: -(autoexec, suspicious, IOCs, hex, base64, dridex). - - -### Detect auto-executable macros (deprecated) - -**Deprecated**: It is preferable to use either scan_vba or VBA_Scanner to get all results at once. - -The function **detect_autoexec** checks if VBA macro code contains specific macro names -that will be triggered when the document/workbook is opened, closed, changed, etc. - -It returns a list of tuples containing two strings, the detected keyword, and the -description of the trigger. 
(See the malware example above) - -Sample usage: - -```python -from oletools.olevba import detect_autoexec -autoexec_keywords = detect_autoexec(vba_code) -if autoexec_keywords: - print 'Auto-executable macro keywords found:' - for keyword, description in autoexec_keywords: - print '%s: %s' % (keyword, description) -else: - print 'Auto-executable macro keywords: None found' -``` - -### Detect suspicious VBA keywords (deprecated) - -**Deprecated**: It is preferable to use either scan_vba or VBA_Scanner to get all results at once. - -The function **detect_suspicious** checks if VBA macro code contains specific -keywords often used by malware to act on the system (create files, run -commands or applications, write to the registry, etc). - -It returns a list of tuples containing two strings, the detected keyword, and the -description of the corresponding malicious behaviour. (See the malware example above) - -Sample usage: - -```python -from oletools.olevba import detect_suspicious -suspicious_keywords = detect_suspicious(vba_code) -if suspicious_keywords: - print 'Suspicious VBA keywords found:' - for keyword, description in suspicious_keywords: - print '%s: %s' % (keyword, description) -else: - print 'Suspicious VBA keywords: None found' -``` - -### Extract potential IOCs (deprecated) - -**Deprecated**: It is preferable to use either scan_vba or VBA_Scanner to get all results at once. - -The function **detect_patterns** checks if VBA macro code contains specific -patterns of interest, that may be useful for malware analysis and detection -(potential Indicators of Compromise): IP addresses, e-mail addresses, -URLs, executable file names. - -It returns a list of tuples containing two strings, the pattern type, and the -extracted value. 
(See the malware example above) - -Sample usage: - -```python -from oletools.olevba import detect_patterns -patterns = detect_patterns(vba_code) -if patterns: - print 'Patterns found:' - for pattern_type, value in patterns: - print '%s: %s' % (pattern_type, value) -else: - print 'Patterns: None found' -``` - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/pyxswf.html remnux-oletools-0.51a/oletools/doc/pyxswf.html --- remnux-oletools-0.51a/oletools/doc/pyxswf.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/pyxswf.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,83 +0,0 @@ - - - - - - - - - - -

pyxswf

-

pyxswf is a script to detect, extract and analyze Flash objects (SWF files) that may be embedded in files such as MS Office documents (e.g. Word, Excel), which is especially useful for malware analysis.

-

It is part of the python-oletools package.

-

pyxswf is an extension to xxxswf.py published by Alexander Hanel.

-

Compared to xxxswf, it can extract streams from MS Office documents by parsing their OLE structure properly, which is necessary when streams are fragmented. Stream fragmentation is a known obfuscation technique, as explained on http://www.breakingpointsystems.com/resources/blog/evasion-with-ole2-fragmentation/

-

It can also extract Flash objects from RTF documents, by parsing embedded objects encoded in hexadecimal format (-f option).

-

In practice, simply add the -o option to work on OLE streams rather than raw files, or the -f option to work on RTF files.

-

Usage

-
Usage: pyxswf.py [options] <file.bad>

Options:
  -o, --ole             Parse an OLE file (e.g. Word, Excel) to look for SWF
                        in each stream
  -f, --rtf             Parse an RTF file to look for SWF in each embedded
                        object
  -x, --extract         Extracts the embedded SWF(s), names it MD5HASH.swf &
                        saves it in the working dir. No addition args needed
  -h, --help            show this help message and exit
  -y, --yara            Scans the SWF(s) with yara. If the SWF(s) is
                        compressed it will be deflated. No addition args
                        needed
  -s, --md5scan         Scans the SWF(s) for MD5 signatures. Please see func
                        checkMD5 to define hashes. No addition args needed
  -H, --header          Displays the SWFs file header. No addition args needed
  -d, --decompress      Deflates compressed SWFS(s)
  -r PATH, --recdir=PATH
                        Will recursively scan a directory for files that
                        contain SWFs. Must provide path in quotes
  -c, --compress        Compresses the SWF using Zlib

Example 1 - detecting and extracting a SWF file from a Word document on Windows:

C:\oletools>pyxswf.py -o word_flash.doc
OLE stream: 'Contents'
[SUMMARY] 1 SWF(s) in MD5:993664cc86f60d52d671b6610813cfd1:Contents
        [ADDR] SWF 1 at 0x8  - FWS Header

C:\oletools>pyxswf.py -xo word_flash.doc
OLE stream: 'Contents'
[SUMMARY] 1 SWF(s) in MD5:993664cc86f60d52d671b6610813cfd1:Contents
        [ADDR] SWF 1 at 0x8  - FWS Header
                [FILE] Carved SWF MD5: 2498e9c0701dc0e461ab4358f9102bc5.swf

Example 2 - detecting and extracting a SWF file from an RTF document on Windows:

C:\oletools>pyxswf.py -xf "rtf_flash.rtf"
RTF embedded object size 1498557 at index 000036DD
[SUMMARY] 1 SWF(s) in MD5:46a110548007e04f4043785ac4184558:RTF_embedded_object_000036DD
        [ADDR] SWF 1 at 0xc40  - FWS Header
                [FILE] Carved SWF MD5: 2498e9c0701dc0e461ab4358f9102bc5.swf
-

How to use pyxswf in Python applications
----------------------------------------

TODO
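Until a Python API is documented, one pragmatic way to drive pyxswf from another Python program is to shell out to the command-line interface, using only the documented options above. The following is a minimal sketch, not part of pyxswf itself: the helper names are hypothetical, and pyxswf.py is assumed to be on the PATH.

```python
import subprocess

def build_pyxswf_cmd(path, rtf=False, extract=False):
    # Assemble a pyxswf.py command line from the documented options:
    # -o for OLE files (Word/Excel), -f for RTF, -x to carve out the SWF(s).
    cmd = ["pyxswf.py", "-f" if rtf else "-o"]
    if extract:
        cmd.append("-x")
    cmd.append(path)
    return cmd

def scan_document(path, rtf=False, extract=False):
    # Run pyxswf and return its textual report (the [SUMMARY]/[ADDR] lines).
    return subprocess.check_output(build_pyxswf_cmd(path, rtf=rtf, extract=extract),
                                   universal_newlines=True)
```

For instance, scan_document("word_flash.doc", extract=True) would correspond to Example 1 above (pyxswf.py -xo word_flash.doc, modulo option grouping).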

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/pyxswf.md remnux-oletools-0.51a/oletools/doc/pyxswf.md --- remnux-oletools-0.51a/oletools/doc/pyxswf.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/pyxswf.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,97 +0,0 @@ -pyxswf -====== - -pyxswf is a script to detect, extract and analyze Flash objects (SWF files) that may -be embedded in files such as MS Office documents (e.g. Word, Excel), -which is especially useful for malware analysis. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -pyxswf is an extension to [xxxswf.py](http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html) published by Alexander Hanel. - -Compared to xxxswf, it can extract streams from MS Office documents by parsing -their OLE structure properly, which is necessary when streams are fragmented. -Stream fragmentation is a known obfuscation technique, as explained on -[http://www.breakingpointsystems.com/resources/blog/evasion-with-ole2-fragmentation/](http://web.archive.org/web/20121118021207/http://www.breakingpointsystems.com/resources/blog/evasion-with-ole2-fragmentation/) - -It can also extract Flash objects from RTF documents, by parsing embedded objects encoded in hexadecimal format (-f option). - -For this, simply add the -o option to work on OLE streams rather than raw files, or the -f option to work on RTF files. - -## Usage - -```text -Usage: pyxswf.py [options] - -Options: - -o, --ole Parse an OLE file (e.g. Word, Excel) to look for SWF - in each stream - -f, --rtf Parse an RTF file to look for SWF in each embedded - object - -x, --extract Extracts the embedded SWF(s), names it MD5HASH.swf & - saves it in the working dir. No addition args needed - -h, --help show this help message and exit - -y, --yara Scans the SWF(s) with yara. If the SWF(s) is - compressed it will be deflated. No addition args - needed - -s, --md5scan Scans the SWF(s) for MD5 signatures. 
Please see func - checkMD5 to define hashes. No addition args needed - -H, --header Displays the SWFs file header. No addition args needed - -d, --decompress Deflates compressed SWFS(s) - -r PATH, --recdir=PATH - Will recursively scan a directory for files that - contain SWFs. Must provide path in quotes - -c, --compress Compresses the SWF using Zlib -``` - -### Example 1 - detecting and extracting a SWF file from a Word document on Windows: - -```text -C:\oletools>pyxswf.py -o word_flash.doc -OLE stream: 'Contents' -[SUMMARY] 1 SWF(s) in MD5:993664cc86f60d52d671b6610813cfd1:Contents - [ADDR] SWF 1 at 0x8 - FWS Header - -C:\oletools>pyxswf.py -xo word_flash.doc -OLE stream: 'Contents' -[SUMMARY] 1 SWF(s) in MD5:993664cc86f60d52d671b6610813cfd1:Contents - [ADDR] SWF 1 at 0x8 - FWS Header - [FILE] Carved SWF MD5: 2498e9c0701dc0e461ab4358f9102bc5.swf -``` - -### Example 2 - detecting and extracting a SWF file from a RTF document on Windows: - -```text -C:\oletools>pyxswf.py -xf "rtf_flash.rtf" -RTF embedded object size 1498557 at index 000036DD -[SUMMARY] 1 SWF(s) in MD5:46a110548007e04f4043785ac4184558:RTF_embedded_object_0 -00036DD - [ADDR] SWF 1 at 0xc40 - FWS Header - [FILE] Carved SWF MD5: 2498e9c0701dc0e461ab4358f9102bc5.swf -``` - -## How to use pyxswf in Python applications - -TODO - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/doc/rtfobj.html remnux-oletools-0.51a/oletools/doc/rtfobj.html --- remnux-oletools-0.51a/oletools/doc/rtfobj.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/rtfobj.html 1970-01-01 
00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ - - - - - - - - - - - -

rtfobj
======

rtfobj is a Python module to detect and extract embedded objects stored in RTF files, such as OLE objects. It can also detect OLE Package objects, and extract the embedded files.

Since v0.50, rtfobj contains a custom RTF parser designed to match MS Word's behaviour, in order to handle obfuscated RTF files. See my article "Anti-Analysis Tricks in Weaponized RTF" for concrete examples.

rtfobj can be used as a Python library or a command-line tool.

It is part of the python-oletools package.

Usage
-----
rtfobj [options] <filename> [filename2 ...]

Options:
  -h, --help            show this help message and exit
  -r                    find files recursively in subdirectories.
  -z ZIP_PASSWORD, --zip=ZIP_PASSWORD
                        if the file is a zip archive, open first file from it,
                        using the provided password (requires Python 2.6+)
  -f ZIP_FNAME, --zipfname=ZIP_FNAME
                        if the file is a zip archive, file(s) to be opened
                        within the zip. Wildcards * and ? are supported.
                        (default:*)
  -l LOGLEVEL, --loglevel=LOGLEVEL
                        logging level debug/info/warning/error/critical
                        (default=warning)
  -s SAVE_OBJECT, --save=SAVE_OBJECT
                        Save the object corresponding to the provided number
                        to a file, for example "-s 2". Use "-s all" to save
                        all objects at once.
  -d OUTPUT_DIR         use specified directory to save output files.

rtfobj displays a list of the OLE and Package objects that have been detected, with their attributes such as class and filename.

When an OLE Package object contains an executable file or script, it is highlighted as such. For example:

[Screenshot: rtfobj output table (rtfobj1.png)]

To extract an object or file, use the option -s followed by the object number as shown in the table.

Example:

rtfobj -s 0

It extracts and decodes the corresponding object, and saves it as a file named "object_xxxx.bin", xxxx being the location of the object in the RTF file.

How to use rtfobj in Python applications
----------------------------------------

As of v0.50, the API has changed significantly and is not final yet. For now, see the class RtfObjectParser in the code.

Deprecated API (still functional):

rtf_iter_objects(filename) is an iterator which yields a tuple (index, orig_len, object), providing the index of each hexadecimal stream in the RTF file and the corresponding decoded object.

Example:

from oletools import rtfobj
for index, orig_len, data in rtfobj.rtf_iter_objects("myfile.rtf"):
    print('found object size %d at index %08X' % (len(data), index))
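Building on the documented (deprecated but still functional) iterator, a caller can save every detected object to disk using the same "object_xxxx.bin" naming convention described above. This is a sketch, not part of the rtfobj API; the helper names are mine.

```python
def object_filename(index):
    # "object_xxxx.bin", xxxx being the hexadecimal location of the
    # object in the RTF file, zero-padded to 8 digits as in the -s output.
    return 'object_%08X.bin' % index

def extract_all_objects(rtf_path):
    # Imported here so object_filename() stays usable without oletools installed.
    from oletools import rtfobj
    saved = []
    for index, orig_len, data in rtfobj.rtf_iter_objects(rtf_path):
        fname = object_filename(index)
        with open(fname, 'wb') as f:
            f.write(data)
        saved.append(fname)
    return saved
```

For example, extract_all_objects("myfile.rtf") returns the list of files written, mirroring what "rtfobj -s all" does on the command line.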

python-oletools documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/doc/rtfobj.md remnux-oletools-0.51a/oletools/doc/rtfobj.md --- remnux-oletools-0.51a/oletools/doc/rtfobj.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/doc/rtfobj.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,102 +0,0 @@ -rtfobj -====== - -rtfobj is a Python module to detect and extract embedded objects stored -in RTF files, such as OLE objects. It can also detect OLE Package objects, -and extract the embedded files. - -Since v0.50, rtfobj contains a custom RTF parser that has been designed to -match MS Word's behaviour, in order to handle obfuscated RTF files. See my -article ["Anti-Analysis Tricks in Weaponized RTF"](http://decalage.info/rtf_tricks) -for some concrete examples. - -rtfobj can be used as a Python library or a command-line tool. - -It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. - -## Usage - -```text -rtfobj [options] [filename2 ...] - -Options: - -h, --help show this help message and exit - -r find files recursively in subdirectories. - -z ZIP_PASSWORD, --zip=ZIP_PASSWORD - if the file is a zip archive, open first file from it, - using the provided password (requires Python 2.6+) - -f ZIP_FNAME, --zipfname=ZIP_FNAME - if the file is a zip archive, file(s) to be opened - within the zip. Wildcards * and ? are supported. - (default:*) - -l LOGLEVEL, --loglevel=LOGLEVEL - logging level debug/info/warning/error/critical - (default=warning) - -s SAVE_OBJECT, --save=SAVE_OBJECT - Save the object corresponding to the provided number - to a file, for example "-s 2". Use "-s all" to save - all objects at once. - -d OUTPUT_DIR use specified directory to save output files. -``` - -rtfobj displays a list of the OLE and Package objects that have been detected, -with their attributes such as class and filename. - -When an OLE Package object contains an executable file or script, it is -highlighted as such. 
For example: - -![](rtfobj1.png) - -To extract an object or file, use the option -s followed by the object number -as shown in the table. - -Example: - -```text -rtfobj -s 0 -``` - -It extracts and decodes the corresponding object, and saves it as a file -named "object_xxxx.bin", xxxx being the location of the object in the RTF file. - - -## How to use rtfobj in Python applications - -As of v0.50, the API has changed significantly and it is not final yet. -For now, see the class RtfObjectParser in the code. - -### Deprecated API (still functional): - -rtf_iter_objects(filename) is an iterator which yields a tuple -(index, orig_len, object) providing the index of each hexadecimal stream -in the RTF file, and the corresponding decoded object. - -Example: - -```python -from oletools import rtfobj -for index, orig_len, data in rtfobj.rtf_iter_objects("myfile.rtf"): - print('found object size %d at index %08X' % (len(data), index)) -``` - --------------------------------------------------------------------------- - -python-oletools documentation ------------------------------ - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- Tools: - - [[olebrowse]] - - [[oleid]] - - [[olemeta]] - - [[oletimes]] - - [[oledir]] - - [[olemap]] - - [[olevba]] - - [[mraptor]] - - [[pyxswf]] - - [[oleobj]] - - [[rtfobj]] diff -Nru remnux-oletools-0.51a/oletools/ezhexviewer.py remnux-oletools-0.51a/oletools/ezhexviewer.py --- remnux-oletools-0.51a/oletools/ezhexviewer.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/ezhexviewer.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,157 +0,0 @@ -#!/usr/bin/env python -""" -ezhexviewer.py - -A simple hexadecimal viewer based on easygui. It should work on any platform -with Python 2.x or 3.x. 
- -Usage: ezhexviewer.py [file] - -Usage in a python application: - - import ezhexviewer - ezhexviewer.hexview_file(filename) - ezhexviewer.hexview_data(data) - - -ezhexviewer project website: http://www.decalage.info/python/ezhexviewer - -ezhexviewer is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-""" - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2012-09-17 v0.01 PL: - first version -# 2012-10-04 v0.02 PL: - added license -# 2016-09-06 v0.50 PL: - added main function for entry points in setup.py -# 2016-10-26 PL: - fixed to run on Python 2+3 - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: -# + options to set title and msg - - -from thirdparty.easygui import easygui -import sys - -# === PYTHON 2+3 SUPPORT ====================================================== - -if sys.version_info[0] >= 3: - # Python 3 specific adaptations - # py3 range = py2 xrange - xrange = range - PYTHON3 = True -else: - PYTHON3 = False - -def xord(char): - ''' - workaround for ord() to work on characters from a bytes string with - Python 2 and 3. If s is a bytes string, s[i] is a bytes string of - length 1 on Python 2, but it is an integer on Python 3... - xord(c) returns ord(c) if c is a bytes string, or c if it is already - an integer. - :param char: int or bytes of length 1 - :return: ord(c) if bytes, c if int - ''' - if isinstance(char, int): - return char - else: - return ord(char) - -def bchr(x): - ''' - workaround for chr() to return a bytes string of length 1 with - Python 2 and 3. On Python 3, chr returns a unicode string, but - on Python 2 it is a bytes string. - bchr() always returns a bytes string on Python 2+3. - :param x: int - :return: chr(x) as a bytes string - ''' - if PYTHON3: - # According to the Python 3 documentation, bytes() can be - # initialized with an iterable: - return bytes([x]) - else: - return chr(x) - -#------------------------------------------------------------------------------ -# The following code (hexdump3 only) is a modified version of the hex dumper -# recipe published on ASPN by Sebastien Keim and Raymond Hattinger under the -# PSF license. I added the startindex parameter. 
-# see http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/142812 -# PSF license: http://docs.python.org/license.html -# Copyright (c) 2001-2012 Python Software Foundation; All Rights Reserved - -FILTER = b''.join([(len(repr(bchr(x)))<=4 and x != 0x0A) and bchr(x) or b'.' for x in range(256)]) - -def hexdump3(src, length=8, startindex=0): - """ - Returns a hexadecimal dump of a binary string. - length: number of bytes per row. - startindex: index of 1st byte. - """ - result=[] - for i in xrange(0, len(src), length): - s = src[i:i+length] - hexa = ' '.join(["%02X" % xord(x) for x in s]) - printable = s.translate(FILTER) - if PYTHON3: - # On Python 3, need to convert printable from bytes to str: - printable = printable.decode('latin1') - result.append("%08X %-*s %s\n" % (i+startindex, length*3, hexa, printable)) - return ''.join(result) - -# end of PSF-licensed code. -#------------------------------------------------------------------------------ - - -def hexview_data (data, msg='', title='ezhexviewer', length=16, startindex=0): - hex = hexdump3(data, length=length, startindex=startindex) - easygui.codebox(msg=msg, title=title, text=hex) - - -def hexview_file (filename, msg='', title='ezhexviewer', length=16, startindex=0): - data = open(filename, 'rb').read() - hexview_data(data, msg=msg, title=title, length=length, startindex=startindex) - - -# === MAIN =================================================================== - -def main(): - try: - filename = sys.argv[1] - except: - filename = easygui.fileopenbox() - if filename: - try: - hexview_file(filename, msg='File: %s' % filename) - except: - easygui.exceptionbox(msg='Error:', title='ezhexviewer') - - -if __name__ == '__main__': - main() \ No newline at end of file diff -Nru remnux-oletools-0.51a/oletools/LICENSE.txt remnux-oletools-0.51a/oletools/LICENSE.txt --- remnux-oletools-0.51a/oletools/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/LICENSE.txt 1970-01-01 
00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -LICENSE for the python-oletools package: - -This license applies to the python-oletools package, apart from the thirdparty -folder which contains third-party files published with their own license. - -The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec (http://www.decalage.info) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - ----------- - -olevba contains modified source code from the officeparser project, published -under the following MIT License (MIT): - -officeparser is copyright (c) 2014 John William Davison - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff -Nru remnux-oletools-0.51a/oletools/mraptor3.py remnux-oletools-0.51a/oletools/mraptor3.py --- remnux-oletools-0.51a/oletools/mraptor3.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/mraptor3.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,335 +0,0 @@ -#!/usr/bin/env python -""" -mraptor.py - MacroRaptor - -MacroRaptor is a script to parse OLE and OpenXML files such as MS Office -documents (e.g. Word, Excel), to detect malicious macros. 
- -Supported formats: -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) -- Word 2003 XML (.xml) -- Word/Excel Single File Web Page / MHTML (.mht) - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -MacroRaptor is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -# === LICENSE ================================================================== - -# MacroRaptor is copyright (c) 2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2016-02-23 v0.01 PL: - first version -# 2016-02-29 v0.02 PL: - added Workbook_Activate, FileSaveAs -# 2016-03-04 v0.03 PL: - returns an exit code based on the overall result -# 2016-03-08 v0.04 PL: - collapse long lines before analysis -# 2016-07-19 v0.50 SL: - converted to Python 3 -# 2016-08-26 PL: - changed imports for Python 3 - -__version__ = '0.50py3' - -#------------------------------------------------------------------------------ -# TODO: - - -#--- IMPORTS ------------------------------------------------------------------ - -import sys, logging, optparse, re - -from thirdparty.xglob import xglob - -# import the python 3 version of tablestream: -from thirdparty.tablestream import tablestream - -# import the python 3 version of olevba -import olevba3 as olevba - -# === LOGGING ================================================================= - -# a global logger object used for debugging: -log = olevba.get_logger('mraptor') - - -#--- CONSTANTS ---------------------------------------------------------------- - -# URL and message to report issues: -# TODO: make it a common variable for all oletools -URL_ISSUES = 'https://github.com/decalage2/oletools/issues' -MSG_ISSUES = 'Please report this issue on %s' % URL_ISSUES - -# 'AutoExec', 'AutoOpen', 'Auto_Open', 'AutoClose', 'Auto_Close', 'AutoNew', 'AutoExit', -# 'Document_Open', 'DocumentOpen', -# 'Document_Close', 'DocumentBeforeClose', -# 'DocumentChange','Document_New', -# 'NewDocument' -# 'Workbook_Open', 'Workbook_Close', - -# TODO: check if line also contains Sub or Function -re_autoexec = re.compile(r'(?i)\b(?:Auto(?:Exec|_?Open|_?Close|Exit|New)' + - r'|Document(?:_?Open|_Close|BeforeClose|Change|_New)' + - r'|NewDocument|Workbook(?:_Open|_Activate|_Close))\b') - -# MS-VBAL 5.4.5.1 Open Statement: -RE_OPEN_WRITE = r'(?:\bOpen\b[^\n]+\b(?:Write|Append|Binary|Output|Random)\b)' - -re_write = 
re.compile(r'(?i)\b(?:FileCopy|CopyFile|Kill|CreateTextFile|' - + r'VirtualAlloc|RtlMoveMemory|URLDownloadToFileA?|AltStartupPath|' - + r'ADODB\.Stream|WriteText|SaveToFile|SaveAs|SaveAsRTF|FileSaveAs|MkDir|RmDir|SaveSetting|SetAttr)\b|' + RE_OPEN_WRITE) - -# MS-VBAL 5.2.3.5 External Procedure Declaration -RE_DECLARE_LIB = r'(?:\bDeclare\b[^\n]+\bLib\b)' - -re_execute = re.compile(r'(?i)\b(?:Shell|CreateObject|GetObject|SendKeys|' - + r'MacScript|FollowHyperlink|CreateThread|ShellExecute)\b|' + RE_DECLARE_LIB) - -# short tag to display file types in triage mode: -TYPE2TAG = { - olevba.TYPE_OLE: 'OLE', - olevba.TYPE_OpenXML: 'OpX', - olevba.TYPE_Word2003_XML: 'XML', - olevba.TYPE_MHTML: 'MHT', - olevba.TYPE_TEXT: 'TXT', -} - - -# === CLASSES ================================================================= - -class Result_NoMacro(object): - exit_code = 0 - color = 'green' - name = 'No Macro' - - -class Result_NotMSOffice(object): - exit_code = 1 - color = 'green' - name = 'Not MS Office' - - -class Result_MacroOK(object): - exit_code = 2 - color = 'cyan' - name = 'Macro OK' - - -class Result_Error(object): - exit_code = 10 - color = 'yellow' - name = 'ERROR' - - -class Result_Suspicious(object): - exit_code = 20 - color = 'red' - name = 'SUSPICIOUS' - - -class MacroRaptor(object): - """ - class to scan VBA macro code to detect if it is malicious - """ - def __init__(self, vba_code): - """ - MacroRaptor constructor - :param vba_code: string containing the VBA macro code - """ - # collapse long lines first - self.vba_code = olevba.vba_collapse_long_lines(vba_code) - self.autoexec = False - self.write = False - self.execute = False - self.flags = '' - self.suspicious = False - self.autoexec_match = None - self.write_match = None - self.execute_match = None - self.matches = [] - - def scan(self): - """ - Scan the VBA macro code to detect if it is malicious - :return: - """ - m = re_autoexec.search(self.vba_code) - if m is not None: - self.autoexec = True - 
self.autoexec_match = m.group() - self.matches.append(m.group()) - m = re_write.search(self.vba_code) - if m is not None: - self.write = True - self.write_match = m.group() - self.matches.append(m.group()) - m = re_execute.search(self.vba_code) - if m is not None: - self.execute = True - self.execute_match = m.group() - self.matches.append(m.group()) - if self.autoexec and (self.execute or self.write): - self.suspicious = True - - def get_flags(self): - flags = '' - flags += 'A' if self.autoexec else '-' - flags += 'W' if self.write else '-' - flags += 'X' if self.execute else '-' - return flags - - -# === MAIN ==================================================================== - -def main(): - """ - Main function, called when olevba is run from the command line - """ - global log - DEFAULT_LOG_LEVEL = "warning" # Default log level - LOG_LEVELS = { - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL - } - - usage = 'usage: %prog [options] [filename2 ...]' - parser = optparse.OptionParser(usage=usage) - parser.add_option("-r", action="store_true", dest="recursive", - help='find files recursively in subdirectories.') - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, - help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. 
(default:*)') - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%default)") - parser.add_option("-m", '--matches', action="store_true", dest="show_matches", - help='Show matched strings.') - - # TODO: add logfile option - - (options, args) = parser.parse_args() - - # Print help if no arguments are passed - if len(args) == 0: - print(__doc__) - parser.print_help() - print('\nAn exit code is returned based on the analysis result:') - for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious): - print(' - %d: %s' % (result.exit_code, result.name)) - sys.exit() - - # print banner with version - print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) - print('This is work in progress, please report issues at %s' % URL_ISSUES) - - logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') - # enable logging in the modules: - log.setLevel(logging.NOTSET) - - t = tablestream.TableStream(style=tablestream.TableStyleSlim, - header_row=['Result', 'Flags', 'Type', 'File'], - column_width=[10, 5, 4, 56]) - - exitcode = -1 - global_result = None - # TODO: handle errors in xglob, to continue processing the next files - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, - zip_password=options.zip_password, zip_fname=options.zip_fname): - # ignore directory names stored in zip files: - if container and filename.endswith('/'): - continue - full_name = '%s in %s' % (filename, container) if container else filename - # try: - # # Open the file - # if data is None: - # data = open(filename, 'rb').read() - # except: - # log.exception('Error when opening file %r' % full_name) - # continue - if isinstance(data, Exception): - result = Result_Error - t.write_row([result.name, '', '', full_name], - colors=[result.color, None, None, None]) - 
t.write_row(['', '', '', str(data)], - colors=[None, None, None, result.color]) - else: - filetype = '???' - try: - vba_parser = olevba.VBA_Parser(filename=filename, data=data, container=container) - filetype = TYPE2TAG[vba_parser.type] - except Exception as e: - # log.error('Error when parsing VBA macros from file %r' % full_name) - # TODO: distinguish actual errors from non-MSOffice files - result = Result_Error - t.write_row([result.name, '', filetype, full_name], - colors=[result.color, None, None, None]) - t.write_row(['', '', '', str(e)], - colors=[None, None, None, result.color]) - continue - if vba_parser.detect_vba_macros(): - vba_code_all_modules = '' - try: - for (subfilename, stream_path, vba_filename, vba_code) in vba_parser.extract_all_macros(): - vba_code_all_modules += vba_code.decode('utf-8','replace') + '\n' - except Exception as e: - # log.error('Error when parsing VBA macros from file %r' % full_name) - result = Result_Error - t.write_row([result.name, '', TYPE2TAG[vba_parser.type], full_name], - colors=[result.color, None, None, None]) - t.write_row(['', '', '', str(e)], - colors=[None, None, None, result.color]) - continue - mraptor = MacroRaptor(vba_code_all_modules) - mraptor.scan() - if mraptor.suspicious: - result = Result_Suspicious - else: - result = Result_MacroOK - t.write_row([result.name, mraptor.get_flags(), filetype, full_name], - colors=[result.color, None, None, None]) - if mraptor.matches and options.show_matches: - t.write_row(['', '', '', 'Matches: %r' % mraptor.matches]) - else: - result = Result_NoMacro - t.write_row([result.name, '', filetype, full_name], - colors=[result.color, None, None, None]) - if result.exit_code > exitcode: - global_result = result - exitcode = result.exit_code - - print('') - print('Flags: A=AutoExec, W=Write, X=Execute') - print('Exit code: %d - %s' % (exitcode, global_result.name)) - sys.exit(exitcode) - -if __name__ == '__main__': - main() - -# Soundtrack: "Dark Child" by Marlon Williams diff 
-Nru remnux-oletools-0.51a/oletools/mraptor_milter.py remnux-oletools-0.51a/oletools/mraptor_milter.py --- remnux-oletools-0.51a/oletools/mraptor_milter.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/mraptor_milter.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,399 +0,0 @@ -#!/usr/bin/env python -""" -mraptor_milter - -mraptor_milter is a milter script for the Sendmail and Postfix e-mail -servers. It parses MS Office documents (e.g. Word, Excel) to detect -malicious macros. Documents with malicious macros are removed and -replaced by harmless text files. - -Supported formats: -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) -- Word 2003 XML (.xml) -- Word/Excel Single File Web Page / MHTML (.mht) -- Publisher (.pub) - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -mraptor_milter is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -# === LICENSE ================================================================== - -# mraptor_milter is copyright (c) 2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# --- CHANGELOG -------------------------------------------------------------- -# 2016-08-08 v0.01 PL: - first version -# 2016-08-12 v0.02 PL: - added logging to file with time rotation -# - archive each e-mail to a file before filtering -# 2016-08-30 v0.03 PL: - added daemonize to run as a Unix daemon -# 2016-09-06 v0.50 PL: - fixed issue #20, is_zipfile on Python 2.6 - -__version__ = '0.50' - -# --- TODO ------------------------------------------------------------------- - -# TODO: option to run in the foreground for troubleshooting -# TODO: option to write logs to the console -# TODO: options to set listening port and interface -# TODO: config file for all parameters -# TODO: option to run as a non-privileged user -# TODO: handle files in archives - - -# --- IMPORTS ---------------------------------------------------------------- - -import Milter -import io -import time -import email -import sys -import os -import logging -import logging.handlers -import datetime -import StringIO - -from socket import AF_INET6 - -from oletools import olevba, mraptor - -from Milter.utils import parse_addr - -if sys.version_info[0] <= 2: - # Python 2.x - if sys.version_info[1] <= 6: - # Python 2.6 - # 
use is_zipfile backported from Python 2.7: - from oletools.thirdparty.zipfile27 import is_zipfile - else: - # Python 2.7 - from zipfile import is_zipfile -else: - # Python 3.x+ - from zipfile import is_zipfile - - - -# --- CONSTANTS -------------------------------------------------------------- - -# TODO: read parameters from a config file -# at postfix smtpd_milters = inet:127.0.0.1:25252 -SOCKET = "inet:25252@127.0.0.1" # bind to unix or tcp socket "inet:port@ip" or "///.sock" -TIMEOUT = 30 # Milter timeout in seconds -# CFG_DIR = "/etc/macromilter/" -# LOG_DIR = "/var/log/macromilter/" - -# TODO: different path on Windows: -LOGFILE_DIR = '/var/log/mraptor_milter' -# LOGFILE_DIR = '.' -LOGFILE_NAME = 'mraptor_milter.log' -LOGFILE_PATH = os.path.join(LOGFILE_DIR, LOGFILE_NAME) - -# Directory where to save a copy of each received e-mail: -ARCHIVE_DIR = '/var/log/mraptor_milter' -# ARCHIVE_DIR = '.' - -# file to store PID for daemonize -PIDFILE = "/tmp/mraptor_milter.pid" - - - -# === LOGGING ================================================================ - -# Set up a specific logger with our desired output level -log = logging.getLogger('MRMilter') - -# disable logging by default - enable it in main app: -log.setLevel(logging.CRITICAL+1) - -# NOTE: all logging config is done in the main app, not here. - -# === CLASSES ================================================================ - -# Inspired from https://github.com/jmehnle/pymilter/blob/master/milter-template.py - -class MacroRaptorMilter(Milter.Base): - ''' - ''' - def __init__(self): - # A new instance with each new connection. - # each connection runs in its own thread and has its own myMilter - # instance. Python code must be thread safe. This is trivial if only stuff - # in myMilter instances is referenced. - self.id = Milter.uniqueID() # Integer incremented with each call. 
- self.message = None - self.IP = None - self.port = None - self.flow = None - self.scope = None - self.IPname = None # Name from a reverse IP lookup - - @Milter.noreply - def connect(self, IPname, family, hostaddr): - ''' - New connection (may contain several messages) - :param IPname: Name from a reverse IP lookup - :param family: IP version 4 (AF_INET) or 6 (AF_INET6) - :param hostaddr: tuple (IP, port [, flow, scope]) - :return: Milter.CONTINUE - ''' - # Examples: - # (self, 'ip068.subnet71.example.com', AF_INET, ('215.183.71.68', 4720) ) - # (self, 'ip6.mxout.example.com', AF_INET6, - # ('3ffe:80e8:d8::1', 4720, 1, 0) ) - self.IP = hostaddr[0] - self.port = hostaddr[1] - if family == AF_INET6: - self.flow = hostaddr[2] - self.scope = hostaddr[3] - else: - self.flow = None - self.scope = None - self.IPname = IPname # Name from a reverse IP lookup - self.message = None # content - log.info("[%d] connect from host %s at %s" % (self.id, IPname, hostaddr)) - return Milter.CONTINUE - - @Milter.noreply - def envfrom(self, mailfrom, *rest): - ''' - Mail From - Called at the beginning of each message within a connection - :param mailfrom: - :param str: - :return: Milter.CONTINUE - ''' - self.message = io.BytesIO() - # NOTE: self.message is only an *internal* copy of message data. You - # must use addheader, chgheader, replacebody to change the message - # on the MTA. 
- self.canon_from = '@'.join(parse_addr(mailfrom)) - self.message.write('From %s %s\n' % (self.canon_from, time.ctime())) - log.debug('[%d] Mail From %s %s\n' % (self.id, self.canon_from, time.ctime())) - log.debug('[%d] mailfrom=%r, rest=%r' % (self.id, mailfrom, rest)) - return Milter.CONTINUE - - @Milter.noreply - def envrcpt(self, to, *rest): - ''' - RCPT TO - :param to: - :param str: - :return: Milter.CONTINUE - ''' - log.debug('[%d] RCPT TO %r, rest=%r\n' % (self.id, to, rest)) - return Milter.CONTINUE - - @Milter.noreply - def header(self, header_field, header_field_value): - ''' - Add header - :param header_field: - :param header_field_value: - :return: Milter.CONTINUE - ''' - self.message.write("%s: %s\n" % (header_field, header_field_value)) - return Milter.CONTINUE - - @Milter.noreply - def eoh(self): - ''' - End of headers - :return: Milter.CONTINUE - ''' - self.message.write("\n") - return Milter.CONTINUE - - @Milter.noreply - def body(self, chunk): - ''' - Message body (chunked) - :param chunk: - :return: Milter.CONTINUE - ''' - self.message.write(chunk) - return Milter.CONTINUE - - def close(self): - return Milter.CONTINUE - - def abort(self): - ''' - Clean up if the connection is closed by client - :return: Milter.CONTINUE - ''' - return Milter.CONTINUE - - def archive_message(self): - ''' - Save a copy of the current message in its original form to a file - :return: nothing - ''' - date_time = datetime.datetime.utcnow().isoformat('_') - # assumption: by combining datetime + milter id, the filename should be unique: - # (the only case for duplicates is when restarting the milter twice in less than a second) - fname = 'mail_%s_%d.eml' % (date_time, self.id) - fname = os.path.join(ARCHIVE_DIR, fname) - log.debug('Saving a copy of the original message to file %r' % fname) - open(fname, 'wb').write(self.message.getvalue()) - - def eom(self): - ''' - This method is called when the end of the email message has been reached. 
- This event also triggers the milter specific actions - :return: Milter.ACCEPT or Milter.DISCARD if processing error - ''' - try: - # set data pointer back to 0 - self.message.seek(0) - self.archive_message() - result = self.check_mraptor() - if result is not None: - return result - else: - return Milter.ACCEPT - # if error make a fall-back to accept - except Exception: - exc_type, exc_obj, exc_tb = sys.exc_info() - fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] - log.exception("[%d] Unexpected error - fall back to ACCEPT: %s %s %s" - % (self.id, exc_type, fname, exc_tb.tb_lineno)) - return Milter.ACCEPT - - def check_mraptor(self): - ''' - Check the attachments of a message using mraptor. - If an attachment is identified as suspicious, it is replaced by a simple text file. - :return: Milter.ACCEPT or Milter.DISCARD if processing error - ''' - msg = email.message_from_string(self.message.getvalue()) - result = Milter.ACCEPT - try: - for part in msg.walk(): - # for name, value in part.items(): - # log.debug(' - %s: %r' % (name, value)) - content_type = part.get_content_type() - log.debug('[%d] Content-type: %r' % (self.id, content_type)) - # TODO: handle any content-type, but check the file magic? 
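[Editor's note, not part of the patch: for context on the attachment pre-filter in `check_mraptor`, the milter only hands a payload to olevba when it looks like OLE, OOXML (ZIP), Word 2003 XML, or MHTML. A minimal bytes-only sketch of that check follows; the standalone helper name `looks_like_office` is illustrative, and the OLE magic is hard-coded here whereas the deleted Python 2 code reads it from `olevba.olefile.MAGIC` and mixes `str`/`StringIO`.]

```python
import io
import zipfile

# Compound File Binary (OLE2) header signature:
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def looks_like_office(data: bytes) -> bool:
    """Pre-filter mirroring check_mraptor: OLE, ZIP/OOXML, Word 2003 XML, or MHT."""
    lower = data.lower()
    return (data.startswith(OLE_MAGIC)
            # OOXML formats (.docm, .xlsm, ...) are ZIP archives:
            or zipfile.is_zipfile(io.BytesIO(data))
            # Word 2003 XML declares its namespace in the body:
            or b'http://schemas.microsoft.com/office/word/2003/wordml' in data
            # MHT / Single File Web Page heuristics:
            or (b'mime' in lower and b'version' in lower
                and b'multipart' in lower))

assert looks_like_office(OLE_MAGIC + b'\x00' * 100)
assert not looks_like_office(b'plain text attachment')
```

Anything failing this test is skipped rather than scanned, which keeps the milter cheap for ordinary attachments.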
- if not content_type.startswith('multipart'): - filename = part.get_filename(None) - log.debug('[%d] Analyzing attachment %r' % (self.id, filename)) - attachment = part.get_payload(decode=True) - attachment_lowercase = attachment.lower() - # check if this is a supported file type (if not, just skip it) - # TODO: this function should be provided by olevba - if attachment.startswith(olevba.olefile.MAGIC) \ - or is_zipfile(StringIO.StringIO(attachment)) \ - or 'http://schemas.microsoft.com/office/word/2003/wordml' in attachment \ - or ('mime' in attachment_lowercase and 'version' in attachment_lowercase - and 'multipart' in attachment_lowercase): - vba_parser = olevba.VBA_Parser(filename='message', data=attachment) - vba_code_all_modules = '' - for (subfilename, stream_path, vba_filename, vba_code) in vba_parser.extract_all_macros(): - vba_code_all_modules += vba_code + '\n' - m = mraptor.MacroRaptor(vba_code_all_modules) - m.scan() - if m.suspicious: - log.warning('[%d] The attachment %r contains a suspicious macro: replace it with a text file' - % (self.id, filename)) - part.set_payload('This attachment has been removed because it contains a suspicious macro.') - part.set_type('text/plain') - # TODO: handle case when CTE is absent - part.replace_header('Content-Transfer-Encoding', '7bit') - # for name, value in part.items(): - # log.debug(' - %s: %r' % (name, value)) - # TODO: archive filtered e-mail to a file - else: - log.debug('The attachment %r is clean.' 
- % filename) - except Exception: - log.exception('[%d] Error while processing the message' % self.id) - # TODO: depending on error, decide to forward the e-mail as-is or not - result = Milter.DISCARD - # TODO: only do this if the body has actually changed - body = str(msg) - self.message = io.BytesIO(body) - self.replacebody(body) - log.info('[%d] Message relayed' % self.id) - return result - - -# === MAIN =================================================================== - -def main(): - # banner - print('mraptor_milter v%s - http://decalage.info/python/oletools' % __version__) - print('logging to file %s' % LOGFILE_PATH) - print('Press Ctrl+C to stop.') - - # make sure the log directory exists: - try: - os.makedirs(LOGFILE_DIR) - except: - pass - # Add the log message handler to the logger - # log to files rotating once a day: - handler = logging.handlers.TimedRotatingFileHandler(LOGFILE_PATH, when='D', encoding='utf8') - # create formatter and add it to the handlers - formatter = logging.Formatter('%(asctime)s - %(levelname)8s: %(message)s') - handler.setFormatter(formatter) - log.addHandler(handler) - # enable logging: - log.setLevel(logging.DEBUG) - - log.info('Starting mraptor_milter v%s - listening on %s' % (__version__, SOCKET)) - log.debug('Python version: %s' % sys.version) - - # Register to have the Milter factory create instances of the class: - Milter.factory = MacroRaptorMilter - flags = Milter.CHGBODY + Milter.CHGHDRS + Milter.ADDHDRS - flags += Milter.ADDRCPT - flags += Milter.DELRCPT - Milter.set_flags(flags) # tell Sendmail which features we use - # set the "last" fall back to ACCEPT if exception occur - Milter.set_exception_policy(Milter.ACCEPT) - # start the milter - Milter.runmilter("mraptor_milter", SOCKET, TIMEOUT) - log.info('Stopping mraptor_milter.') - - -if __name__ == "__main__": - - # Using daemonize: - # See http://daemonize.readthedocs.io/en/latest/ - from daemonize import Daemonize - daemon = Daemonize(app="mraptor_milter", 
pid=PIDFILE, action=main) - daemon.start() - - # Using python-daemon - Does not work as-is, need to create the PID file - # See https://pypi.python.org/pypi/python-daemon/ - # See PEP-3143: https://www.python.org/dev/peps/pep-3143/ - # import daemon - # import lockfile - # with daemon.DaemonContext(pidfile=lockfile.FileLock(PIDFILE)): - # main() diff -Nru remnux-oletools-0.51a/oletools/mraptor.py remnux-oletools-0.51a/oletools/mraptor.py --- remnux-oletools-0.51a/oletools/mraptor.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/mraptor.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,337 +0,0 @@ -#!/usr/bin/env python -""" -mraptor.py - MacroRaptor - -MacroRaptor is a script to parse OLE and OpenXML files such as MS Office -documents (e.g. Word, Excel), to detect malicious macros. - -Supported formats: -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) -- Word 2003 XML (.xml) -- Word/Excel Single File Web Page / MHTML (.mht) -- Publisher (.pub) - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -MacroRaptor is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -# === LICENSE ================================================================== - -# MacroRaptor is copyright (c) 2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2016-02-23 v0.01 PL: - first version -# 2016-02-29 v0.02 PL: - added Workbook_Activate, FileSaveAs -# 2016-03-04 v0.03 PL: - returns an exit code based on the overall result -# 2016-03-08 v0.04 PL: - collapse long lines before analysis -# 2016-08-31 v0.50 PL: - added macro trigger InkPicture_Painted -# 2016-09-05 PL: - added Document_BeforeClose keyword for MS Publisher (.pub) -# 2016-10-25 PL: - fixed print for Python 3 - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: - - -#--- IMPORTS ------------------------------------------------------------------ - -import sys, logging, optparse, re - -from thirdparty.xglob import xglob -from thirdparty.tablestream import tablestream - -import olevba - -# === LOGGING ================================================================= - -# a global logger object used for debugging: -log = olevba.get_logger('mraptor') - - -#--- CONSTANTS ---------------------------------------------------------------- - -# URL and message to report issues: -# TODO: make it a common variable for all oletools 
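[Editor's note, not part of the patch: the three pattern families defined in mraptor.py (auto-execution triggers, write operations, execution keywords) combine into a single verdict: a macro is flagged suspicious only when an AutoExec trigger co-occurs with a Write or Execute keyword. A condensed, self-contained sketch of that triage logic follows; the patterns here are deliberately abbreviated stand-ins, not the full regexes from the deleted file.]

```python
import re

# Abbreviated stand-ins for mraptor's re_autoexec / re_write / re_execute:
re_autoexec = re.compile(r'(?i)\b(?:AutoOpen|Document_Open|Workbook_Open)\b')
re_write = re.compile(r'(?i)\b(?:FileCopy|CreateTextFile|SaveToFile)\b')
re_execute = re.compile(r'(?i)\b(?:Shell|CreateObject|ShellExecute)\b')

def triage(vba_code):
    """Return (flags, suspicious); suspicious = AutoExec AND (Write OR Execute)."""
    a = bool(re_autoexec.search(vba_code))
    w = bool(re_write.search(vba_code))
    x = bool(re_execute.search(vba_code))
    # Same A/W/X flag string that MacroRaptor.get_flags() builds:
    flags = ('A' if a else '-') + ('W' if w else '-') + ('X' if x else '-')
    return flags, a and (w or x)

print(triage('Sub AutoOpen()\n  Shell "cmd /c calc"\nEnd Sub'))  # ('A-X', True)
print(triage('Sub Helper()\n  x = 1\nEnd Sub'))                  # ('---', False)
```

The AND-combination is what keeps the false-positive rate low: a document that merely writes files, or merely has an AutoOpen handler, is reported as "Macro OK" rather than "SUSPICIOUS".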
-URL_ISSUES = 'https://github.com/decalage2/oletools/issues' -MSG_ISSUES = 'Please report this issue on %s' % URL_ISSUES - -# 'AutoExec', 'AutoOpen', 'Auto_Open', 'AutoClose', 'Auto_Close', 'AutoNew', 'AutoExit', -# 'Document_Open', 'DocumentOpen', -# 'Document_Close', 'DocumentBeforeClose', 'Document_BeforeClose', -# 'DocumentChange','Document_New', -# 'NewDocument' -# 'Workbook_Open', 'Workbook_Close', -# *_Painted such as InkPicture1_Painted -# *_GotFocus|LostFocus|MouseHover for other ActiveX objects - -# TODO: check if line also contains Sub or Function -re_autoexec = re.compile(r'(?i)\b(?:Auto(?:Exec|_?Open|_?Close|Exit|New)' + - r'|Document(?:_?Open|_Close|_?BeforeClose|Change|_New)' + - r'|NewDocument|Workbook(?:_Open|_Activate|_Close)' + - r'|\w+_(?:Painted|GotFocus|LostFocus|MouseHover))\b') - -# MS-VBAL 5.4.5.1 Open Statement: -RE_OPEN_WRITE = r'(?:\bOpen\b[^\n]+\b(?:Write|Append|Binary|Output|Random)\b)' - -re_write = re.compile(r'(?i)\b(?:FileCopy|CopyFile|Kill|CreateTextFile|' - + r'VirtualAlloc|RtlMoveMemory|URLDownloadToFileA?|AltStartupPath|' - + r'ADODB\.Stream|WriteText|SaveToFile|SaveAs|SaveAsRTF|FileSaveAs|MkDir|RmDir|SaveSetting|SetAttr)\b|' + RE_OPEN_WRITE) - -# MS-VBAL 5.2.3.5 External Procedure Declaration -RE_DECLARE_LIB = r'(?:\bDeclare\b[^\n]+\bLib\b)' - -re_execute = re.compile(r'(?i)\b(?:Shell|CreateObject|GetObject|SendKeys|' - + r'MacScript|FollowHyperlink|CreateThread|ShellExecute)\b|' + RE_DECLARE_LIB) - -# short tag to display file types in triage mode: -TYPE2TAG = { - olevba.TYPE_OLE: 'OLE', - olevba.TYPE_OpenXML: 'OpX', - olevba.TYPE_Word2003_XML: 'XML', - olevba.TYPE_MHTML: 'MHT', - olevba.TYPE_TEXT: 'TXT', -} - - -# === CLASSES ================================================================= - -class Result_NoMacro(object): - exit_code = 0 - color = 'green' - name = 'No Macro' - - -class Result_NotMSOffice(object): - exit_code = 1 - color = 'green' - name = 'Not MS Office' - - -class Result_MacroOK(object): - exit_code = 2 - 
color = 'cyan' - name = 'Macro OK' - - -class Result_Error(object): - exit_code = 10 - color = 'yellow' - name = 'ERROR' - - -class Result_Suspicious(object): - exit_code = 20 - color = 'red' - name = 'SUSPICIOUS' - - -class MacroRaptor(object): - """ - class to scan VBA macro code to detect if it is malicious - """ - def __init__(self, vba_code): - """ - MacroRaptor constructor - :param vba_code: string containing the VBA macro code - """ - # collapse long lines first - self.vba_code = olevba.vba_collapse_long_lines(vba_code) - self.autoexec = False - self.write = False - self.execute = False - self.flags = '' - self.suspicious = False - self.autoexec_match = None - self.write_match = None - self.execute_match = None - self.matches = [] - - def scan(self): - """ - Scan the VBA macro code to detect if it is malicious - :return: - """ - m = re_autoexec.search(self.vba_code) - if m is not None: - self.autoexec = True - self.autoexec_match = m.group() - self.matches.append(m.group()) - m = re_write.search(self.vba_code) - if m is not None: - self.write = True - self.write_match = m.group() - self.matches.append(m.group()) - m = re_execute.search(self.vba_code) - if m is not None: - self.execute = True - self.execute_match = m.group() - self.matches.append(m.group()) - if self.autoexec and (self.execute or self.write): - self.suspicious = True - - def get_flags(self): - flags = '' - flags += 'A' if self.autoexec else '-' - flags += 'W' if self.write else '-' - flags += 'X' if self.execute else '-' - return flags - - -# === MAIN ==================================================================== - -def main(): - """ - Main function, called when olevba is run from the command line - """ - global log - DEFAULT_LOG_LEVEL = "warning" # Default log level - LOG_LEVELS = { - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL - } - - usage = 'usage: %prog [options] [filename2 ...]' - parser = 
optparse.OptionParser(usage=usage) - parser.add_option("-r", action="store_true", dest="recursive", - help='find files recursively in subdirectories.') - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, - help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%default)") - parser.add_option("-m", '--matches', action="store_true", dest="show_matches", - help='Show matched strings.') - - # TODO: add logfile option - - (options, args) = parser.parse_args() - - # Print help if no arguments are passed - if len(args) == 0: - print(__doc__) - parser.print_help() - print('\nAn exit code is returned based on the analysis result:') - for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious): - print(' - %d: %s' % (result.exit_code, result.name)) - sys.exit() - - # print banner with version - print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) - print('This is work in progress, please report issues at %s' % URL_ISSUES) - - logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') - # enable logging in the modules: - log.setLevel(logging.NOTSET) - - t = tablestream.TableStream(style=tablestream.TableStyleSlim, - header_row=['Result', 'Flags', 'Type', 'File'], - column_width=[10, 5, 4, 56]) - - exitcode = -1 - global_result = None - # TODO: handle errors in xglob, to continue processing the next files - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, - zip_password=options.zip_password, 
zip_fname=options.zip_fname): - # ignore directory names stored in zip files: - if container and filename.endswith('/'): - continue - full_name = '%s in %s' % (filename, container) if container else filename - # try: - # # Open the file - # if data is None: - # data = open(filename, 'rb').read() - # except: - # log.exception('Error when opening file %r' % full_name) - # continue - if isinstance(data, Exception): - result = Result_Error - t.write_row([result.name, '', '', full_name], - colors=[result.color, None, None, None]) - t.write_row(['', '', '', str(data)], - colors=[None, None, None, result.color]) - else: - filetype = '???' - try: - vba_parser = olevba.VBA_Parser(filename=filename, data=data, container=container) - filetype = TYPE2TAG[vba_parser.type] - except Exception as e: - # log.error('Error when parsing VBA macros from file %r' % full_name) - # TODO: distinguish actual errors from non-MSOffice files - result = Result_Error - t.write_row([result.name, '', filetype, full_name], - colors=[result.color, None, None, None]) - t.write_row(['', '', '', str(e)], - colors=[None, None, None, result.color]) - continue - if vba_parser.detect_vba_macros(): - vba_code_all_modules = '' - try: - for (subfilename, stream_path, vba_filename, vba_code) in vba_parser.extract_all_macros(): - vba_code_all_modules += vba_code + '\n' - except Exception as e: - # log.error('Error when parsing VBA macros from file %r' % full_name) - result = Result_Error - t.write_row([result.name, '', TYPE2TAG[vba_parser.type], full_name], - colors=[result.color, None, None, None]) - t.write_row(['', '', '', str(e)], - colors=[None, None, None, result.color]) - continue - mraptor = MacroRaptor(vba_code_all_modules) - mraptor.scan() - if mraptor.suspicious: - result = Result_Suspicious - else: - result = Result_MacroOK - t.write_row([result.name, mraptor.get_flags(), filetype, full_name], - colors=[result.color, None, None, None]) - if mraptor.matches and options.show_matches: - 
t.write_row(['', '', '', 'Matches: %r' % mraptor.matches]) - else: - result = Result_NoMacro - t.write_row([result.name, '', filetype, full_name], - colors=[result.color, None, None, None]) - if result.exit_code > exitcode: - global_result = result - exitcode = result.exit_code - - print('') - print('Flags: A=AutoExec, W=Write, X=Execute') - print('Exit code: %d - %s' % (exitcode, global_result.name)) - sys.exit(exitcode) - -if __name__ == '__main__': - main() - -# Soundtrack: "Dark Child" by Marlon Williams diff -Nru remnux-oletools-0.51a/oletools/olebrowse.py remnux-oletools-0.51a/oletools/olebrowse.py --- remnux-oletools-0.51a/oletools/olebrowse.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/olebrowse.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,142 +0,0 @@ -#!/usr/bin/env python -""" -olebrowse.py - -A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to -view and extract individual data streams. - -Usage: olebrowse.py [file] - -olebrowse project website: http://www.decalage.info/python/olebrowse - -olebrowse is part of the python-oletools package: -http://www.decalage.info/python/oletools - -olebrowse is copyright (c) 2012-2015, Philippe Lagadec (http://www.decalage.info) -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -__version__ = '0.02' - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2012-09-17 v0.01 PL: - first version -# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL - -#------------------------------------------------------------------------------ -# TODO: -# - menu option to open another file -# - menu option to display properties -# - menu option to run other oletools, external tools such as OfficeCat? -# - for a stream, display info: size, path, etc -# - stream info: magic, entropy, ... ? 
- -import optparse, sys, os -from thirdparty.easygui import easygui -import thirdparty.olefile as olefile -import ezhexviewer - -ABOUT = '~ About olebrowse' -QUIT = '~ Quit' - - -def about (): - """ - Display information about this tool - """ - easygui.textbox(title='About olebrowse', text=__doc__) - - -def browse_stream (ole, stream): - """ - Browse a stream (hex view or save to file) - """ - #print 'stream:', stream - while True: - msg ='Select an action for the stream "%s", or press Esc to exit' % repr(stream) - actions = [ - 'Hex view', -## 'Text view', -## 'Repr view', - 'Save stream to file', - '~ Back to main menu', - ] - action = easygui.choicebox(msg, title='olebrowse', choices=actions) - if action is None or 'Back' in action: - break - elif action.startswith('Hex'): - data = ole.openstream(stream).getvalue() - ezhexviewer.hexview_data(data, msg='Stream: %s' % stream, title='olebrowse') -## elif action.startswith('Text'): -## data = ole.openstream(stream).getvalue() -## easygui.codebox(title='Text view - %s' % stream, text=data) -## elif action.startswith('Repr'): -## data = ole.openstream(stream).getvalue() -## easygui.codebox(title='Repr view - %s' % stream, text=repr(data)) - elif action.startswith('Save'): - data = ole.openstream(stream).getvalue() - fname = easygui.filesavebox(default='stream.bin') - if fname is not None: - f = open(fname, 'wb') - f.write(data) - f.close() - easygui.msgbox('stream saved to file %s' % fname) - - - -def main(): - """ - Main function - """ - try: - filename = sys.argv[1] - except: - filename = easygui.fileopenbox() - try: - ole = olefile.OleFileIO(filename) - listdir = ole.listdir() - streams = [] - for direntry in listdir: - #print direntry - streams.append('/'.join(direntry)) - streams.append(ABOUT) - streams.append(QUIT) - stream = True - while stream is not None: - msg ="Select a stream, or press Esc to exit" - title = "olebrowse" - stream = easygui.choicebox(msg, title, streams) - if stream is None or stream == 
QUIT: - break - if stream == ABOUT: - about() - else: - browse_stream(ole, stream) - except: - easygui.exceptionbox() - - - - -if __name__ == '__main__': - main() diff -Nru remnux-oletools-0.51a/oletools/oledir.py remnux-oletools-0.51a/oletools/oledir.py --- remnux-oletools-0.51a/oletools/oledir.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/oledir.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,187 +0,0 @@ -#!/usr/bin/env python -""" -oledir.py - -oledir parses OLE files to display technical information about its directory -entries, including deleted/orphan streams/storages and unused entries. - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -oledir is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -#=== LICENSE ================================================================== - -# oledir is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2015-04-17 v0.01 PL: - first version -# 2015-04-21 v0.02 PL: - improved display with prettytable -# 2016-01-13 v0.03 PL: - replaced prettytable by tablestream, added colors -# 2016-07-20 v0.50 SL: - added Python 3 support -# 2016-08-09 PL: - fixed issue #77 (imports from thirdparty dir) - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: -# TODO: show FAT/MiniFAT -# TODO: show errors when reading streams - -# === IMPORTS ================================================================ - -import sys, os - -# add the thirdparty subfolder to sys.path (absolute+normalized path): -_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) -# print('_thismodule_dir = %r' % _thismodule_dir) -# assumption: the thirdparty dir is a subfolder: -_thirdparty_dir = os.path.normpath(os.path.join(_thismodule_dir, 'thirdparty')) -# print('_thirdparty_dir = %r' % _thirdparty_dir) -if not _thirdparty_dir in sys.path: - sys.path.insert(0, _thirdparty_dir) - -import colorclass - -# On Windows, colorclass needs to be enabled: -if os.name == 'nt': - colorclass.Windows.enable(auto_colors=True) - -import olefile -from tablestream import tablestream - - -# === CONSTANTS ============================================================== - -STORAGE_NAMES = { - olefile.STGTY_EMPTY: 'Empty', - 
olefile.STGTY_STORAGE: 'Storage', - olefile.STGTY_STREAM: 'Stream', - olefile.STGTY_LOCKBYTES: 'ILockBytes', - olefile.STGTY_PROPERTY: 'IPropertyStorage', - olefile.STGTY_ROOT: 'Root', -} - -STORAGE_COLORS = { - olefile.STGTY_EMPTY: 'green', - olefile.STGTY_STORAGE: 'blue', - olefile.STGTY_STREAM: 'yellow', - olefile.STGTY_LOCKBYTES: 'magenta', - olefile.STGTY_PROPERTY: 'magenta', - olefile.STGTY_ROOT: 'cyan', -} - -STATUS_COLORS = { - 'unused': 'green', - '': 'yellow', - 'ORPHAN': 'red', -} - - -# === FUNCTIONS ============================================================== - -def sid_display(sid): - if sid == olefile.NOSTREAM: - return '-' # None - else: - return sid - - -# === MAIN =================================================================== - -def main(): - # print banner with version - print('oledir %s - http://decalage.info/python/oletools' % __version__) - - if os.name == 'nt': - colorclass.Windows.enable(auto_colors=True, reset_atexit=True) - - fname = sys.argv[1] - print('OLE directory entries in file %s:' % fname) - ole = olefile.OleFileIO(fname) - # ole.dumpdirectory() - - # t = prettytable.PrettyTable(('id', 'Status', 'Type', 'Name', 'Left', 'Right', 'Child', '1st Sect', 'Size')) - # t.align = 'l' - # t.max_width['id'] = 4 - # t.max_width['Status'] = 6 - # t.max_width['Type'] = 10 - # t.max_width['Name'] = 10 - # t.max_width['Left'] = 5 - # t.max_width['Right'] = 5 - # t.max_width['Child'] = 5 - # t.max_width['1st Sect'] = 8 - # t.max_width['Size'] = 6 - - table = tablestream.TableStream(column_width=[4, 6, 7, 22, 5, 5, 5, 8, 6], - header_row=('id', 'Status', 'Type', 'Name', 'Left', 'Right', 'Child', '1st Sect', 'Size'), - style=tablestream.TableStyleSlim) - - # TODO: read ALL the actual directory entries from the directory stream, because olefile does not! - # TODO: OR fix olefile! 
- # TODO: olefile should store or give access to the raw direntry data on demand - # TODO: oledir option to hexdump the raw direntries - # TODO: olefile should be less picky about incorrect directory structures - - for id in range(len(ole.direntries)): - d = ole.direntries[id] - if d is None: - # this direntry is not part of the tree: either unused or an orphan - d = ole._load_direntry(id) #ole.direntries[id] - # print('%03d: %s *** ORPHAN ***' % (id, d.name)) - if d.entry_type == olefile.STGTY_EMPTY: - status = 'unused' - else: - status = 'ORPHAN' - else: - # print('%03d: %s' % (id, d.name)) - status = '' - if d.name.startswith('\x00'): - # this may happen with unused entries, the name may be filled with zeroes - name = '' - else: - # handle non-printable chars using repr(), remove quotes: - name = repr(d.name)[1:-1] - left = sid_display(d.sid_left) - right = sid_display(d.sid_right) - child = sid_display(d.sid_child) - entry_type = STORAGE_NAMES.get(d.entry_type, 'Unknown') - etype_color = STORAGE_COLORS.get(d.entry_type, 'red') - status_color = STATUS_COLORS.get(status, 'red') - - # print(' type=%7s sid_left=%s sid_right=%s sid_child=%s' - # %(entry_type, left, right, child)) - # t.add_row((id, status, entry_type, name, left, right, child, hex(d.isectStart), d.size)) - table.write_row((id, status, entry_type, name, left, right, child, '%X' % d.isectStart, d.size), - colors=(None, status_color, etype_color, None, None, None, None, None, None)) - ole.close() - # print t - - -if __name__ == '__main__': - main() \ No newline at end of file diff -Nru remnux-oletools-0.51a/oletools/oleid.py remnux-oletools-0.51a/oletools/oleid.py --- remnux-oletools-0.51a/oletools/oleid.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/oleid.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,298 +0,0 @@ -#!/usr/bin/env python -""" -oleid.py - -oleid is a script to analyze OLE files such as MS Office documents (e.g. 
Word, -Excel), to detect specific characteristics that could potentially indicate that -the file is suspicious or malicious, in terms of security (e.g. malware). -For example it can detect VBA macros, embedded Flash objects, fragmentation. -The results can be displayed or returned as XML for further processing. - -Usage: oleid.py - -oleid project website: http://www.decalage.info/python/oleid - -oleid is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -#=== LICENSE ================================================================= - -# oleid is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from __future__ import print_function - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2012-10-29 v0.01 PL: - first version -# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL -# - improved usage display with -h -# 2014-11-30 v0.03 PL: - improved output with prettytable -# 2016-10-25 v0.50 PL: - fixed print and bytes strings for Python 3 - -__version__ = '0.50' - - -#------------------------------------------------------------------------------ -# TODO: -# + extract relevant metadata: codepage, author, application, timestamps, etc -# - detect RTF and OpenXML -# - fragmentation -# - OLE package -# - entropy -# - detect PE header? -# - detect NOPs? -# - list type of each object in object pool? -# - criticality for each indicator?: info, low, medium, high -# - support wildcards with glob? -# - verbose option -# - csv, xml output - - -#=== IMPORTS ================================================================= - -import optparse, sys, os, re, zlib, struct -import thirdparty.olefile as olefile -from thirdparty.prettytable import prettytable - - -#=== FUNCTIONS =============================================================== - -def detect_flash (data): - """ - Detect Flash objects (SWF files) within a binary string of data - return a list of (start_index, length, compressed) tuples, or [] if nothing - found. 
-
-    Code inspired from xxxswf.py by Alexander Hanel (but significantly reworked)
-    http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html
-    """
-    #TODO: report
-    found = []
-    for match in re.finditer(b'CWS|FWS', data):
-        start = match.start()
-        if start+8 > len(data):
-            # header size larger than remaining data, this is not a SWF
-            continue
-        #TODO: one struct.unpack should be simpler
-        # Read Header
-        header = data[start:start+3]
-        # Read Version
-        ver = struct.unpack('<b', data[start+3:start+4])[0]
-        if ver > 20:
-            continue
-        # Read SWF Size
-        size = struct.unpack('<i', data[start+4:start+8])[0]
-        if start+size > len(data) or size < 1024:
-            # declared size larger than remaining data, this is not a SWF
-            # or declared size too small for a usual SWF
-            continue
-        # Read SWF into buffer. If compressed read uncompressed size.
-        swf = data[start:start+size]
-        compressed = False
-        if b'CWS' in header:
-            compressed = True
-            # compressed SWF: data after header (8 bytes) until the end is
-            # compressed with zlib. Attempt to decompress it to check if it is
-            # valid
-            compressed_data = swf[8:]
-            try:
-                zlib.decompress(compressed_data)
-            except:
-                continue
-        # else we don't check anything at this stage, we only assume it is a
-        # valid SWF. So there might be false positives for uncompressed SWF.
- found.append((start, size, compressed)) - #print 'Found SWF start=%x, length=%d' % (start, size) - return found - - -#=== CLASSES ================================================================= - -class Indicator (object): - - def __init__(self, _id, value=None, _type=bool, name=None, description=None): - self.id = _id - self.value = value - self.type = _type - self.name = name - if name == None: - self.name = _id - self.description = description - - -class OleID: - - def __init__(self, filename): - self.filename = filename - self.indicators = [] - - def check(self): - # check if it is actually an OLE file: - oleformat = Indicator('ole_format', True, name='OLE format') - self.indicators.append(oleformat) - if not olefile.isOleFile(self.filename): - oleformat.value = False - return self.indicators - # parse file: - self.ole = olefile.OleFileIO(self.filename) - # checks: - self.check_properties() - self.check_encrypted() - self.check_word() - self.check_excel() - self.check_powerpoint() - self.check_visio() - self.check_ObjectPool() - self.check_flash() - self.ole.close() - return self.indicators - - def check_properties (self): - suminfo = Indicator('has_suminfo', False, name='Has SummaryInformation stream') - self.indicators.append(suminfo) - appname = Indicator('appname', 'unknown', _type=str, name='Application name') - self.indicators.append(appname) - self.suminfo = {} - # check stream SummaryInformation - if self.ole.exists("\x05SummaryInformation"): - suminfo.value = True - self.suminfo = self.ole.getproperties("\x05SummaryInformation") - # check application name: - appname.value = self.suminfo.get(0x12, 'unknown') - - def check_encrypted (self): - # we keep the pointer to the indicator, can be modified by other checks: - self.encrypted = Indicator('encrypted', False, name='Encrypted') - self.indicators.append(self.encrypted) - # check if bit 1 of security field = 1: - # (this field may be missing for Powerpoint2000, for example) - if 0x13 in self.suminfo: 
- if self.suminfo[0x13] & 1: - self.encrypted.value = True - - def check_word (self): - word = Indicator('word', False, name='Word Document', - description='Contains a WordDocument stream, very likely to be a Microsoft Word Document.') - self.indicators.append(word) - self.macros = Indicator('vba_macros', False, name='VBA Macros') - self.indicators.append(self.macros) - if self.ole.exists('WordDocument'): - word.value = True - # check for Word-specific encryption flag: - s = self.ole.openstream(["WordDocument"]) - # pass header 10 bytes - s.read(10) - # read flag structure: - temp16 = struct.unpack("H", s.read(2))[0] - fEncrypted = (temp16 & 0x0100) >> 8 - if fEncrypted: - self.encrypted.value = True - s.close() - # check for VBA macros: - if self.ole.exists('Macros'): - self.macros.value = True - - def check_excel (self): - excel = Indicator('excel', False, name='Excel Workbook', - description='Contains a Workbook or Book stream, very likely to be a Microsoft Excel Workbook.') - self.indicators.append(excel) - #self.macros = Indicator('vba_macros', False, name='VBA Macros') - #self.indicators.append(self.macros) - if self.ole.exists('Workbook') or self.ole.exists('Book'): - excel.value = True - # check for VBA macros: - if self.ole.exists('_VBA_PROJECT_CUR'): - self.macros.value = True - - def check_powerpoint (self): - ppt = Indicator('ppt', False, name='PowerPoint Presentation', - description='Contains a PowerPoint Document stream, very likely to be a Microsoft PowerPoint Presentation.') - self.indicators.append(ppt) - if self.ole.exists('PowerPoint Document'): - ppt.value = True - - def check_visio (self): - visio = Indicator('visio', False, name='Visio Drawing', - description='Contains a VisioDocument stream, very likely to be a Microsoft Visio Drawing.') - self.indicators.append(visio) - if self.ole.exists('VisioDocument'): - visio.value = True - - def check_ObjectPool (self): - objpool = Indicator('ObjectPool', False, name='ObjectPool', - 
description='Contains an ObjectPool stream, very likely to contain embedded OLE objects or files.') - self.indicators.append(objpool) - if self.ole.exists('ObjectPool'): - objpool.value = True - - - def check_flash (self): - flash = Indicator('flash', 0, _type=int, name='Flash objects', - description='Number of embedded Flash objects (SWF files) detected in OLE streams. Not 100% accurate, there may be false positives.') - self.indicators.append(flash) - for stream in self.ole.listdir(): - data = self.ole.openstream(stream).read() - found = detect_flash(data) - # just add to the count of Flash objects: - flash.value += len(found) - #print stream, found - - -#=== MAIN ================================================================= - -def main(): - usage = 'usage: %prog [options] ' - parser = optparse.OptionParser(usage=__doc__ + '\n' + usage) -## parser.add_option('-o', '--ole', action='store_true', dest='ole', help='Parse an OLE file (e.g. Word, Excel) to look for SWF in each stream') - - (options, args) = parser.parse_args() - - # Print help if no argurments are passed - if len(args) == 0: - parser.print_help() - return - - for filename in args: - print('\nFilename:', filename) - oleid = OleID(filename) - indicators = oleid.check() - - #TODO: add description - #TODO: highlight suspicious indicators - t = prettytable.PrettyTable(['Indicator', 'Value']) - t.align = 'l' - t.max_width = 39 - #t.border = False - - for indicator in indicators: - #print '%s: %s' % (indicator.name, indicator.value) - t.add_row((indicator.name, indicator.value)) - - print(t) - -if __name__ == '__main__': - main() diff -Nru remnux-oletools-0.51a/oletools/olemap.py remnux-oletools-0.51a/oletools/olemap.py --- remnux-oletools-0.51a/oletools/olemap.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/olemap.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,124 +0,0 @@ -#!/usr/bin/env python -""" -olemap - -olemap parses OLE files to display technical information about its 
structure. - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -olemap is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -#=== LICENSE ================================================================== - -# olemap is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
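olemap's FAT display computes each sector's file offset as `sectorsize * (i+1)`: the OLE header occupies exactly one sector, so sector number 0 begins one sector into the file. A minimal standalone sketch of that calculation (the helper name `sector_offset` is invented here for illustration; it is not part of olemap):

```python
def sector_offset(sid, sectorsize=512):
    """Return the file offset of sector number `sid` in an OLE file.

    The header occupies exactly one sector, so sector 0 starts at
    offset `sectorsize`, sector 1 at `2 * sectorsize`, and so on.
    """
    return sectorsize * (sid + 1)

# with the default 512-byte sectors:
print('%08X' % sector_offset(0))  # 00000200
print('%08X' % sector_offset(3))  # 00000800
```

The same formula applies for 4096-byte sectors, which olefile also supports; only `sectorsize` changes.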
- - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2015-11-01 v0.01 PL: - first version -# 2016-01-13 v0.02 PL: - improved display with tablestream, added colors -# 2016-07-20 v0.50 SL: - added Python 3 support -# 2016-09-05 PL: - added main entry point for setup.py - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: - -# === IMPORTS ================================================================ - -import sys -from thirdparty.olefile import olefile -from thirdparty.tablestream import tablestream - - - -def sid_display(sid): - if sid == olefile.NOSTREAM: - return None - else: - return sid - -STORAGE_NAMES = { - olefile.STGTY_EMPTY: 'Empty', - olefile.STGTY_STORAGE: 'Storage', - olefile.STGTY_STREAM: 'Stream', - olefile.STGTY_LOCKBYTES: 'ILockBytes', - olefile.STGTY_PROPERTY: 'IPropertyStorage', - olefile.STGTY_ROOT: 'Root', -} - -FAT_TYPES = { - olefile.FREESECT: "Free", - olefile.ENDOFCHAIN: "End of Chain", - olefile.FATSECT: "FAT Sector", - olefile.DIFSECT: "DIFAT Sector" - } - -FAT_COLORS = { - olefile.FREESECT: "green", - olefile.ENDOFCHAIN: "yellow", - olefile.FATSECT: "cyan", - olefile.DIFSECT: "blue", - 'default': None, - } - - -# === MAIN =================================================================== - -def main(): - # print banner with version - print('olemap %s - http://decalage.info/python/oletools' % __version__) - - fname = sys.argv[1] - ole = olefile.OleFileIO(fname) - - print('FAT:') - t = tablestream.TableStream([8, 12, 8, 8], header_row=['Sector #', 'Type', 'Offset', 'Next #']) - for i in range(ole.nb_sect): - fat_value = ole.fat[i] - fat_type = FAT_TYPES.get(fat_value, '') - color_type = FAT_COLORS.get(fat_value, FAT_COLORS['default']) - # compute offset based on sector size: - offset = ole.sectorsize * (i+1) - # print '%8X: %-12s offset=%08X next=%8X' % (i, fat_type, 0, fat_value) - t.write_row(['%8X' % i, fat_type, '%08X' % 
offset, '%8X' % fat_value], - colors=[None, color_type, None, None]) - print('') - - print('MiniFAT:') - # load MiniFAT if it wasn't already done: - ole.loadminifat() - for i in range(len(ole.minifat)): - fat_value = ole.minifat[i] - fat_type = FAT_TYPES.get(fat_value, 'Data') - print('%8X: %-12s offset=%08X next=%8X' % (i, fat_type, 0, fat_value)) - - ole.close() - -if __name__ == '__main__': - main() diff -Nru remnux-oletools-0.51a/oletools/olemeta.py remnux-oletools-0.51a/oletools/olemeta.py --- remnux-oletools-0.51a/oletools/olemeta.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/olemeta.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,120 +0,0 @@ -#!/usr/bin/env python -""" -olemeta.py - -olemeta is a script to parse OLE files such as MS Office documents (e.g. Word, -Excel), to extract all standard properties present in the OLE file. - -Usage: olemeta.py - -olemeta project website: http://www.decalage.info/python/olemeta - -olemeta is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -#=== LICENSE ================================================================= - -# olemeta is copyright (c) 2013-2016, Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2013-07-24 v0.01 PL: - first version -# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL -# - improved usage display -# 2015-12-29 v0.03 PL: - only display properties present in the file -# 2016-09-06 v0.50 PL: - added main entry point for setup.py -# 2016-10-25 PL: - fixed print for Python 3 -# 2016-10-28 PL: - removed the UTF8 codec for console display - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: -# + optparse -# + nicer output: table with fixed columns, datetime, etc -# + CSV output -# + option to only show available properties (by default) - -#=== IMPORTS ================================================================= - -import sys, codecs -import thirdparty.olefile as olefile -from thirdparty.tablestream import tablestream - - -#=== MAIN ================================================================= - -def main(): - try: - ole = olefile.OleFileIO(sys.argv[1]) - except IndexError: - sys.exit(__doc__) - - # parse and display metadata: - meta = ole.get_metadata() - - # console output with UTF8 encoding: - # 
It looks like we do not need the UTF8 codec anymore, both for Python 2 and 3 - console_utf8 = sys.stdout #codecs.getwriter('utf8')(sys.stdout) - - # TODO: move similar code to a function - - print('Properties from the SummaryInformation stream:') - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) - for prop in meta.SUMMARY_ATTRIBS: - value = getattr(meta, prop) - if value is not None: - # TODO: pretty printing for strings, dates, numbers - # TODO: better unicode handling - # print('- %s: %s' % (prop, value)) - # if isinstance(value, unicode): - # # encode to UTF8, avoiding errors - # value = value.encode('utf-8', errors='replace') - # else: - # value = str(value) - t.write_row([prop, value], colors=[None, 'yellow']) - t.close() - print('') - - print('Properties from the DocumentSummaryInformation stream:') - t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) - for prop in meta.DOCSUM_ATTRIBS: - value = getattr(meta, prop) - if value is not None: - # TODO: pretty printing for strings, dates, numbers - # TODO: better unicode handling - # print('- %s: %s' % (prop, value)) - # if isinstance(value, unicode): - # # encode to UTF8, avoiding errors - # value = value.encode('utf-8', errors='replace') - # else: - # value = str(value) - t.write_row([prop, value], colors=[None, 'yellow']) - t.close() - - ole.close() - -if __name__ == '__main__': - main() \ No newline at end of file diff -Nru remnux-oletools-0.51a/oletools/oleobj.py remnux-oletools-0.51a/oletools/oleobj.py --- remnux-oletools-0.51a/oletools/oleobj.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/oleobj.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,451 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -""" -oleobj.py - -oleobj is a Python script and module to parse OLE objects and files stored -into various file formats such as RTF or MS Office documents (e.g. Word, Excel). 
- -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -oleobj is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -# === LICENSE ================================================================== - -# oleobj is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2015-12-05 v0.01 PL: - first version -# 2016-06 PL: - added main and process_file (not working yet) -# 2016-07-18 v0.48 SL: - added Python 3.5 support -# 2016-07-19 PL: - fixed Python 2.6-7 support - -__version__ = '0.48' - -#------------------------------------------------------------------------------ -# TODO: -# + setup logging (common with other oletools) - - -#------------------------------------------------------------------------------ -# REFERENCES: - -# Reference for the storage of embedded OLE objects/files: -# [MS-OLEDS]: Object Linking and Embedding (OLE) Data Structures -# https://msdn.microsoft.com/en-us/library/dd942265.aspx - -# - officeparser: https://github.com/unixfreak0037/officeparser -# TODO: oledump - - -#--- IMPORTS ------------------------------------------------------------------ - -import logging, struct, optparse, os, re, sys - -from thirdparty.olefile import olefile -from thirdparty.xglob import xglob - -# === LOGGING ================================================================= - -class NullHandler(logging.Handler): - """ - Log Handler without output, to avoid printing messages if logging is not - configured by the main application. - Python 2.7 has logging.NullHandler, but this is necessary for 2.6: - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library - """ - def emit(self, record): - pass - -def get_logger(name, level=logging.CRITICAL+1): - """ - Create a suitable logger object for this module. - The goal is not to change settings of the root logger, to avoid getting - other modules' logs on the screen. - If a logger exists with same name, reuse it. (Else it would have duplicate - handlers and messages would be doubled.) - The level is set to CRITICAL+1 by default, to avoid any logging. 
-
-    """
-    # First, test if there is already a logger with the same name, else it
-    # will generate duplicate messages (due to duplicate handlers):
-    if name in logging.Logger.manager.loggerDict:
-        #NOTE: another less intrusive but more "hackish" solution would be to
-        # use getLogger then test if its effective level is not default.
-        logger = logging.getLogger(name)
-        # make sure level is OK:
-        logger.setLevel(level)
-        return logger
-    # get a new logger:
-    logger = logging.getLogger(name)
-    # only add a NullHandler for this logger, it is up to the application
-    # to configure its own logging:
-    logger.addHandler(NullHandler())
-    logger.setLevel(level)
-    return logger
-
-# a global logger object used for debugging:
-log = get_logger('oleobj')
-
-
-# === CONSTANTS ==============================================================
-
-# some str methods on Python 2.x return characters,
-# while the equivalent bytes methods return integers on Python 3.x:
-if sys.version_info[0] <= 2:
-    # Python 2.x
-    NULL_CHAR = '\x00'
-else:
-    # Python 3.x
-    NULL_CHAR = 0
-
-
-# === GLOBAL VARIABLES =======================================================
-
-# struct to parse an unsigned integer of 32 bits:
-struct_uint32 = struct.Struct('<L')
-# struct to parse an unsigned integer of 16 bits:
-struct_uint16 = struct.Struct('<H')
-
-
-# === FUNCTIONS ==============================================================
-
-def read_uint32(data):
-    # read a little-endian unsigned 32-bit integer, return (value, new_data):
-    value = struct_uint32.unpack(data[0:4])[0]
-    new_data = data[4:]
-    return (value, new_data)
-
-
-def read_uint16(data):
-    # read a little-endian unsigned 16-bit integer, return (value, new_data):
-    value = struct_uint16.unpack(data[0:2])[0]
-    new_data = data[2:]
-    return (value, new_data)
-
-
-def read_LengthPrefixedAnsiString(data):
-    # read a length-prefixed ANSI string, return (string, new_data):
-    length, data = read_uint32(data)
-    ansi_string = data[:length]
-    new_data = data[length:]
-    return (ansi_string, new_data)
-
-
-# === CLASSES ================================================================
-
-class OleNativeStream (object):
-    """
-    OLE object contained into an OLENativeStream structure.
-    (see MS-OLEDS 2.3.6 OLENativeStream)
-    """
-
-    def __init__(self, bindata=None, package=False):
-        self.filename = None
-        self.src_path = None
-        self.unknown_short = None
-        self.unknown_long_1 = None
-        self.unknown_long_2 = None
-        self.temp_path = None
-        self.actual_size = None
-        self.data = None
-        self.package = package
-        if bindata is not None:
-            self.parse(data=bindata)
-
-    def parse(self, data):
-        # An OLE Package object does not have the native data size field:
-        if not self.package:
-            self.native_data_size, data = read_uint32(data)
-        self.unknown_short, data = read_uint16(data)
-        self.filename, data = data.split(b'\x00', 1)
-        # source path
-        self.src_path, data = data.split(b'\x00', 1)
-        # TODO: I bet these 8 bytes are a timestamp => FILETIME from olefile
-        self.unknown_long_1, data = read_uint32(data)
-        self.unknown_long_2, data = read_uint32(data)
-        # temp path?
-        self.temp_path, data = data.split(b'\x00', 1)
-        # size of the rest of the data
-        self.actual_size, data = read_uint32(data)
-        self.data = data[0:self.actual_size]
-        # TODO: exception when size > remaining data
-        # TODO: SLACK DATA
-
-
-class OleObject (object):
-    """
-    OLE 1.0 Object
-
-    see MS-OLEDS 2.2 OLE1.0 Format Structures
-    """
-
-    # constants for the format_id attribute:
-    # see MS-OLEDS 2.2.4 ObjectHeader
-    TYPE_LINKED = 0x01
-    TYPE_EMBEDDED = 0x02
-
-
-    def __init__(self, bindata=None):
-        """
-        Constructor for OleObject.
-        If bindata is provided, it will be parsed using the parse() method.
- - :param bindata: bytes, OLE 1.0 Object structure containing an OLE object - """ - self.ole_version = None - self.format_id = None - self.class_name = None - self.topic_name = None - self.item_name = None - self.data = None - self.data_size = None - - def parse(self, data): - """ - Parse binary data containing an OLE 1.0 Object structure, - to extract the OLE object it contains. - (see MS-OLEDS 2.2 OLE1.0 Format Structures) - - :param data: bytes, OLE 1.0 Object structure containing an OLE object - :return: - """ - # Header: see MS-OLEDS 2.2.4 ObjectHeader - self.ole_version, data = read_uint32(data) - self.format_id, data = read_uint32(data) - log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id)) - assert self.format_id in (self.TYPE_EMBEDDED, self.TYPE_LINKED) - self.class_name, data = read_LengthPrefixedAnsiString(data) - self.topic_name, data = read_LengthPrefixedAnsiString(data) - self.item_name, data = read_LengthPrefixedAnsiString(data) - log.debug('Class name=%r - Topic name=%r - Item name=%r' - % (self.class_name, self.topic_name, self.item_name)) - if self.format_id == self.TYPE_EMBEDDED: - # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject - #assert self.topic_name != '' and self.item_name != '' - self.data_size, data = read_uint32(data) - log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data))) - # TODO: handle incorrect size to avoid exception - self.data = data[:self.data_size] - assert len(self.data) == self.data_size - self.extra_data = data[self.data_size:] - - - -def sanitize_filename(filename, replacement='_', max_length=200): - """compute basename of filename. Replaces all non-whitelisted characters. - The returned filename is always a basename of the file.""" - basepath = os.path.basename(filename).strip() - sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) - - while ".." 
in sane_fname: - sane_fname = sane_fname.replace('..', '.') - - while " " in sane_fname: - sane_fname = sane_fname.replace(' ', ' ') - - if not len(filename): - sane_fname = 'NONAME' - - # limit filename length - if max_length: - sane_fname = sane_fname[:max_length] - - return sane_fname - - -def process_file(container, filename, data, output_dir=None): - if output_dir: - if not os.path.isdir(output_dir): - log.info('creating output directory %s' % output_dir) - os.mkdir(output_dir) - - fname_prefix = os.path.join(output_dir, - sanitize_filename(filename)) - else: - base_dir = os.path.dirname(filename) - sane_fname = sanitize_filename(filename) - fname_prefix = os.path.join(base_dir, sane_fname) - - # TODO: option to extract objects to files (false by default) - if data is None: - data = open(filename, 'rb').read() - print ('-'*79) - print ('File: %r - %d bytes' % (filename, len(data))) - ole = olefile.OleFileIO(data) - index = 1 - for stream in ole.listdir(): - objdata = ole.openstream(stream).read() - stream_path = '/'.join(stream) - log.debug('Checking stream %r' % stream_path) - obj = OleObject() - try: - obj.parse(objdata) - print('extract file embedded in OLE object from stream %r:' % stream_path) - print('format_id = %d' % obj.format_id) - print('class name = %r' % obj.class_name) - print('data size = %d' % obj.data_size) - # set a file extension according to the class name: - class_name = obj.class_name.lower() - if class_name.startswith('word'): - ext = 'doc' - elif class_name.startswith('package'): - ext = 'package' - else: - ext = 'bin' - - fname = '%s_object_%03d.%s' % (fname_prefix, index, ext) - print ('saving to file %s' % fname) - open(fname, 'wb').write(obj.data) - if obj.class_name.lower() == 'package': - print ('Parsing OLE Package') - opkg = OleNativeStream(bindata=obj.data) - print ('Filename = %r' % opkg.filename) - print ('Source path = %r' % opkg.src_path) - print ('Temp path = %r' % opkg.temp_path) - if opkg.filename: - fname = '%s_%s' % 
(fname_prefix, - sanitize_filename(opkg.filename)) - else: - fname = '%s_object_%03d.noname' % (fname_prefix, index) - print ('saving to file %s' % fname) - open(fname, 'wb').write(opkg.data) - index += 1 - except: - log.debug('*** Not an OLE 1.0 Object') - - - -#=== MAIN ================================================================= - -if __name__ == '__main__': - # print banner with version - print ('oleobj %s - http://decalage.info/oletools' % __version__) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') - print ('') - - DEFAULT_LOG_LEVEL = "warning" # Default log level - LOG_LEVELS = {'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL - } - - usage = 'usage: %prog [options] [filename2 ...]' - parser = optparse.OptionParser(usage=usage) - # parser.add_option('-o', '--outfile', dest='outfile', - # help='output file') - # parser.add_option('-c', '--csv', dest='csv', - # help='export results to a CSV file') - parser.add_option("-r", action="store_true", dest="recursive", - help='find files recursively in subdirectories.') - parser.add_option("-d", type="str", dest="output_dir", - help='use specified directory to output files.', default=None) - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, - help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)') - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. 
(default:*)') - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%default)") - - (options, args) = parser.parse_args() - - # Print help if no arguments are passed - if len(args) == 0: - print (__doc__) - parser.print_help() - sys.exit() - - # Setup logging to the console: - # here we use stdout instead of stderr by default, so that the output - # can be redirected properly. - logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout, - format='%(levelname)-8s %(message)s') - # enable logging in the modules: - log.setLevel(logging.NOTSET) - - - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, - zip_password=options.zip_password, zip_fname=options.zip_fname): - # ignore directory names stored in zip files: - if container and filename.endswith('/'): - continue - process_file(container, filename, data, options.output_dir) - - diff -Nru remnux-oletools-0.51a/oletools/oletimes.py remnux-oletools-0.51a/oletools/oletimes.py --- remnux-oletools-0.51a/oletools/oletimes.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/oletimes.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,108 +0,0 @@ -#!/usr/bin/env python -""" -oletimes.py - -oletimes is a script to parse OLE files such as MS Office documents (e.g. Word, -Excel), to extract creation and modification times of all streams and storages -in the OLE file. - -Usage: oletimes.py - -oletimes project website: http://www.decalage.info/python/oletimes - -oletimes is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -#=== LICENSE ================================================================= - -# oletimes is copyright (c) 2013-2016, Philippe Lagadec (http://www.decalage.info) -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2013-07-24 v0.01 PL: - first version -# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL -# - improved usage display -# 2014-11-30 v0.03 PL: - improved output with prettytable -# 2016-07-20 v0.50 SL: - added Python 3 support -# 2016-09-05 PL: - added main entry point for setup.py - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: -# + optparse -# + nicer output: table with fixed columns, datetime, etc -# + CSV output -# + option to only show available timestamps (by default?) 
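The removed oletimes code trims microseconds from each stream's timestamps before printing its table, via a small `dt2str` helper defined inside `main()`. That helper is easy to lift out and reuse standalone; a minimal sketch using only the standard library (the sample timestamp below is made up for illustration):

```python
import datetime

def dt2str(dt):
    """Convert a datetime to a display string without microseconds.
    None (no timestamp recorded in the OLE file) passes through unchanged."""
    if dt is None:
        return None
    return str(dt.replace(microsecond=0))

print(dt2str(datetime.datetime(2016, 11, 4, 21, 30, 43, 123456)))  # 2016-11-04 21:30:43
print(dt2str(None))  # None
```

Dropping microseconds keeps the table columns narrow without losing information that matters for triage, since OLE directory timestamps rarely need sub-second precision.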
- -#=== IMPORTS ================================================================= - -import sys, datetime -import thirdparty.olefile as olefile -from thirdparty.prettytable import prettytable - - -# === MAIN =================================================================== - -def main(): - # print banner with version - print('oletimes %s - http://decalage.info/python/oletools' % __version__) - - try: - ole = olefile.OleFileIO(sys.argv[1]) - except IndexError: - sys.exit(__doc__) - - def dt2str (dt): - """ - Convert a datetime object to a string for display, without microseconds - - :param dt: datetime.datetime object, or None - :return: str, or None - """ - if dt is None: - return None - dt = dt.replace(microsecond = 0) - return str(dt) - - t = prettytable.PrettyTable(['Stream/Storage name', 'Modification Time', 'Creation Time']) - t.align = 'l' - t.max_width = 26 - #t.border = False - - #print'- Root mtime=%s ctime=%s' % (ole.root.getmtime(), ole.root.getctime()) - t.add_row(('Root', dt2str(ole.root.getmtime()), dt2str(ole.root.getctime()))) - - for obj in ole.listdir(streams=True, storages=True): - #print '- %s: mtime=%s ctime=%s' % (repr('/'.join(obj)), ole.getmtime(obj), ole.getctime(obj)) - t.add_row((repr('/'.join(obj)), dt2str(ole.getmtime(obj)), dt2str(ole.getctime(obj)))) - - print(t) - - ole.close() - -if __name__ == '__main__': - main() diff -Nru remnux-oletools-0.51a/oletools/olevba3.py remnux-oletools-0.51a/oletools/olevba3.py --- remnux-oletools-0.51a/oletools/olevba3.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/olevba3.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,3384 +0,0 @@ -#!/usr/bin/env python -""" -olevba.py - -olevba is a script to parse OLE and OpenXML files such as MS Office documents -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate -and analyze malicious macros. 
- -Supported formats: -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) -- Word 2003 XML (.xml) -- Word/Excel Single File Web Page / MHTML (.mht) - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -olevba is part of the python-oletools package: -http://www.decalage.info/python/oletools - -olevba is based on source code from officeparser by John William Davison -https://github.com/unixfreak0037/officeparser -""" - -# === LICENSE ================================================================== - -# olevba is copyright (c) 2014-2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -# olevba contains modified source code from the officeparser project, published -# under the following MIT License (MIT): -# -# officeparser is copyright (c) 2014 John William Davison -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2014-08-05 v0.01 PL: - first version based on officeparser code -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser -# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats -# and to find the VBA project root anywhere in the file -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API -# - added detect_vba_macros -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes -# - detect auto-executable macros -# - ignore empty macros -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive -# 2014-12-15 v0.08 PL: - improved display for empty macros -# - added pattern extraction -# 2014-12-25 v0.09 PL: - added suspicious keywords detection -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file -# - uses xglob to scan several files with wildcards -# - option -r to recurse subdirectories -# - option -z to scan files in password-protected zips -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns -# - process_file: improved display, shows container file -# - improved list of executable file extensions -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding -# - fixed issue #2, decoding VBA stream names using -# specified codepage and unicode stream names -# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") -# - added several suspicious keywords -# - added option -i to analyze VBA source code directly -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions -# - added 
scan_vba to run all detection algorithms -# - decoded hex strings are now also scanned + reversed -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex -# strings and StrReverse -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding -# - improved display, shows obfuscation name -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename -# - added Base64 obfuscation decoding (contribution from -# @JamesHabben) -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and -# Dridex strings -# - exception handling in detect_base64_strings -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display -# - display exceptions with stack trace -# - added several suspicious keywords -# - improved Base64 detection and decoding -# - fixed triage mode not to scan attrib lines -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML -# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and -# virtualisation detection -# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros -# (issue #10 reported by Greg from SpamStopsHere) -# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header -# (issue #11 reported by Thomas Chopitea) -# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account -# various data offsets (issue #12) -# - improved detection of MSO files, avoiding incorrect -# parsing errors (issue #7) -# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit, -# Davy Douhine (issue #9), issue #13 -# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc) -# 2015-06-19 PL: - added options -a, -c, --each, --attr -# 2015-06-21 v0.32 PL: - always display decoded strings which are printable -# - fix VBA_Scanner.scan to return raw strings, not 
repr() -# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues -# 2015-07-12 PL: - added Hex function decoding to VBA Parser -# 2015-07-13 PL: - added Base64 function decoding to VBA Parser -# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions -# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI -# - fixed issue when analysis was done twice -# 2015-09-15 PL: - remove duplicate IOCs from results -# 2015-09-16 PL: - join long VBA lines ending with underscore before scan -# - disabled unused option --each -# 2015-09-22 v0.41 PL: - added new option --reveal -# - added suspicious strings for PowerShell.exe options -# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method -# 2015-10-10 PL: - added support for text files with VBA source code -# 2015-11-17 PL: - fixed bug with --decode option -# 2015-12-16 PL: - fixed bug in main (no options input anymore) -# - improved logging, added -l option -# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht -# - fixed issue #32 by monkeypatching email.feedparser -# 2016-02-07 PL: - KeyboardInterrupt is now raised properly -# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr -# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords -# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis -# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck) -# 2016-03-16 CH: - added option --no-deobfuscate (temporary) -# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate -# - updated suspicious keywords -# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans -# 2016-04-28 CH: - return an exit code depending on the results -# - improved error and exception handling -# - improved JSON output -# 2016-05-12 CH: - added support for PowerPoint 97-2003 files -# 2016-06-06 CH: - improved handling of unicode VBA module names -# 2016-06-07 CH: - added option --relaxed, stricter parsing by 
default -# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code -# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6 -# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding) -# 2016-10-25 PL: - fixed regex bytes strings (PR/issue #100) - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: -# + setup logging (common with other oletools) -# + add xor bruteforcing like bbharvest -# + options -a and -c should imply -d - -# TODO later: -# + performance improvement: instead of searching each keyword separately, -# first split vba code into a list of words (per line), then check each -# word against a dict. (or put vba words into a set/dict?) -# + for regex, maybe combine them into a single re with named groups? -# + add Yara support, include sample rules? plugins like balbuzard? -# + add balbuzard support -# + output to file (replace print by file.write, sys.stdout by default) -# + look for VBA in embedded documents (e.g. 
Excel in Word) -# + support SRP streams (see Lenny's article + links and sample) -# - python 3.x support -# - check VBA macros in Visio, Access, Project, etc -# - extract_macros: convert to a class, split long function into smaller methods -# - extract_macros: read bytes from stream file objects instead of strings -# - extract_macros: use combined struct.unpack instead of many calls -# - all except clauses should target specific exceptions - -#------------------------------------------------------------------------------ -# REFERENCES: -# - [MS-OVBA]: Microsoft Office VBA File Format Structure -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx -# - officeparser: https://github.com/unixfreak0037/officeparser - - -#--- IMPORTS ------------------------------------------------------------------ - -import sys, logging -import struct -from _io import StringIO,BytesIO -import math -import zipfile -import re -import optparse -import binascii -import base64 -import zlib -import email # for MHTML parsing -import string # for printable -import json # for json output mode (argument --json) - -# import lxml or ElementTree for XML parsing: -try: - # lxml: best performance for XML processing - import lxml.etree as ET -except ImportError: - try: - # Python 2.5+: batteries included - import xml.etree.cElementTree as ET - except ImportError: - try: - # Python <2.5: standalone ElementTree install - import elementtree.cElementTree as ET - except ImportError: - raise(ImportError, "lxml or ElementTree are not installed, " \ - + "see http://codespeak.net/lxml " \ - + "or http://effbot.org/zone/element-index.htm") - -import oletools.thirdparty.olefile as olefile -from oletools.thirdparty.prettytable import prettytable -from oletools.thirdparty.xglob import xglob, PathNotFoundException -from oletools.thirdparty.pyparsing.pyparsing import \ - CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \ - Optional, QuotedString,Regex, Suppress, Word, 
WordStart, \ - alphanums, alphas, hexnums,nums, opAssoc, srange, \ - infixNotation -import oletools.ppt_parser as ppt_parser - -# monkeypatch email to fix issue #32: -# allow header lines without ":" -import email.feedparser -email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') - - -# === LOGGING ================================================================= - -class NullHandler(logging.Handler): - """ - Log Handler without output, to avoid printing messages if logging is not - configured by the main application. - Python 2.7 has logging.NullHandler, but this is necessary for 2.6: - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library - """ - def emit(self, record): - pass - -def get_logger(name, level=logging.CRITICAL+1): - """ - Create a suitable logger object for this module. - The goal is not to change settings of the root logger, to avoid getting - other modules' logs on the screen. - If a logger exists with same name, reuse it. (Else it would have duplicate - handlers and messages would be doubled.) - The level is set to CRITICAL+1 by default, to avoid any logging. - """ - # First, test if there is already a logger with the same name, else it - # will generate duplicate messages (due to duplicate handlers): - if name in logging.Logger.manager.loggerDict: - #NOTE: another less intrusive but more "hackish" solution would be to - # use getLogger then test if its effective level is not default. 
- logger = logging.getLogger(name) - # make sure level is OK: - logger.setLevel(level) - return logger - # get a new logger: - logger = logging.getLogger(name) - # only add a NullHandler for this logger, it is up to the application - # to configure its own logging: - logger.addHandler(NullHandler()) - logger.setLevel(level) - return logger - -# a global logger object used for debugging: -log = get_logger('olevba') - - -#=== EXCEPTIONS ============================================================== - -class OlevbaBaseException(Exception): - """ Base class for exceptions produced here for simpler except clauses """ - def __init__(self, msg, filename=None, orig_exc=None, **kwargs): - if orig_exc: - super(OlevbaBaseException, self).__init__(msg + - ' ({0})'.format(orig_exc), - **kwargs) - else: - super(OlevbaBaseException, self).__init__(msg, **kwargs) - self.msg = msg - self.filename = filename - self.orig_exc = orig_exc - - -class FileOpenError(OlevbaBaseException): - """ raised by VBA_Parser constructor if all open_... 
attempts failed - - probably means the file type is not supported - """ - - def __init__(self, filename, orig_exc=None): - super(FileOpenError, self).__init__( - 'Failed to open file %s' % filename, filename, orig_exc) - - -class ProcessingError(OlevbaBaseException): - """ raised by VBA_Parser.process_file* functions """ - - def __init__(self, filename, orig_exc): - super(ProcessingError, self).__init__( - 'Error processing file %s' % filename, filename, orig_exc) - - -class MsoExtractionError(RuntimeError, OlevbaBaseException): - """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """ - - def __init__(self, msg): - MsoExtractionError.__init__(self, msg) - OlevbaBaseException.__init__(self, msg) - - -class SubstreamOpenError(FileOpenError): - """ special kind of FileOpenError: file is a substream of original file """ - - def __init__(self, filename, subfilename, orig_exc=None): - super(SubstreamOpenError, self).__init__( - str(filename) + '/' + str(subfilename), orig_exc) - self.filename = filename # overwrite setting in OlevbaBaseException - self.subfilename = subfilename - - -class UnexpectedDataError(OlevbaBaseException): - """ raised when parsing is strict (=not relaxed) and data is unexpected """ - - def __init__(self, stream_path, variable, expected, value): - super(UnexpectedDataError, self).__init__( - 'Unexpected value in {0} for variable {1}: ' - 'expected {2:04X} but found {3:04X}!' 
-                .format(stream_path, variable, expected, value))
-        self.stream_path = stream_path
-        self.variable = variable
-        self.expected = expected
-        self.value = value
-
-#--- CONSTANTS ----------------------------------------------------------------
-
-# return codes
-RETURN_OK = 0
-RETURN_WARNINGS = 1            # (reserved, not used yet)
-RETURN_WRONG_ARGS = 2          # (fixed, built into optparse)
-RETURN_FILE_NOT_FOUND = 3
-RETURN_XGLOB_ERR = 4
-RETURN_OPEN_ERROR = 5
-RETURN_PARSE_ERROR = 6
-RETURN_SEVERAL_ERRS = 7
-RETURN_UNEXPECTED = 8
-
-# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python)
-MAC_CODEPAGES = {
-    10000: 'mac-roman',
-    10001: 'shiftjis',  # not found: 'mac-shift-jis',
-    10003: 'ascii',     # nothing appropriate found: 'mac-hangul',
-    10008: 'gb2321',    # not found: 'mac-gb2312',
-    10002: 'big5',      # not found: 'mac-big5',
-    10005: 'hebrew',    # not found: 'mac-hebrew',
-    10004: 'mac-arabic',
-    10006: 'mac-greek',
-    10081: 'mac-turkish',
-    10021: 'thai',      # not found: mac-thai',
-    10029: 'maccentraleurope',  # not found: 'mac-east europe',
-    10007: 'ascii',     # nothing appropriate found: 'mac-russian',
-}
-
-# URL and message to report issues:
-URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues'
-MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES
-
-# Container types:
-TYPE_OLE = 'OLE'
-TYPE_OpenXML = 'OpenXML'
-TYPE_Word2003_XML = 'Word2003_XML'
-TYPE_MHTML = 'MHTML'
-TYPE_TEXT = 'Text'
-TYPE_PPT = 'PPT'
-
-# short tag to display file types in triage mode:
-TYPE2TAG = {
-    TYPE_OLE: 'OLE:',
-    TYPE_OpenXML: 'OpX:',
-    TYPE_Word2003_XML: 'XML:',
-    TYPE_MHTML: 'MHT:',
-    TYPE_TEXT: 'TXT:',
-    TYPE_PPT: 'PPT',
-}
-
-
-# MSO files ActiveMime header magic
-MSO_ACTIVEMIME_HEADER = b'ActiveMime'
-
-MODULE_EXTENSION = "bas"
-CLASS_EXTENSION = "cls"
-FORM_EXTENSION = "frm"
-
-# Namespaces and tags for Word2003 XML parsing:
-NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}'
-# the <w:binData> tag contains the VBA macro
code: -TAG_BINDATA = NS_W + 'binData' -ATTR_NAME = NS_W + 'name' - -# Keywords to detect auto-executable macros -AUTOEXEC_KEYWORDS = { - # MS Word: - 'Runs when the Word document is opened': - ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'), - 'Runs when the Word document is closed': - ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'), - 'Runs when the Word document is modified': - ('DocumentChange',), - 'Runs when a new Word document is created': - ('AutoNew', 'Document_New', 'NewDocument'), - - # MS Excel: - 'Runs when the Excel Workbook is opened': - ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'), - 'Runs when the Excel Workbook is closed': - ('Auto_Close', 'Workbook_Close'), - - #TODO: full list in MS specs?? -} - -# Suspicious Keywords that may be used by malware -# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx -SUSPICIOUS_KEYWORDS = { - #TODO: use regex to support variable whitespaces - 'May read system environment variables': - ('Environ',), - 'May open a file': - ('Open',), - 'May write to a file (if combined with Open)': - #TODO: regex to find Open+Write on same line - ('Write', 'Put', 'Output', 'Print #'), - 'May read or write a binary file (if combined with Open)': - #TODO: regex to find Open+Binary on same line - ('Binary',), - 'May copy a file': - ('FileCopy', 'CopyFile'), - #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx - #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx - 'May delete a file': - ('Kill',), - 'May create a text file': - ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'), - #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx - #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6 - 'May run an executable file or a system command': - ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 
'vbMaximizedFocus', 'vbNormalNoFocus', - 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'), - #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx - #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6 - 'May run PowerShell commands': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc - # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/ - # TODO: add support for keywords starting with a non-alpha character, such as "-noexit" - # TODO: '-command', '-EncodedCommand', '-scriptblock' - ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand', - 'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'), - 'May run an executable file or a system command using PowerShell': - ('Start-Process',), - 'May hide the application': - ('Application.Visible', 'ShowWindow', 'SW_HIDE'), - 'May create a directory': - ('MkDir',), - 'May save the current workbook': - ('ActiveWorkbook.SaveAs',), - 'May change which directory contains files to open at startup': - #TODO: confirm the actual effect - ('Application.AltStartupPath',), - 'May create an OLE object': - ('CreateObject',), - 'May create an OLE object using PowerShell': - ('New-Object',), - 'May run an application (if combined with CreateObject)': - ('Shell.Application',), - 'May enumerate application windows (if combined with Shell.Application object)': - ('Windows', 'FindWindow'), - 'May run code from a DLL': - #TODO: regex to find declare+lib on same line - ('Lib',), - 'May inject code into another process': - ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload - ), - 'May download files from the Internet': - #TODO: regex to find urlmon+URLDownloadToFileA on same line - ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP', - 
'MSXML2.ServerXMLHTTP', # suggested in issue #13 - 'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z - ), - 'May download files from the Internet using PowerShell': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - ('Net.WebClient', 'DownloadFile', 'DownloadString'), - 'May control another application by simulating user keystrokes': - ('SendKeys', 'AppActivate'), - #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx - 'May attempt to obfuscate malicious function calls': - ('CallByName',), - #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx - 'May attempt to obfuscate specific strings': - #TODO: regex to find several Chr*, not just one - ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'), - #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx - 'May read or write registry keys': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'), - 'May read registry keys': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - ('RegQueryValueExA', 'RegQueryValueEx', - 'RegRead', #with Wscript.Shell - ), - 'May detect virtualization': - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'), - 'May detect Anubis Sandbox': - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA - # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf - ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll - '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId', - '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 
'popupkiller' - ), - 'May detect Sandboxie': - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('SbieDll.dll', 'SandboxieControlWndClass'), - 'May detect Sunbelt Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - (r'C:\file.exe',), - 'May detect Norman Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('currentuser',), - 'May detect CW Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('Schmidti',), - 'May detect WinJail Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('Afx:400000:0',), - 'Memory manipulation': - ('VirtualAllocEx', 'RtlMoveMemory'), -} - -# Regular Expression for a URL: -# http://en.wikipedia.org/wiki/Uniform_resource_locator -# http://www.w3.org/Addressing/URL/uri-spec.html -#TODO: also support username:password@server -#TODO: other protocols (file, gopher, wais, ...?) -SCHEME = r'\b(?:http|ftp)s?' -# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains -TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})' -DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')' -#TODO: IPv6 - see https://www.debuggex.com/ -# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a] -NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])' -IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255 -# IPv4 must come before the DNS name because it is more specific -SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')' -PORT = r'(?:\:[0-9]{1,5})?' -SERVER_PORT = SERVER + PORT -URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' 
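The URL regex in this hunk is assembled from the fragments `SCHEME`, `TLD`, `DNS_NAME`, `IPv4`, `SERVER`, `PORT` and `URL_PATH`. A quick sanity check of that same composition, reusing the fragments verbatim:

```python
import re

# fragments copied from the definitions in the hunk above
SCHEME = r'\b(?:http|ftp)s?'
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')'
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')'   # IPv4 first: it is more specific
PORT = r'(?:\:[0-9]{1,5})?'
URL_PATH = r"(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?"
re_url = re.compile(SCHEME + r'\://' + SERVER + PORT + URL_PATH)
```

With this composition the same pattern matches both IP-based and DNS-based URLs, with or without an explicit port.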
# [^\.\,\)\(\s"] -URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH -re_url = re.compile(URL_RE) - - -# Patterns to be extracted (IP addresses, URLs, etc) -# From patterns.py in balbuzard -RE_PATTERNS = ( - ('URL', re.compile(URL_RE)), - ('IPv4 address', re.compile(IPv4)), - # TODO: add IPv6 - ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + '\b')), - # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(? char -vba_chr = Suppress( - Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr') - + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$')) - + '(') + vba_expr_int + Suppress(')') - -def vba_chr_tostr(t): - try: - i = t[0] - # normal, non-unicode character: - if i>=0 and i<=255: - return VbaExpressionString(chr(i)) - else: - return VbaExpressionString(unichr(i).encode('utf-8', 'backslashreplace')) - except ValueError: - log.exception('ERROR: incorrect parameter value for chr(): %r' % i) - return VbaExpressionString('Chr(%r)' % i) - -vba_chr.setParseAction(vba_chr_tostr) - - -# --- ASC -------------------------------------------------------------------- - -# Asc(char) => int -#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW -vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')') -vba_asc.setParseAction(lambda t: ord(t[0])) - - -# --- VAL -------------------------------------------------------------------- - -# Val(string) => int -# TODO: make sure the behavior of VBA's val is fully covered -vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')') -vba_val.setParseAction(lambda t: int(t[0].strip())) - - -# --- StrReverse() -------------------------------------------------------------------- - -# StrReverse(string) => string -strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')') -strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1])) - - -# --- ENVIRON() 
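The `Chr`/`ChrB`/`ChrW` grammar above feeds `vba_chr_tostr`, which turns `Chr(72)` into the character `H`. For intuition only (this is a toy regex decoder for the simplest pattern, not the real pyparsing grammar, which also evaluates nested arithmetic arguments):

```python
import re

def decode_chr_concat(expr):
    # toy decoder: handle only literal integer arguments such as Chr(72) & Chr(105);
    # olevba's actual grammar evaluates full integer expressions inside Chr()
    chars = re.findall(r'(?i)\bChr[BW]?\$?\s*\(\s*(\d+)\s*\)', expr)
    return ''.join(chr(int(c)) for c in chars)
```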
-------------------------------------------------------------------- - -# Environ("name") => just translated to "%name%", that is enough for malware analysis -environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')') -environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0])) - - -# --- IDENTIFIER ------------------------------------------------------------- - -#TODO: see MS-VBAL 3.3.5 page 33 -# 3.3.5 Identifier Tokens -# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character -# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z -# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore -latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_') - -# --- HEX FUNCTION ----------------------------------------------------------- - -# match any custom function name with a hex string as argument: -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime - -# quoted string of at least two hexadecimal numbers of two digits: -quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"') -quoted_hex_string.setParseAction(lambda t: str(t[0])) - -hex_function_call = Suppress(latin_identifier) + Suppress('(') + \ - quoted_hex_string('hex_string') + Suppress(')') -hex_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string))) - - -# --- BASE64 FUNCTION ----------------------------------------------------------- - -# match any custom function name with a Base64 string as argument: -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime - -# quoted string of at least two hexadecimal numbers of two digits: -quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"') -quoted_base64_string.setParseAction(lambda t: str(t[0])) - -base64_function_call = Suppress(latin_identifier) 
+ Suppress('(') + \ - quoted_base64_string('base64_string') + Suppress(')') -base64_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string))) - - -# ---STRING EXPRESSION ------------------------------------------------------- - -def concat_strings_list(tokens): - """ - parse action to concatenate strings in a VBA expression with operators '+' or '&' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of strings such as [a,'&',b,'&',c,...] - strings = tokens[0][::2] - return VbaExpressionString(''.join(strings)) - - -vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call) - -vba_expr_str <<= infixNotation(vba_expr_str_item, - [ - ("+", 2, opAssoc.LEFT, concat_strings_list), - ("&", 2, opAssoc.LEFT, concat_strings_list), - ]) - - -# --- INTEGER EXPRESSION ------------------------------------------------------- - -def sum_ints_list(tokens): - """ - parse action to sum integers in a VBA expression with operator '+' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] - integers = tokens[0][::2] - return sum(integers) - - -def subtract_ints_list(tokens): - """ - parse action to subtract integers in a VBA expression with operator '-' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] - integers = tokens[0][::2] - return reduce(lambda x,y:x-y, integers) - - -def multiply_ints_list(tokens): - """ - parse action to multiply integers in a VBA expression with operator '*' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] 
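The integer parse actions below fold their token lists with `reduce`, which keeps `-` and `/` left-associative (so `10 - 3 - 2` evaluates as `(10 - 3) - 2`). Note that on Python 3 `reduce` lives in `functools`, so the module needs `from functools import reduce` somewhere above this hunk. A standalone equivalent (plain lists here, whereas the parse actions receive pyparsing token groups and strip the operators with `[::2]`):

```python
from functools import reduce

def subtract_ints(integers):
    # left-associative fold, as in subtract_ints_list
    return reduce(lambda x, y: x - y, integers)

def divide_ints(integers):
    # left-associative fold, as in divide_ints_list
    return reduce(lambda x, y: x / y, integers)
```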
- integers = tokens[0][::2] - return reduce(lambda x,y:x*y, integers) - - -def divide_ints_list(tokens): - """ - parse action to divide integers in a VBA expression with operator '/' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] - integers = tokens[0][::2] - return reduce(lambda x,y:x/y, integers) - - -vba_expr_int_item = (vba_asc | vba_val | integer) - -# operators associativity: -# https://en.wikipedia.org/wiki/Operator_associativity - -vba_expr_int <<= infixNotation(vba_expr_int_item, - [ - ("*", 2, opAssoc.LEFT, multiply_ints_list), - ("/", 2, opAssoc.LEFT, divide_ints_list), - ("-", 2, opAssoc.LEFT, subtract_ints_list), - ("+", 2, opAssoc.LEFT, sum_ints_list), - ]) - - -# see detect_vba_strings for the deobfuscation code using this grammar - -# === MSO/ActiveMime files parsing =========================================== - -def is_mso_file(data): - """ - Check if the provided data is the content of a MSO/ActiveMime file, such as - the ones created by Outlook in some cases, or Word/Excel when saving a - file with the MHTML format or the Word 2003 XML format. - This function only checks the ActiveMime magic at the beginning of data. - :param data: bytes string, MSO/ActiveMime file content - :return: bool, True if the file is MSO, False otherwise - """ - return data.startswith(MSO_ACTIVEMIME_HEADER) - - -# regex to find zlib block headers, starting with byte 0x78 = 'x' -re_zlib_header = re.compile(r'x') - - -def mso_file_extract(data): - """ - Extract the data stored into a MSO/ActiveMime file, such as - the ones created by Outlook in some cases, or Word/Excel when saving a - file with the MHTML format or the Word 2003 XML format. 
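The docstring here describes `mso_file_extract`. Its core idea is: read the compressed-data offset from the ActiveMime header, and if that fails, brute-force every `0x78` (`'x'`) byte as a candidate zlib block start. A simplified, self-contained sketch of that fallback (the `ActiveMime` magic matches `MSO_ACTIVEMIME_HEADER` upstream; this is a stand-in, not the full implementation):

```python
import re
import zlib

MSO_ACTIVEMIME_HEADER = b'ActiveMime'  # magic bytes at offset 0

def mso_extract_sketch(data):
    # Simplified mso_file_extract: when the header offset cannot be trusted,
    # olevba tries each 0x78 ('x') byte as the start of a zlib stream.
    if not data.startswith(MSO_ACTIVEMIME_HEADER):
        raise ValueError('not an MSO/ActiveMime file')
    for match in re.finditer(b'x', data):
        try:
            return zlib.decompress(data[match.start():])
        except zlib.error:
            continue
    raise ValueError('no zlib block found')
```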
- - :param data: bytes string, MSO/ActiveMime file content - :return: bytes string, extracted data (uncompressed) - - raise a MsoExtractionError if the data cannot be extracted - """ - # check the magic: - assert is_mso_file(data) - - # In all the samples seen so far, Word always uses an offset of 0x32, - # and Excel 0x22A. But we read the offset from the header to be more - # generic. - offsets = [0x32, 0x22A] - - # First, attempt to get the compressed data offset from the header - # According to my tests, it should be an unsigned 16 bits integer, - # at offset 0x1E (little endian) + add 46: - try: - offset = struct.unpack_from('> bit_count - offset_mask = ~length_mask - maximum_length = (0xFFFF >> bit_count) + 3 - return length_mask, offset_mask, bit_count, maximum_length - - -def decompress_stream(compressed_container): - """ - Decompress a stream according to MS-OVBA section 2.4.1 - - compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm - return the decompressed container as a string (bytes) - """ - # 2.4.1.2 State Variables - - # The following state is maintained for the CompressedContainer (section 2.4.1.1.1): - # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1). - # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by - # decompression or to be written by compression. - - # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4): - # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the - # CompressedContainer (section 2.4.1.1.1). - - # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2): - # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by - # decompression or to be read by compression. 
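The `return length_mask, offset_mask, bit_count, maximum_length` line in this hunk belongs to `copytoken_help`, whose body appears truncated in the diff text (the `"<H"` struct format strings and nearby lines look HTML-stripped). Per MS-OVBA 2.4.1.3.19.1 ("CopyToken Help"), the helper computes the CopyToken masks as follows (a reconstruction from the spec, not the verbatim upstream code):

```python
import math

def copytoken_help(decompressed_current, decompressed_chunk_start):
    # MS-OVBA 2.4.1.3.19.1: mask widths depend on how far into the
    # decompressed chunk we currently are
    difference = decompressed_current - decompressed_chunk_start
    bit_count = max(int(math.ceil(math.log(difference, 2))), 4)
    length_mask = 0xFFFF >> bit_count
    offset_mask = ~length_mask
    maximum_length = (0xFFFF >> bit_count) + 3
    return length_mask, offset_mask, bit_count, maximum_length
```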
- # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2). - - # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3): - # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the - # DecompressedBuffer (section 2.4.1.1.2). - - decompressed_container = b'' # result - compressed_current = 0 - - sig_byte = compressed_container[compressed_current] - if sig_byte != 0x01: - raise ValueError('invalid signature byte {0:02X}'.format(sig_byte)) - - compressed_current += 1 - - #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that - # CompressedRecordEnd = len(compressed_container) - while compressed_current < len(compressed_container): - # 2.4.1.1.5 - compressed_chunk_start = compressed_current - # chunk header = first 16 bits - compressed_chunk_header = \ - struct.unpack("> 12) & 0x07 - if chunk_signature != 0b011: - raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream') - # chunk flag = next bit - 1 == compressed, 0 == uncompressed - chunk_flag = (compressed_chunk_header >> 15) & 0x01 - log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag)) - - #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096) - # The minimum size is 3 bytes - # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value - # in chunk header before adding 3. - # Also the first test is not useful since a 12 bits value cannot be larger than 4095. 
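Several `struct.unpack` calls in this hunk lost their `"<H"` format strings (apparently stripped as HTML tags). The chunk-header decoding and CopyToken unpacking they perform can be reconstructed from MS-OVBA 2.4.1.1.5 and 2.4.1.3.19.2 (a hedged reconstruction; the names mirror the surrounding code):

```python
import struct

def parse_chunk_header(header_bytes):
    # CompressedChunkHeader is a 16-bit little-endian value (MS-OVBA 2.4.1.1.5)
    header = struct.unpack('<H', header_bytes)[0]
    chunk_size = (header & 0x0FFF) + 3        # low 12 bits, plus 3
    chunk_signature = (header >> 12) & 0x07   # 3 bits, must be 0b011
    chunk_flag = (header >> 15) & 0x01        # 1 == compressed chunk
    return chunk_size, chunk_signature, chunk_flag

def decode_copy_token(copy_token, length_mask, offset_mask, bit_count):
    # MS-OVBA 2.4.1.3.19.2 "Unpack CopyToken"
    length = (copy_token & length_mask) + 3
    offset = ((copy_token & offset_mask) >> (16 - bit_count)) + 1
    return offset, length
```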
- if chunk_flag == 1 and chunk_size > 4098: - raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1') - if chunk_flag == 0 and chunk_size != 4098: - raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0') - - # check if chunk_size goes beyond the compressed data, instead of silently cutting it: - #TODO: raise an exception? - if compressed_chunk_start + chunk_size > len(compressed_container): - log.warning('Chunk size is larger than remaining compressed data') - compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size]) - # read after chunk header: - compressed_current = compressed_chunk_start + 2 - - if chunk_flag == 0: - # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk - # uncompressed chunk: read the next 4096 bytes as-is - #TODO: check if there are at least 4096 bytes left - decompressed_container += bytes([compressed_container[compressed_current:compressed_current + 4096]]) - compressed_current += 4096 - else: - # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk - # compressed chunk - decompressed_chunk_start = len(decompressed_container) - while compressed_current < compressed_end: - # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence - # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end)) - # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or - # copy tokens (reference to a previous literal token) - flag_byte = compressed_container[compressed_current] - compressed_current += 1 - for bit_index in range(0, 8): - # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end)) - if compressed_current >= compressed_end: - break - # MS-OVBA 2.4.1.3.5 Decompressing a Token - # MS-OVBA 2.4.1.3.17 Extract FlagBit - flag_bit = (flag_byte >> bit_index) & 1 - #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit)) - if flag_bit == 0: # LiteralToken - 
# copy one byte directly to output - decompressed_container += bytes([compressed_container[compressed_current]]) - compressed_current += 1 - else: # CopyToken - # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken - copy_token = \ - struct.unpack("> temp2) + 1 - #log.debug('offset=%d length=%d' % (offset, length)) - copy_source = len(decompressed_container) - offset - for index in range(copy_source, copy_source + length): - decompressed_container += bytes([decompressed_container[index]]) - compressed_current += 2 - return decompressed_container - - -def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): - """ - Extract VBA macros from an OleFileIO object. - Internal function, do not call directly. - - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream - vba_project: path to the PROJECT stream - :param relaxed: If True, only create info/debug log entry if data is not as expected - (e.g. opening substream fails); if False, raise an error in this case - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream - """ - # Open the PROJECT stream: - project = ole.openstream(project_path) - log.debug('relaxed is %s' % relaxed) - - # sample content of the PROJECT stream: - - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" - ## Document=ThisDocument/&H00000000 - ## Module=NewMacros - ## Name="Project" - ## HelpContextID="0" - ## VersionCompatible32="393222000" - ## CMG="F1F301E705E705E705E705" - ## DPB="8F8D7FE3831F2020202020" - ## GC="2D2FDD81E51EE61EE6E1" - ## - ## [Host Extender Info] - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 - ## - ## [Workspace] - ## ThisDocument=22, 29, 339, 477, Z - ## NewMacros=-4, 42, 832, 510, C - - code_modules = {} - - for line in project: - line = line.strip().decode('utf-8','ignore') - if '=' in line: - # split line at the 1st equal sign: - name, value = 
line.split('=', 1) - # looking for code modules - # add the code module as a key in the dictionary - # the value will be the extension needed later - # The value is converted to lowercase, to allow case-insensitive matching (issue #3) - value = value.lower() - if name == 'Document': - # split value at the 1st slash, keep 1st part: - value = value.split('/', 1)[0] - code_modules[value] = CLASS_EXTENSION - elif name == 'Module': - code_modules[value] = MODULE_EXTENSION - elif name == 'Class': - code_modules[value] = CLASS_EXTENSION - elif name == 'BaseClass': - code_modules[value] = FORM_EXTENSION - - # read data from dir stream (compressed) - dir_compressed = ole.openstream(dir_path).read() - - def check_value(name, expected, value): - if expected != value: - if relaxed: - log.error("invalid value for {0} expected {1:04X} got {2:04X}" - .format(name, expected, value)) - else: - raise UnexpectedDataError(dir_path, name, expected, value) - - dir_stream = BytesIO(decompress_stream(dir_compressed)) - - # PROJECTSYSKIND Record - projectsyskind_id = struct.unpack(" 128: - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) - projectname_projectname = dir_stream.read(projectname_sizeof_projectname) - unused = projectname_projectname - - # PROJECTDOCSTRING Record - projectdocstring_id = struct.unpack(" 2000: - log.error( - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) - projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) - projectdocstring_reserved = struct.unpack(" 260: - log.error( - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) - projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) - projecthelpfilepath_reserved = struct.unpack(" 1015: - log.error( - "PROJECTCONSTANTS_SizeOfConstants value not in range: 
{0}".format(projectconstants_sizeof_constants)) - projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) - projectconstants_reserved = struct.unpack(" 0: - code_data = decompress_stream(code_data) - # case-insensitive search in the code_modules dict to find the file extension: - filext = code_modules.get(modulename_modulename.lower(), 'bin') - filename = '{0}.{1}'.format(modulename_modulename, filext) - #TODO: also yield the codepage so that callers can decode it properly - yield (code_path, filename, code_data) - # print '-'*79 - # print filename - # print '' - # print code_data - # print '' - log.debug('extracted file {0}'.format(filename)) - else: - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) - except (UnexpectedDataError, SubstreamOpenError): - raise - except Exception as exc: - log.info('Error parsing module {0} of {1} in _extract_vba:' - .format(projectmodule_index, projectmodules_count), - exc_info=True) - if not relaxed: - raise - _ = unused # make pylint happy: now variable "unused" is being used ;-) - return - - -def vba_collapse_long_lines(vba_code): - """ - Parse a VBA module code to detect continuation line characters (underscore) and - collapse split lines. Continuation line characters are replaced by spaces. - - :param vba_code: str, VBA module code - :return: str, VBA module code with long lines collapsed - """ - # TODO: use a regex instead, to allow whitespaces after the underscore? - vba_code = vba_code.replace(' _\r\n', ' ') - vba_code = vba_code.replace(' _\r', ' ') - vba_code = vba_code.replace(' _\n', ' ') - return vba_code - - -def filter_vba(vba_code): - """ - Filter VBA source code to remove the first lines starting with "Attribute VB_", - which are automatically added by MS Office and not displayed in the VBA Editor. - This should only be used when displaying source code for human analysis. 
- - Note: lines are not filtered if they contain a colon, because it could be - used to hide malicious instructions. - - :param vba_code: str, VBA source code - :return: str, filtered VBA source code - """ - vba_lines = vba_code.splitlines() - start = 0 - for line in vba_lines: - if line.startswith("Attribute VB_") and not ':' in line: - start += 1 - else: - break - #TODO: also remove empty lines? - vba = '\n'.join(vba_lines[start:]) - return vba - - -def detect_autoexec(vba_code, obfuscation=None): - """ - Detect if the VBA code contains keywords corresponding to macros running - automatically when triggered by specific actions (e.g. when a document is - opened or closed). - - :param vba_code: str, VBA source code - :param obfuscation: None or str, name of obfuscation to be added to description - :return: list of str tuples (keyword, description) - """ - #TODO: merge code with detect_suspicious - # case-insensitive search - #vba_code = vba_code.lower() - results = [] - obf_text = '' - if obfuscation: - obf_text = ' (obfuscation: %s)' % obfuscation - for description, keywords in AUTOEXEC_KEYWORDS.items(): - for keyword in keywords: - #TODO: if keyword is already a compiled regex, use it as-is - # search using regex to detect word boundaries: - if re.search(r'(?i)\b' + keyword + r'\b', vba_code): - #if keyword.lower() in vba_code: - results.append((keyword, description + obf_text)) - return results - - -def detect_suspicious(vba_code, obfuscation=None): - """ - Detect if the VBA code contains suspicious keywords corresponding to - potential malware behaviour. 
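`detect_autoexec` above and `detect_suspicious` below share the same matching core: one case-insensitive, word-boundary regex search per keyword against the (possibly decoded) VBA source. A minimal standalone version (with `re.escape` added here for safety; the original interpolates the keyword unescaped, so keywords containing metacharacters such as `WScript.Shell` match slightly more loosely upstream):

```python
import re

# one entry in the same shape as AUTOEXEC_KEYWORDS / SUSPICIOUS_KEYWORDS
KEYWORDS = {
    'May run an executable file or a system command': ('Shell', 'WScript.Shell', 'Run'),
}

def detect_keywords(vba_code, keywords_dict):
    # case-insensitive word-boundary search, as in detect_autoexec/detect_suspicious
    results = []
    for description, keywords in keywords_dict.items():
        for keyword in keywords:
            if re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code):
                results.append((keyword, description))
    return results
```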
- - :param vba_code: str, VBA source code - :param obfuscation: None or str, name of obfuscation to be added to description - :return: list of str tuples (keyword, description) - """ - # case-insensitive search - #vba_code = vba_code.lower() - results = [] - obf_text = '' - if obfuscation: - obf_text = ' (obfuscation: %s)' % obfuscation - for description, keywords in SUSPICIOUS_KEYWORDS.items(): - for keyword in keywords: - # search using regex to detect word boundaries: - if re.search(r'(?i)\b' + keyword + r'\b', vba_code): - #if keyword.lower() in vba_code: - results.append((keyword, description + obf_text)) - return results - - -def detect_patterns(vba_code, obfuscation=None): - """ - Detect if the VBA code contains specific patterns such as IP addresses, - URLs, e-mail addresses, executable file names, etc. - - :param vba_code: str, VBA source code - :return: list of str tuples (pattern type, value) - """ - results = [] - found = set() - obf_text = '' - if obfuscation: - obf_text = ' (obfuscation: %s)' % obfuscation - for pattern_type, pattern_re in RE_PATTERNS: - for match in pattern_re.finditer(vba_code): - value = match.group() - if value not in found: - results.append((pattern_type + obf_text, value)) - found.add(value) - return results - - -def detect_hex_strings(vba_code): - """ - Detect if the VBA code contains strings encoded in hexadecimal. - - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - results = [] - found = set() - for match in re_hex_string.finditer(vba_code): - value = match.group() - if value not in found: - decoded = binascii.unhexlify(value) - results.append((value, decoded.decode('utf-8','replace'))) - found.add(value) - return results - - -def detect_base64_strings(vba_code): - """ - Detect if the VBA code contains strings encoded in base64. 
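`detect_hex_strings` above relies on `re_hex_string`, which is defined earlier in the file, outside this hunk. A self-contained sketch using a stand-in pattern (quoted runs of at least four hex byte pairs; the real pattern may differ):

```python
import binascii
import re

# stand-in for olevba's re_hex_string, which is defined outside this hunk
re_hex_string = re.compile(r'"(?:[0-9A-Fa-f]{2}){4,}"')

def detect_hex(vba_code):
    # find quoted hex strings and pair each with its decoded text
    results = []
    for match in re_hex_string.finditer(vba_code):
        value = match.group().strip('"')
        results.append((value, binascii.unhexlify(value).decode('utf-8', 'replace')))
    return results
```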
- - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - #TODO: avoid matching simple hex strings as base64? - results = [] - found = set() - for match in re_base64_string.finditer(vba_code): - # extract the base64 string without quotes: - value = match.group().strip('"') - # check it is not just a hex string: - if not re_nothex_check.search(value): - continue - # only keep new values and not in the whitelist: - if value not in found and value.lower() not in BASE64_WHITELIST: - try: - decoded = base64.b64decode(value) - results.append((value, decoded.decode('utf-8','replace'))) - found.add(value) - except (TypeError, ValueError) as exc: - log.debug('Failed to base64-decode (%s)' % exc) - # if an exception occurs, it is likely not a base64-encoded string - return results - - -def detect_dridex_strings(vba_code): - """ - Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples. - - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - from oletools.thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode - - results = [] - found = set() - for match in re_dridex_string.finditer(vba_code): - value = match.group()[1:-1] - # check it is not just a hex string: - if not re_nothex_check.search(value): - continue - if value not in found: - try: - decoded = DridexUrlDecode(value) - results.append((value, decoded)) - found.add(value) - except Exception as exc: - log.debug('Failed to Dridex-decode (%s)' % exc) - # if an exception occurs, it is likely not a dridex-encoded string - return results - - -def detect_vba_strings(vba_code): - """ - Detect if the VBA code contains strings obfuscated with VBA expressions - using keywords such as Chr, Asc, Val, StrReverse, etc. 
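The `re_nothex_check` filter used by `detect_base64_strings` exists because every string made only of hex digits is also plausible Base64; requiring at least one letter outside the hex range cheaply rejects those false positives. A sketch (the stand-in `re_nothex_check` pattern is an assumption; the real one is defined outside this hunk):

```python
import base64
import re

# assumed pattern: any letter outside 0-9/A-F proves "not just a hex string"
re_nothex_check = re.compile(r'[G-Zg-z]')

def try_base64(value):
    # mirror detect_base64_strings: skip strings that could simply be hex
    if not re_nothex_check.search(value):
        return None
    try:
        return base64.b64decode(value).decode('utf-8', 'replace')
    except (TypeError, ValueError):
        # likely not a base64-encoded string
        return None
```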
- - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - # TODO: handle exceptions - results = [] - found = set() - # IMPORTANT: to extract the actual VBA expressions found in the code, - # we must expand tabs to have the same string as pyparsing. - # Otherwise, start and end offsets are incorrect. - vba_code = vba_code.expandtabs() - for tokens, start, end in vba_expr_str.scanString(vba_code): - encoded = vba_code[start:end] - decoded = tokens[0] - if isinstance(decoded, VbaExpressionString): - # This is a VBA expression, not a simple string - # print 'VBA EXPRESSION: encoded=%r => decoded=%r' % (encoded, decoded) - # remove parentheses and quotes from original string: - # if encoded.startswith('(') and encoded.endswith(')'): - # encoded = encoded[1:-1] - # if encoded.startswith('"') and encoded.endswith('"'): - # encoded = encoded[1:-1] - # avoid duplicates and simple strings: - if encoded not in found and decoded != encoded: - results.append((encoded, decoded)) - found.add(encoded) - # else: - # print 'VBA STRING: encoded=%r => decoded=%r' % (encoded, decoded) - return results - - -def json2ascii(json_obj, encoding='utf8', errors='replace'): - """ ensure there is no unicode in json and all strings are safe to decode - - works recursively, decodes and re-encodes every string to/from unicode - to ensure there will be no trouble in loading the dumped json output - """ - if json_obj is None: - pass - elif isinstance(json_obj, (bool, int, float)): - pass - elif isinstance(json_obj, str): - # de-code and re-encode - dencoded = json_obj - if dencoded != json_obj: - log.debug('json2ascii: replaced: {0} (len {1})' - .format(json_obj, len(json_obj))) - log.debug('json2ascii: with: {0} (len {1})' - .format(dencoded, len(dencoded))) - return dencoded - elif isinstance(json_obj, bytes): - log.debug('json2ascii: encode unicode: {0}' - .format(json_obj.decode(encoding, errors))) - # cannot put original into logger - # 
print 'original: ' json_obj - return json_obj.decode(encoding, errors) - elif isinstance(json_obj, dict): - for key in json_obj: - json_obj[key] = json2ascii(json_obj[key]) - elif isinstance(json_obj, (list,tuple)): - for item in json_obj: - item = json2ascii(item) - else: - log.debug('unexpected type in json2ascii: {0} -- leave as is' - .format(type(json_obj))) - return json_obj - - -_have_printed_json_start = False - -def print_json(json_dict=None, _json_is_last=False, **json_parts): - """ line-wise print of json.dumps(json2ascii(..)) with options and indent+1 - - can use in two ways: - (1) print_json(some_dict) - (2) print_json(key1=value1, key2=value2, ...) - - :param bool _json_is_last: set to True only for very last entry to complete - the top-level json-list - """ - global _have_printed_json_start - - if json_dict and json_parts: - raise ValueError('Invalid json argument: want either single dict or ' - 'key=value parts but got both)') - elif (json_dict is not None) and (not isinstance(json_dict, dict)): - raise ValueError('Invalid json argument: want either single dict or ' - 'key=value parts but got {0} instead of dict)' - .format(type(json_dict))) - if json_parts: - json_dict = json_parts - - if not _have_printed_json_start: - print('[') - _have_printed_json_start = True - - lines = json.dumps(json2ascii(json_dict), check_circular=False, - indent=4, ensure_ascii=False).splitlines() - for line in lines[:-1]: - print(' {0}'.format(line)) - if _json_is_last: - print(' {0}'.format(lines[-1])) # print last line without comma - print(']') - else: - print(' {0},'.format(lines[-1])) # print last line with comma - - -class VBA_Scanner(object): - """ - Class to scan the source code of a VBA module to find obfuscated strings, - suspicious keywords, IOCs, auto-executable macros, etc. 
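`print_json` above streams each entry of a single top-level JSON list, printing the closing `]` only when `_json_is_last` is set and a trailing comma otherwise. The bracket/comma logic can be checked against a small non-streaming equivalent:

```python
import json

def emit_json_list(entries):
    # non-streaming equivalent of print_json's bracket/comma handling:
    # comma after every entry except the last, then the closing bracket
    lines = ['[']
    for index, entry in enumerate(entries):
        body = json.dumps(entry, indent=4).replace('\n', '\n    ')
        comma = '' if index == len(entries) - 1 else ','
        lines.append('    ' + body + comma)
    lines.append(']')
    return '\n'.join(lines)
```

The output round-trips through `json.loads`, which is exactly the property the streaming version needs to preserve.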
- """ - - def __init__(self, vba_code): - """ - VBA_Scanner constructor - - :param vba_code: str, VBA source code to be analyzed - """ - # join long lines ending with " _": - self.code = vba_collapse_long_lines(vba_code) - self.code_hex = '' - self.code_hex_rev = '' - self.code_rev_hex = '' - self.code_base64 = '' - self.code_dridex = '' - self.code_vba = '' - self.strReverse = None - # results = None before scanning, then a list of tuples after scanning - self.results = None - self.autoexec_keywords = None - self.suspicious_keywords = None - self.iocs = None - self.hex_strings = None - self.base64_strings = None - self.dridex_strings = None - self.vba_strings = None - - - def scan(self, include_decoded_strings=False, deobfuscate=False): - """ - Analyze the provided VBA code to detect suspicious keywords, - auto-executable macros, IOC patterns, obfuscation patterns - such as hex-encoded strings. - - :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. 
- :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - :return: list of tuples (type, keyword, description) - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') - """ - # First, detect and extract hex-encoded strings: - self.hex_strings = detect_hex_strings(self.code) - # detect if the code contains StrReverse: - self.strReverse = False - if 'strreverse' in self.code.lower(): self.strReverse = True - # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: - for encoded, decoded in self.hex_strings: - self.code_hex += '\n' + decoded - # if the code contains "StrReverse", also append the hex strings in reverse order: - if self.strReverse: - # StrReverse after hex decoding: - self.code_hex_rev += '\n' + decoded[::-1] - # StrReverse before hex decoding: - self.code_rev_hex += '\n' + str(binascii.unhexlify(encoded[::-1])) - #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ - #TODO: also append the full code reversed if StrReverse? (risk of false positives?) 
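The two hex/StrReverse orderings in `scan` above are genuinely different transforms, which is why `code_hex_rev` (reverse after hex-decoding) and `code_rev_hex` (reverse before hex-decoding) are tracked separately. A minimal illustration:

```python
import binascii

encoded = '414243'                                    # hex for 'ABC'
decoded = binascii.unhexlify(encoded).decode()        # 'ABC'
hex_then_reverse = decoded[::-1]                      # StrReverse after hex decoding
reverse_then_hex = binascii.unhexlify(encoded[::-1])  # StrReverse before hex decoding
```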
- # Detect Base64-encoded strings - self.base64_strings = detect_base64_strings(self.code) - for encoded, decoded in self.base64_strings: - self.code_base64 += '\n' + decoded - # Detect Dridex-encoded strings - self.dridex_strings = detect_dridex_strings(self.code) - for encoded, decoded in self.dridex_strings: - self.code_dridex += '\n' + decoded - # Detect obfuscated strings in VBA expressions - if deobfuscate: - self.vba_strings = detect_vba_strings(self.code) - else: - self.vba_strings = [] - for encoded, decoded in self.vba_strings: - self.code_vba += '\n' + decoded - results = [] - self.autoexec_keywords = [] - self.suspicious_keywords = [] - self.iocs = [] - - for code, obfuscation in ( - (self.code, None), - (self.code_hex, 'Hex'), - (self.code_hex_rev, 'Hex+StrReverse'), - (self.code_rev_hex, 'StrReverse+Hex'), - (self.code_base64, 'Base64'), - (self.code_dridex, 'Dridex'), - (self.code_vba, 'VBA expression'), - ): - if isinstance(code,bytes): - code=code.decode('utf-8','replace') - self.autoexec_keywords += detect_autoexec(code, obfuscation) - self.suspicious_keywords += detect_suspicious(code, obfuscation) - self.iocs += detect_patterns(code, obfuscation) - - # If hex-encoded strings were discovered, add an item to suspicious keywords: - if self.hex_strings: - self.suspicious_keywords.append(('Hex Strings', - 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) - if self.base64_strings: - self.suspicious_keywords.append(('Base64 Strings', - 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) - if self.dridex_strings: - self.suspicious_keywords.append(('Dridex Strings', - 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) - if self.vba_strings: - self.suspicious_keywords.append(('VBA obfuscated Strings', - 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see 
all)')) - # use a set to avoid duplicate keywords - keyword_set = set() - for keyword, description in self.autoexec_keywords: - if keyword not in keyword_set: - results.append(('AutoExec', keyword, description)) - keyword_set.add(keyword) - keyword_set = set() - for keyword, description in self.suspicious_keywords: - if keyword not in keyword_set: - results.append(('Suspicious', keyword, description)) - keyword_set.add(keyword) - keyword_set = set() - for pattern_type, value in self.iocs: - if value not in keyword_set: - results.append(('IOC', value, pattern_type)) - keyword_set.add(value) - - # include decoded strings only if they are printable or if --decode option: - for encoded, decoded in self.hex_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('Hex String', decoded, encoded)) - for encoded, decoded in self.base64_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('Base64 String', decoded, encoded)) - for encoded, decoded in self.dridex_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('Dridex string', decoded, encoded)) - for encoded, decoded in self.vba_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('VBA string', decoded, encoded)) - self.results = results - return results - - def scan_summary(self): - """ - Analyze the provided VBA code to detect suspicious keywords, - auto-executable macros, IOC patterns, obfuscation patterns - such as hex-encoded strings. 
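The per-category keyword loops above deduplicate hits with a `set` while keeping first-seen order. A small stand-alone sketch of that pattern (data values are made up):

```python
def dedup_keep_order(pairs):
    """Keep only the first occurrence of each keyword, preserving order,
    as olevba does per result category (helper name is illustrative)."""
    seen = set()
    out = []
    for keyword, description in pairs:
        if keyword not in seen:
            out.append((keyword, description))
            seen.add(keyword)
    return out

hits = [('Shell', 'May run an executable'),
        ('Shell', 'May run an executable'),
        ('CreateObject', 'May create an OLE object')]
print(dedup_keep_order(hits))
```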
- - :return: tuple with the number of items found for each category: - (autoexec, suspicious, IOCs, hex, base64, dridex, vba) - """ - # avoid scanning the same code twice: - if self.results is None: - self.scan() - return (len(self.autoexec_keywords), len(self.suspicious_keywords), - len(self.iocs), len(self.hex_strings), len(self.base64_strings), - len(self.dridex_strings), len(self.vba_strings)) - - -def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): - """ - Analyze the provided VBA code to detect suspicious keywords, - auto-executable macros, IOC patterns, obfuscation patterns - such as hex-encoded strings. - (shortcut for VBA_Scanner(vba_code).scan()) - - :param vba_code: str, VBA source code to be analyzed - :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - :return: list of tuples (type, keyword, description) - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') - """ - return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) - - -#=== CLASSES ================================================================= - -class VBA_Parser(object): - """ - Class to parse MS Office files, to detect VBA macros and extract VBA source code - Supported file formats: - - Word 97-2003 (.doc, .dot) - - Word 2007+ (.docm, .dotm) - - Word 2003 XML (.xml) - - Word MHT - Single File Web Page / MHTML (.mht) - - Excel 97-2003 (.xls) - - Excel 2007+ (.xlsm, .xlsb) - - PowerPoint 97-2003 (.ppt) - - PowerPoint 2007+ (.pptm, .ppsm) - """ - - def __init__(self, filename, data=None, container=None, relaxed=False): - """ - Constructor for VBA_Parser - - :param filename: filename or path of file to parse, or file-like object - - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). 
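The `scan_vba` shortcut wraps the keyword detection performed by `detect_autoexec` and `detect_suspicious`. A deliberately miniature sketch of that kind of scan, with a toy keyword list (these two short lists are illustrative, not olevba's real detection tables):

```python
import re

# Illustrative keyword lists only; olevba's real tables are much larger.
AUTOEXEC = ('AutoOpen', 'Document_Open', 'Workbook_Open')
SUSPICIOUS = ('Shell', 'CreateObject', 'Chr')

def mini_scan(vba_code):
    """Toy keyword scan in the spirit of scan_vba (not the real algorithm)."""
    results = []
    for kw in AUTOEXEC:
        if re.search(r'\b%s\b' % re.escape(kw), vba_code, re.IGNORECASE):
            results.append(('AutoExec', kw))
    for kw in SUSPICIOUS:
        if re.search(r'\b%s\b' % re.escape(kw), vba_code, re.IGNORECASE):
            results.append(('Suspicious', kw))
    return results

code = 'Sub AutoOpen()\n    Shell "calc.exe"\nEnd Sub'
print(mini_scan(code))
```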
- If data is provided as a bytes string, it will be parsed as the content of the file in memory, - and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). - - :param container: str, path and filename of container if the file is within - a zip archive, None otherwise. - - :param relaxed: if True, treat mal-formed documents and missing streams more like MS office: - do nothing; if False (default), raise errors in these cases - - raises a FileOpenError if all attemps to interpret the data header failed - """ - #TODO: filename should only be a string, data should be used for the file-like object - #TODO: filename should be mandatory, optional data is a string or file-like object - #TODO: also support olefile and zipfile as input - if data is None: - # open file from disk: - _file = filename - else: - # file already read in memory, make it a file-like object for zipfile: - _file = BytesIO(data) - #self.file = _file - self.ole_file = None - self.ole_subfiles = [] - self.filename = filename - self.container = container - self.relaxed = relaxed - self.type = None - self.vba_projects = None - self.vba_forms = None - self.contains_macros = None # will be set to True or False by detect_macros - self.vba_code_all_modules = None # to store the source code of all modules - # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code) - self.modules = None - # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner - self.analysis_results = None - # statistics for the scan summary and flags - self.nb_macros = 0 - self.nb_autoexec = 0 - self.nb_suspicious = 0 - self.nb_iocs = 0 - self.nb_hexstrings = 0 - self.nb_base64strings = 0 - self.nb_dridexstrings = 0 - self.nb_vbastrings = 0 - - # if filename is None: - # if isinstance(_file, basestring): - # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE: - # self.filename = _file - # else: - # self.filename = '' - # else: - # self.filename = '' - if 
olefile.isOleFile(_file): - # This looks like an OLE file - self.open_ole(_file) - - # if this worked, try whether it is a ppt file (special ole file) - self.open_ppt() - if self.type is None and zipfile.is_zipfile(_file): - # Zip file, which may be an OpenXML document - self.open_openxml(_file) - if self.type is None: - # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML, - # or a plain text file containing VBA code - if data is None: - data = open(filename, 'rb').read() - # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace - if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: - self.open_word2003xml(data) - # store a lowercase version for the next tests: - data_lowercase = data.lower() - # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): - # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line - # BUT Word accepts a blank line or other MIME headers inserted before, - # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored. - # And the line is case insensitive. - # so we'll just check the presence of mime, version and multipart anywhere: - if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \ - and b'multipart' in data_lowercase: - self.open_mht(data) - #TODO: handle exceptions - #TODO: Excel 2003 XML - # Check if this is a plain text VBA or VBScript file: - # To avoid scanning binary files, we simply check for some control chars: - if self.type is None and b'\x00' not in data: - self.open_text(data) - if self.type is None: - # At this stage, could not match a known format: - msg = '%s is not a supported file type, cannot extract VBA Macros.' 
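The constructor's format triage tries OLE first, then zip/OpenXML, then Word 2003 XML, then the MHT heuristic, then plain text. A simplified stdlib-only sketch of that cascade (the `guess_container` helper and its return labels are illustrative):

```python
import zipfile
from io import BytesIO

OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'  # standard OLE2/CFB signature

def guess_container(data):
    """Simplified sketch of olevba's format triage; order matters."""
    if data.startswith(OLE_MAGIC):
        return 'OLE'
    if zipfile.is_zipfile(BytesIO(data)):
        return 'OpenXML (zip)'
    if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
        return 'Word 2003 XML'
    low = data.lower()
    if b'mime' in low and b'version' in low and b'multipart' in low:
        return 'MHT'
    if b'\x00' not in data:  # no control chars -> maybe plain VBA/VBScript text
        return 'text'
    return 'unsupported'

# Build a tiny zip in memory to exercise the OpenXML branch:
buf = BytesIO()
with zipfile.ZipFile(buf, 'w') as z:
    z.writestr('word/vbaProject.bin', b'stub')
print(guess_container(buf.getvalue()))  # -> 'OpenXML (zip)'
```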
% self.filename - log.info(msg) - raise FileOpenError(msg) - - def open_ole(self, _file): - """ - Open an OLE file - :param _file: filename or file contents in a file object - :return: nothing - """ - log.info('Opening OLE file %s' % self.filename) - try: - # Open and parse the OLE file, using unicode for path names: - self.ole_file = olefile.OleFileIO(_file, path_encoding=None) - # set type only if parsing succeeds - self.type = TYPE_OLE - except (IOError, TypeError, ValueError) as exc: - # TODO: handle OLE parsing exceptions - log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - - - def open_openxml(self, _file): - """ - Open an OpenXML file - :param _file: filename or file contents in a file object - :return: nothing - """ - # This looks like a zip file, need to look for vbaProject.bin inside - # It can be any OLE file inside the archive - #...because vbaProject.bin can be renamed: - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 - log.info('Opening ZIP/OpenXML file %s' % self.filename) - try: - z = zipfile.ZipFile(_file) - #TODO: check if this is actually an OpenXML file - #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically - # check each file within the zip if it is an OLE file, by reading its magic: - for subfile in z.namelist(): - magic = z.open(subfile).read(len(olefile.MAGIC)) - if magic == olefile.MAGIC: - log.debug('Opening OLE file %s within zip' % subfile) - ole_data = z.open(subfile).read() - try: - self.ole_subfiles.append( - VBA_Parser(filename=subfile, data=ole_data, - relaxed=self.relaxed)) - except OlevbaBaseException as exc: - if self.relaxed: - log.info('%s is not a valid OLE file (%s)' % (subfile, exc)) - log.debug('Trace:', exc_info=True) - continue - else: - raise SubstreamOpenError(self.filename, subfile, - exc) - z.close() - # set type only if parsing succeeds - self.type = TYPE_OpenXML 
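`open_openxml` checks every zip member's magic bytes because `vbaProject.bin` can be renamed. A self-contained sketch of that member scan (the `find_ole_members` helper and the file names are illustrative):

```python
import zipfile
from io import BytesIO

OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def find_ole_members(zip_bytes):
    """List zip members whose first bytes match the OLE signature,
    as open_openxml does (helper name is illustrative)."""
    found = []
    with zipfile.ZipFile(BytesIO(zip_bytes)) as z:
        for name in z.namelist():
            with z.open(name) as fh:
                if fh.read(len(OLE_MAGIC)) == OLE_MAGIC:
                    found.append(name)
    return found

buf = BytesIO()
with zipfile.ZipFile(buf, 'w') as z:
    z.writestr('word/document.xml', b'<w:document/>')
    z.writestr('word/renamed.bin', OLE_MAGIC + b'\x00' * 8)  # renamed VBA project
print(find_ole_members(buf.getvalue()))  # -> ['word/renamed.bin']
```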
- except OlevbaBaseException as exc: - if self.relaxed: - log.info('Error {0} caught in Zip/OpenXML parsing for file {1}' - .format(exc, self.filename)) - log.debug('Trace:', exc_info=True) - else: - raise - except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc: - # TODO: handle parsing exceptions - log.info('Failed Zip/OpenXML parsing for file %r (%s)' - % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - - def open_word2003xml(self, data): - """ - Open a Word 2003 XML file - :param data: file contents in a string or bytes - :return: nothing - """ - log.info('Opening Word 2003 XML file %s' % self.filename) - try: - # parse the XML content - # TODO: handle XML parsing exceptions - et = ET.fromstring(data) - # find all the binData elements: - for bindata in et.getiterator(TAG_BINDATA): - # the binData content is an OLE container for the VBA project, compressed - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. - # get the filename: - fname = bindata.get(ATTR_NAME, 'noname.mso') - # decode the base64 activemime - mso_data = binascii.a2b_base64(bindata.text) - if is_mso_file(mso_data): - # decompress the zlib data stored in the MSO file, which is the OLE container: - # TODO: handle different offsets => separate function - try: - ole_data = mso_file_extract(mso_data) - self.ole_subfiles.append( - VBA_Parser(filename=fname, data=ole_data, - relaxed=self.relaxed)) - except OlevbaBaseException as exc: - if self.relaxed: - log.info('Error parsing subfile {0}: {1}' - .format(fname, exc)) - log.debug('Trace:', exc_info=True) - else: - raise SubstreamOpenError(self.filename, fname, exc) - else: - log.info('%s is not a valid MSO file' % fname) - # set type only if parsing succeeds - self.type = TYPE_Word2003_XML - except OlevbaBaseException as exc: - if self.relaxed: - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - else: - raise - except Exception as exc: 
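`open_word2003xml` walks the `binData` elements and base64-decodes their text into an ActiveMime/MSO container. A minimal sketch with a hypothetical one-element document (the payload here is a plain string standing in for a real MSO container):

```python
import binascii
import xml.etree.ElementTree as ET

NS = 'http://schemas.microsoft.com/office/word/2003/wordml'

# Hypothetical minimal WordProcessingML fragment; a real file's binData
# holds a base64-encoded ActiveMime/MSO container.
XML = ('<doc xmlns:w="%s">'
       '<w:binData w:name="editdata.mso">aGVsbG8gbXNv</w:binData>'
       '</doc>' % NS)

root = ET.fromstring(XML)
for bindata in root.iter('{%s}binData' % NS):
    name = bindata.get('{%s}name' % NS, 'noname.mso')
    payload = binascii.a2b_base64(bindata.text)
    print(name, payload)
```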
- # TODO: differentiate exceptions for each parsing stage - # (but ET is different libs, no good exception description in API) - # found: XMLSyntaxError - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - - def open_mht(self, data): - """ - Open a MHTML file - :param data: file contents in a string or bytes - :return: nothing - """ - log.info('Opening MHTML file %s' % self.filename) - try: - if isinstance(data,bytes): - data = data.decode('utf8', 'replace') - # parse the MIME content - # remove any leading whitespace or newline (workaround for issue in email package) - stripped_data = data.lstrip('\r\n\t ') - # strip any junk from the beginning of the file - # (issue #31 fix by Greg C - gdigreg) - # TODO: improve keywords to avoid false positives - mime_offset = stripped_data.find('MIME') - content_offset = stripped_data.find('Content') - # if "MIME" is found, and located before "Content": - if -1 < mime_offset <= content_offset: - stripped_data = stripped_data[mime_offset:] - # else if "Content" is found, and before "MIME" - # TODO: can it work without "MIME" at all? - elif content_offset > -1: - stripped_data = stripped_data[content_offset:] - # TODO: quick and dirty fix: insert a standard line with MIME-Version header? - mhtml = email.message_from_string(stripped_data) - # find all the attached files: - for part in mhtml.walk(): - content_type = part.get_content_type() # always returns a value - fname = part.get_filename(None) # returns None if it fails - # TODO: get content-location if no filename - log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type)) - part_data = part.get_payload(decode=True) - # VBA macros are stored in a binary file named "editdata.mso". - # the data content is an OLE container for the VBA project, compressed - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. 
- # decompress the zlib data starting at offset 0x32, which is the OLE container: - # check ActiveMime header: - - if (isinstance(part_data, str) or isinstance(part_data, bytes)) and is_mso_file(part_data): - log.debug('Found ActiveMime header, decompressing MSO container') - try: - ole_data = mso_file_extract(part_data) - - # TODO: check if it is actually an OLE file - # TODO: get the MSO filename from content_location? - self.ole_subfiles.append( - VBA_Parser(filename=fname, data=ole_data, - relaxed=self.relaxed)) - except OlevbaBaseException as exc: - if self.relaxed: - log.info('%s does not contain a valid OLE file (%s)' - % (fname, exc)) - log.debug('Trace:', exc_info=True) - # TODO: bug here - need to split in smaller functions/classes? - else: - raise SubstreamOpenError(self.filename, fname, exc) - else: - log.debug('type(part_data) = %s' % type(part_data)) - try: - log.debug('part_data[0:20] = %r' % part_data[0:20]) - except TypeError as err: - log.debug('part_data has no __getitem__') - # set type only if parsing succeeds - self.type = TYPE_MHTML - except OlevbaBaseException: - raise - except Exception: - log.info('Failed MIME parsing for file %r - %s' - % (self.filename, MSG_OLEVBA_ISSUES)) - log.debug('Trace:', exc_info=True) - - def open_ppt(self): - """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser - - Although self.ole_file is a valid olefile.OleFileIO, we set - self.ole_file = None in here and instead set self.ole_subfiles to the - VBA ole streams found within the main ole file. 
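`open_mht` hands the (trimmed) data to the `email` package and walks the MIME parts looking for the MSO attachment. A self-contained sketch with a hypothetical two-part message (the boundary, part names, and payload are made up; a real Word MHT carries an ActiveMime/MSO part, typically `editdata.mso`):

```python
import email

RAW = (
    'MIME-Version: 1.0\n'
    'Content-Type: multipart/related; boundary="BOUND"\n'
    '\n'
    '--BOUND\n'
    'Content-Type: text/html\n'
    '\n'
    '<html></html>\n'
    '--BOUND\n'
    'Content-Type: application/octet-stream\n'
    'Content-Location: editdata.mso\n'
    'Content-Transfer-Encoding: base64\n'
    '\n'
    'QWN0aXZlTWltZQ==\n'
    '--BOUND--\n'
)

msg = email.message_from_string(RAW)
blobs = []
for part in msg.walk():
    if part.get_content_type() == 'application/octet-stream':
        # decode=True applies the Content-Transfer-Encoding (base64 here)
        blobs.append(part.get_payload(decode=True))
print(blobs)
```

Note that real MSO parts begin with the `ActiveMime` header, which is what `is_mso_file` checks before attempting zlib decompression.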
That makes most of the - code below treat this like an OpenXML file and only look at the - ole_subfiles (except find_vba_* which needs to explicitly check for - self.type) - """ - - log.info('Check whether OLE file is PPT') - ppt_parser.enable_logging() - try: - ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) - for vba_data in ppt.iter_vba_data(): - self.ole_subfiles.append(VBA_Parser(None, vba_data, - container='PptParser')) - log.info('File is PPT') - self.ole_file.close() # just in case - self.ole_file = None # required to make other methods look at ole_subfiles - self.type = TYPE_PPT - except Exception as exc: - if self.container == 'PptParser': - # this is a subfile of a ppt --> to be expected that is no ppt - log.debug('PPT subfile is not a PPT file') - else: - log.debug("File appears not to be a ppt file (%s)" % exc) - - - def open_text(self, data): - """ - Open a text file containing VBA or VBScript source code - :param data: file contents in a string or bytes - :return: nothing - """ - log.info('Opening text file %s' % self.filename) - # directly store the source code: - if isinstance(data,bytes): - data=data.decode('utf8','replace') - self.vba_code_all_modules = data - self.contains_macros = True - # set type only if parsing succeeds - self.type = TYPE_TEXT - - - def find_vba_projects(self): - """ - Finds all the VBA projects stored in an OLE file. - - Return None if the file is not OLE but OpenXML. - Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. - vba_root is the path of the root OLE storage containing the VBA project, - including a trailing slash unless it is the root of the OLE file. - project_path is the path of the OLE stream named "PROJECT" within the VBA project. - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. - - If this function returns an empty list for one of the supported formats - (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros. 
- - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) - for each VBA project found if OLE file - """ - log.debug('VBA_Parser.find_vba_projects') - - # if the file is not OLE but OpenXML, return None: - if self.ole_file is None and self.type != TYPE_PPT: - return None - - # if this method has already been called, return previous result: - if self.vba_projects is not None: - return self.vba_projects - - # if this is a ppt file (PowerPoint 97-2003): - # self.ole_file is None but the ole_subfiles do contain vba_projects - # (like for OpenXML files). - if self.type == TYPE_PPT: - # TODO: so far, this function is never called for PPT files, but - # if that happens, the information is lost which ole file contains - # which storage! - log.warning('Returned info is not complete for PPT types!') - self.vba_projects = [] - for subfile in self.ole_subfiles: - self.vba_projects.extend(subfile.find_vba_projects()) - return self.vba_projects - - # Find the VBA project root (different in MS Word, Excel, etc): - # - Word 97-2003: Macros - # - Excel 97-2003: _VBA_PROJECT_CUR - # - PowerPoint 97-2003: PptParser has identified ole_subfiles - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. 
- # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word - # - Visio 2007: not supported yet (different file structure) - - # According to MS-OVBA section 2.2.1: - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream - # - all names are case-insensitive - - def check_vba_stream(ole, vba_root, stream_path): - full_path = vba_root + stream_path - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: - log.debug('Found %s stream: %s' % (stream_path, full_path)) - return full_path - else: - log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) - return False - - # start with an empty list: - self.vba_projects = [] - # Look for any storage containing those storage/streams: - ole = self.ole_file - for storage in ole.listdir(streams=False, storages=True): - log.debug('Checking storage %r' % storage) - # Look for a storage ending with "VBA": - if storage[-1].upper() == 'VBA': - log.debug('Found VBA storage: %s' % ('/'.join(storage))) - vba_root = '/'.join(storage[:-1]) - # Add a trailing slash to vba_root, unless it is the root of the OLE file: - # (used later to append all the child streams/storages) - if vba_root != '': - vba_root += '/' - log.debug('Checking vba_root="%s"' % vba_root) - - # Check if the VBA root storage also contains a PROJECT stream: - project_path = check_vba_stream(ole, vba_root, 'PROJECT') - if not project_path: continue - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') - if not vba_project_path: continue - # Check if the VBA root storage also contains a VBA/dir stream: - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') - if not dir_path: continue - # Now we are pretty sure it is a VBA project 
structure - log.debug('VBA root storage: "%s"' % vba_root) - # append the results to the list as a tuple for later use: - self.vba_projects.append((vba_root, project_path, dir_path)) - return self.vba_projects - - def detect_vba_macros(self): - """ - Detect the potential presence of VBA macros in the file, by checking - if it contains VBA projects. Both OLE and OpenXML files are supported. - - Important: for now, results are accurate only for Word, Excel and PowerPoint - - Note: this method does NOT attempt to check the actual presence or validity - of VBA macro source code, so there might be false positives. - It may also detect VBA macros in files embedded within the main file, - for example an Excel workbook with macros embedded into a Word - document without macros may be detected, without distinction. - - :return: bool, True if at least one VBA project has been found, False otherwise - """ - #TODO: return None or raise exception if format not supported - #TODO: return the number of VBA projects found instead of True/False? 
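The MS-OVBA checks above (a storage ending in `VBA` plus the `PROJECT`, `VBA/_VBA_PROJECT` and `VBA/dir` streams) can be sketched against a dict-based stand-in for an OLE directory listing; the stream paths below are illustrative:

```python
# Toy stand-in for an OLE file's stream listing (paths are illustrative):
streams = {
    'Macros/PROJECT',
    'Macros/VBA/_VBA_PROJECT',
    'Macros/VBA/dir',
    'Macros/VBA/ThisDocument',
}
storages = [['Macros', 'VBA']]

projects = []
for storage in storages:
    if storage[-1].upper() == 'VBA':
        # vba_root is the parent storage, with a trailing slash unless
        # it is the root of the OLE file:
        vba_root = '/'.join(storage[:-1])
        if vba_root:
            vba_root += '/'
        required = tuple(vba_root + s
                         for s in ('PROJECT', 'VBA/_VBA_PROJECT', 'VBA/dir'))
        if all(path in streams for path in required):
            projects.append((vba_root,) + required)
print(projects)
```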
- # if this method was already called, return the previous result: - if self.contains_macros is not None: - return self.contains_macros - # if OpenXML/PPT, check all the OLE subfiles: - if self.ole_file is None: - for ole_subfile in self.ole_subfiles: - if ole_subfile.detect_vba_macros(): - self.contains_macros = True - return True - # otherwise, no macro found: - self.contains_macros = False - return False - # otherwise it's an OLE file, find VBA projects: - vba_projects = self.find_vba_projects() - if len(vba_projects) == 0: - self.contains_macros = False - else: - self.contains_macros = True - # Also look for VBA code in any stream including orphans - # (happens in some malformed files) - ole = self.ole_file - for sid in range(len(ole.direntries)): - # check if id is already done above: - log.debug('Checking DirEntry #%d' % sid) - d = ole.direntries[sid] - if d is None: - # this direntry is not part of the tree: either unused or an orphan - d = ole._load_direntry(sid) - log.debug('This DirEntry is an orphan or unused') - if d.entry_type == olefile.STGTY_STREAM: - # read data - log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size)) - try: - data = ole._open(d.isectStart, d.size).read() - log.debug('Read %d bytes' % len(data)) - if len(data) > 200: - log.debug('%r...[much more data]...%r' % (data[:100], data[-50:])) - else: - log.debug(repr(data)) - if 'Attribut' in data.decode('utf-8','ignore'): - log.debug('Found VBA compressed code') - self.contains_macros = True - except IOError as exc: - if self.relaxed: - log.info('Error when reading OLE Stream %r' % d.name) - log.debug('Trace:', exc_trace=True) - else: - raise SubstreamOpenError(self.filename, d.name, exc) - return self.contains_macros - - def extract_macros(self): - """ - Extract and decompress source code for each VBA macro found in the file - - Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found - If the file is OLE, filename is the path of the 
file. - If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros - within the zip archive, e.g. word/vbaProject.bin. - If the file is PPT, result is as for OpenXML but filename is useless - """ - log.debug('extract_macros:') - if self.ole_file is None: - # This may be either an OpenXML/PPT or a text file: - if self.type == TYPE_TEXT: - # This is a text file, yield the full code: - yield (self.filename, '', self.filename, self.vba_code_all_modules) - else: - # OpenXML/PPT: recursively yield results from each OLE subfile: - for ole_subfile in self.ole_subfiles: - for results in ole_subfile.extract_macros(): - yield results - else: - # This is an OLE file: - self.find_vba_projects() - # set of stream ids - vba_stream_ids = set() - for vba_root, project_path, dir_path in self.vba_projects: - # extract all VBA macros from that VBA root storage: - for stream_path, vba_filename, vba_code in \ - _extract_vba(self.ole_file, vba_root, project_path, - dir_path, self.relaxed): - # store direntry ids in a set: - vba_stream_ids.add(self.ole_file._find(stream_path)) - yield (self.filename, stream_path, vba_filename, vba_code) - # Also look for VBA code in any stream including orphans - # (happens in some malformed files) - ole = self.ole_file - for sid in range(len(ole.direntries)): - # check if id is already done above: - log.debug('Checking DirEntry #%d' % sid) - if sid in vba_stream_ids: - log.debug('Already extracted') - continue - d = ole.direntries[sid] - if d is None: - # this direntry is not part of the tree: either unused or an orphan - d = ole._load_direntry(sid) - log.debug('This DirEntry is an orphan or unused') - if d.entry_type == olefile.STGTY_STREAM: - # read data - log.debug('Reading data from stream %r' % d.name) - data = ole._open(d.isectStart, d.size).read() - for match in re.finditer(b'\\x00Attribut[^e]', data, flags=re.IGNORECASE): - start = match.start() - 3 - log.debug('Found VBA compressed code at index %X' % start) - 
compressed_code = data[start:] - try: - vba_code = decompress_stream(compressed_code) - yield (self.filename, d.name, d.name, vba_code) - except Exception as exc: - # display the exception with full stack trace for debugging - log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc)) - log.debug('Traceback:', exc_info=True) - # do not raise the error, as it is unlikely to be a compressed macro stream - - def extract_all_macros(self): - """ - Extract and decompress source code for each VBA macro found in the file - by calling extract_macros(), store the results as a list of tuples - (filename, stream_path, vba_filename, vba_code) in self.modules. - See extract_macros for details. - """ - if self.modules is None: - self.modules = [] - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_macros(): - self.modules.append((subfilename, stream_path, vba_filename, vba_code)) - self.nb_macros = len(self.modules) - return self.modules - - - - def analyze_macros(self, show_decoded_strings=False, deobfuscate=False): - """ - runs extract_macros and analyze the source code of all VBA macros - found in the file. - """ - if self.detect_vba_macros(): - # if the analysis was already done, avoid doing it twice: - if self.analysis_results is not None: - return self.analysis_results - # variable to merge source code from all modules: - if self.vba_code_all_modules is None: - self.vba_code_all_modules = '' - for (_, _, _, vba_code) in self.extract_all_macros(): - #TODO: filter code? 
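The orphan-stream scan above relies on the `\x00Attribut[^e]` signature that compressed VBA modules leave in raw stream data, rewinding 3 bytes to the start of the compressed chunk. A stand-alone sketch of that signature search over made-up bytes:

```python
import re

# Sample bytes: one compressed-VBA-style signature (followed by a non-'e'
# byte) and one plain "Attribute" that must NOT match.
data = b'junk\x01\x02\x00AttributX more bytes \x00Attribute not matched'

offsets = []
for match in re.finditer(b'\x00Attribut[^e]', data):
    # olevba backs up 3 bytes to include the compressed chunk header:
    offsets.append(match.start() - 3)
print(offsets)  # -> [3]
```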
(each module) - self.vba_code_all_modules += vba_code.decode('utf-8', 'ignore') + '\n' - for (_, _, form_string) in self.extract_form_strings(): - self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n' - # Analyze the whole code at once: - scanner = VBA_Scanner(self.vba_code_all_modules) - self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate) - autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary() - self.nb_autoexec += autoexec - self.nb_suspicious += suspicious - self.nb_iocs += iocs - self.nb_hexstrings += hexstrings - self.nb_base64strings += base64strings - self.nb_dridexstrings += dridex - self.nb_vbastrings += vbastrings - - return self.analysis_results - - - def reveal(self): - # we only want printable strings: - analysis = self.analyze_macros(show_decoded_strings=False) - # to avoid replacing short strings contained into longer strings, we sort the analysis results - # based on the length of the encoded string, in reverse order: - analysis = sorted(analysis, key=lambda type_decoded_encoded: len(type_decoded_encoded[2]), reverse=True) - # normally now self.vba_code_all_modules contains source code from all modules - deobf_code = self.vba_code_all_modules - for kw_type, decoded, encoded in analysis: - if kw_type == 'VBA string': - #print '%3d occurences: %r => %r' % (deobf_code.count(encoded), encoded, decoded) - # need to add double quotes around the decoded strings - # after escaping double-quotes as double-double-quotes for VBA: - decoded = decoded.replace('"', '""') - deobf_code = deobf_code.replace(encoded, '"%s"' % decoded) - return deobf_code - #TODO: repasser l'analyse plusieurs fois si des chaines hex ou base64 sont revelees - - - def find_vba_forms(self): - """ - Finds all the VBA forms stored in an OLE file. - - Return None if the file is not OLE but OpenXML. - Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. 
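`reveal()`'s substitution pass sorts by encoded length, descending, so that a short encoded string embedded in a longer one is not replaced first. A minimal sketch with a hypothetical pair of `Chr()` expressions:

```python
# Hypothetical analysis results: (type, decoded, encoded) tuples, as
# produced by analyze_macros for 'VBA string' items.
analysis = [
    ('VBA string', 'calc', 'Chr(99)&Chr(97)&Chr(108)&Chr(99)'),
    ('VBA string', 'c', 'Chr(99)'),
]
# Longest encoded form first, so 'Chr(99)' does not clobber the long one:
analysis.sort(key=lambda t: len(t[2]), reverse=True)

code = 'Shell Chr(99)&Chr(97)&Chr(108)&Chr(99)'
for kw_type, decoded, encoded in analysis:
    if kw_type == 'VBA string':
        decoded = decoded.replace('"', '""')  # escape quotes VBA-style
        code = code.replace(encoded, '"%s"' % decoded)
print(code)  # -> 'Shell "calc"'
```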
- vba_root is the path of the root OLE storage containing the VBA project, - including a trailing slash unless it is the root of the OLE file. - project_path is the path of the OLE stream named "PROJECT" within the VBA project. - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. - - If this function returns an empty list for one of the supported formats - (i.e. Word, Excel, Powerpoint), then the file does not contain VBA forms. - - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) - for each VBA project found if OLE file - """ - log.debug('VBA_Parser.find_vba_forms') - - # if the file is not OLE but OpenXML, return None: - if self.ole_file is None and self.type != TYPE_PPT: - return None - - # if this method has already been called, return previous result: - # if self.vba_projects is not None: - # return self.vba_projects - - # According to MS-OFORMS section 2.1.2 Control Streams: - # - A parent control, that is, a control that can contain embedded controls, - # MUST be persisted as a storage that contains multiple streams. - # - All parent controls MUST contain a FormControl. The FormControl - # properties are persisted to a stream (1) as specified in section 2.1.1.2. - # The name of this stream (1) MUST be "f". - # - Embedded controls that cannot themselves contain other embedded - # controls are persisted sequentially as FormEmbeddedActiveXControls - # to a stream (1) contained in the same storage as the parent control. - # The name of this stream (1) MUST be "o". - # - all names are case-insensitive - - if self.type == TYPE_PPT: - # TODO: so far, this function is never called for PPT files, but - # if that happens, the information is lost which ole file contains - # which storage! 
- ole_files = self.ole_subfiles - log.warning('Returned info is not complete for PPT types!') - else: - ole_files = [self.ole_file, ] - - # start with an empty list: - self.vba_forms = [] - - # Loop over ole streams - for ole in ole_files: - # Look for any storage containing those storage/streams: - for storage in ole.listdir(streams=False, storages=True): - log.debug('Checking storage %r' % storage) - # Look for two streams named 'o' and 'f': - o_stream = storage + ['o'] - f_stream = storage + ['f'] - log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream)) - if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \ - and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM: - form_path = '/'.join(storage) - log.debug('Found VBA Form: %r' % form_path) - self.vba_forms.append(storage) - return self.vba_forms - - def extract_form_strings(self): - """ - Extract printable strings from each VBA Form found in the file - - Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found - If the file is OLE, filename is the path of the file. - If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros - within the zip archive, e.g. word/vbaProject.bin. 
- If the file is PPT, result is as for OpenXML but filename is useless - """ - if self.ole_file is None: - # This may be either an OpenXML/PPT or a text file: - if self.type == TYPE_TEXT: - # This is a text file, return no results: - return - else: - # OpenXML/PPT: recursively yield results from each OLE subfile: - for ole_subfile in self.ole_subfiles: - for results in ole_subfile.extract_form_strings(): - yield results - else: - # This is an OLE file: - self.find_vba_forms() - ole = self.ole_file - for form_storage in self.vba_forms: - o_stream = form_storage + ['o'] - log.debug('Opening form object stream %r' % '/'.join(o_stream)) - form_data = ole.openstream(o_stream).read() - # Extract printable strings from the form object stream "o": - for m in re_printable_string.finditer(form_data): - log.debug('Printable string found in form: %r' % m.group()) - yield (self.filename, '/'.join(o_stream), m.group()) - - - def close(self): - """ - Close all the open files. This method must be called after usage, if - the application is opening many files. - """ - if self.ole_file is None: - if self.ole_subfiles is not None: - for ole_subfile in self.ole_subfiles: - ole_subfile.close() - else: - self.ole_file.close() - - - -class VBA_Parser_CLI(VBA_Parser): - """ - VBA parser and analyzer, adding methods for the command line interface - of olevba. (see VBA_Parser) - """ - - def __init__(self, *args, **kwargs): - """ - Constructor for VBA_Parser_CLI. - Calls __init__ from VBA_Parser with all arguments --> see doc there - """ - super(VBA_Parser_CLI, self).__init__(*args, **kwargs) - - - def print_analysis(self, show_decoded_strings=False, deobfuscate=False): - """ - Analyze the provided VBA code, and print the results in a table - - :param vba_code: str, VBA source code to be analyzed - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. 
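`extract_form_strings` pulls printable runs out of each form's `o` stream with a precompiled regex. A stand-in sketch (the pattern below, runs of at least 4 printable ASCII bytes, is an assumption; olevba's actual `re_printable_string` may differ, and the stream bytes are made up):

```python
import re

# Assumed stand-in for olevba's re_printable_string:
re_printable = re.compile(rb'[\x20-\x7e]{4,}')

# Fabricated "o" stream contents: binary noise around two readable strings.
form_data = (b'\x00\x01Click here to enable content'
             b'\x00\x05hxxp://example[.]com\x02')

strings = [m.group() for m in re_printable.finditer(form_data)]
print(strings)
```

Form strings often carry social-engineering text or defanged-looking IOCs, which is why olevba feeds them into the same analysis pass as the macro code.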
- :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - :return: None - """ - # print a waiting message only if the output is not redirected to a file: - if sys.stdout.isatty(): - print('Analysis...\r') - sys.stdout.flush() - results = self.analyze_macros(show_decoded_strings, deobfuscate) - if results: - t = prettytable.PrettyTable(('Type', 'Keyword', 'Description')) - t.align = 'l' - t.max_width['Type'] = 10 - t.max_width['Keyword'] = 20 - t.max_width['Description'] = 39 - for kw_type, keyword, description in results: - # handle non printable strings: - if not is_printable(keyword): - keyword = repr(keyword) - if not is_printable(description): - description = repr(description) - t.add_row((kw_type, keyword, description)) - print(t) - else: - print('No suspicious keyword or IOC found.') - - def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False): - """ - Analyze the provided VBA code, and return the results in json format - - :param vba_code: str, VBA source code to be analyzed - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - - :return: dict - """ - # print a waiting message only if the output is not redirected to a file: - if sys.stdout.isatty(): - print('Analysis...\r') - sys.stdout.flush() - return [dict(type=kw_type, keyword=keyword, description=description) - for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)] - - def process_file(self, show_decoded_strings=False, - display_code=True, hide_attributes=True, - vba_code_only=False, show_deobfuscated_code=False, - deobfuscate=False): - """ - Process a single file - - :param filename: str, path and filename of file on disk, or within the container. - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. 
- :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. - :param display_code: bool, if False VBA source code is not displayed (default True) - :param global_analysis: bool, if True all modules are merged for a single analysis (default), - otherwise each module is analyzed separately (old behaviour) - :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - """ - #TODO: replace print by writing to a provided output file (sys.stdout by default) - # fix conflicting parameters: - if vba_code_only and not display_code: - display_code = True - if self.container: - display_filename = '%s in %s' % (self.filename, self.container) - else: - display_filename = self.filename - print('=' * 79) - print('FILE:', display_filename) - try: - #TODO: handle olefile errors, when an OLE file is malformed - print('Type: %s' % self.type) - if self.detect_vba_macros(): - #print 'Contains VBA Macros:' - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): - if hide_attributes: - # hide attribute lines: - if isinstance(vba_code,bytes): - vba_code =vba_code.decode('utf-8','replace') - vba_code_filtered = filter_vba(vba_code) - else: - vba_code_filtered = vba_code - print('-' * 79) - print('VBA MACRO %s ' % vba_filename) - print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))) - if display_code: - print('- ' * 39) - # detect empty macros: - if vba_code_filtered.strip() == '': - print('(empty macro)') - else: - print(vba_code_filtered) - for (subfilename, stream_path, form_string) in self.extract_form_strings(): - print('-' * 79) - print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)) - print('- ' * 39) - print(form_string.decode('utf-8', 'ignore')) - if not vba_code_only: - # analyse the code from all modules at once: - 
self.print_analysis(show_decoded_strings, deobfuscate) - if show_deobfuscated_code: - print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n') - print(self.reveal()) - else: - print('No VBA macros found.') - except OlevbaBaseException: - raise - except Exception as exc: - # display the exception with full stack trace for debugging - log.info('Error processing file %s (%s)' % (self.filename, exc)) - log.debug('Traceback:', exc_info=True) - raise ProcessingError(self.filename, exc) - print('') - - - def process_file_json(self, show_decoded_strings=False, - display_code=True, hide_attributes=True, - vba_code_only=False, show_deobfuscated_code=False, - deobfuscate=False): - """ - Process a single file - - every "show" or "print" here is to be translated as "add to json" - - :param filename: str, path and filename of file on disk, or within the container. - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. - :param display_code: bool, if False VBA source code is not displayed (default True) - :param global_analysis: bool, if True all modules are merged for a single analysis (default), - otherwise each module is analyzed separately (old behaviour) - :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - """ - #TODO: fix conflicting parameters (?) 
- - if vba_code_only and not display_code: - display_code = True - - result = {} - - if self.container: - result['container'] = self.container - else: - result['container'] = None - result['file'] = self.filename - result['json_conversion_successful'] = False - result['analysis'] = None - result['code_deobfuscated'] = None - result['do_deobfuscate'] = deobfuscate - - try: - #TODO: handle olefile errors, when an OLE file is malformed - result['type'] = self.type - macros = [] - if self.detect_vba_macros(): - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): - curr_macro = {} - if hide_attributes: - # hide attribute lines: - vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace')) - else: - vba_code_filtered = vba_code - - curr_macro['vba_filename'] = vba_filename - curr_macro['subfilename'] = subfilename - curr_macro['ole_stream'] = stream_path - if display_code: - curr_macro['code'] = vba_code_filtered.strip() - else: - curr_macro['code'] = None - macros.append(curr_macro) - if not vba_code_only: - # analyse the code from all modules at once: - result['analysis'] = self.print_analysis_json(show_decoded_strings, - deobfuscate) - if show_deobfuscated_code: - result['code_deobfuscated'] = self.reveal() - result['macros'] = macros - result['json_conversion_successful'] = True - except Exception as exc: - # display the exception with full stack trace for debugging - log.info('Error processing file %s (%s)' % (self.filename, exc)) - log.debug('Traceback:', exc_info=True) - raise ProcessingError(self.filename, exc) - - return result - - - def process_file_triage(self, show_decoded_strings=False, deobfuscate=False): - """ - Process a file in triage mode, showing only summary results on one line. 
- """ - #TODO: replace print by writing to a provided output file (sys.stdout by default) - try: - #TODO: handle olefile errors, when an OLE file is malformed - if self.detect_vba_macros(): - # print a waiting message only if the output is not redirected to a file: - if sys.stdout.isatty(): - print('Analysis...\r') - sys.stdout.flush() - self.analyze_macros(show_decoded_strings=show_decoded_strings, - deobfuscate=deobfuscate) - flags = TYPE2TAG[self.type] - macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-' - if self.contains_macros: macros = 'M' - if self.nb_autoexec: autoexec = 'A' - if self.nb_suspicious: suspicious = 'S' - if self.nb_iocs: iocs = 'I' - if self.nb_hexstrings: hexstrings = 'H' - if self.nb_base64strings: base64obf = 'B' - if self.nb_dridexstrings: dridex = 'D' - if self.nb_vbastrings: vba_obf = 'V' - flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, - base64obf, dridex, vba_obf) - - line = '%-12s %s' % (flags, self.filename) - print(line) - - # old table display: - # macros = autoexec = suspicious = iocs = hexstrings = 'no' - # if nb_macros: macros = 'YES:%d' % nb_macros - # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec - # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious - # if nb_iocs: iocs = 'YES:%d' % nb_iocs - # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings - # # 2nd line = info - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings) - except Exception as exc: - # display the exception with full stack trace for debugging only - log.debug('Error processing file %s (%s)' % (self.filename, exc), - exc_info=True) - raise ProcessingError(self.filename, exc) - - - # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), - # header=False, border=False) - # t.align = 'l' - # t.max_width['filename'] = 30 - # t.max_width['type'] = 10 - # t.max_width['macros'] = 6 - # 
t.max_width['autoexec'] = 6 - # t.max_width['suspicious'] = 6 - # t.max_width['ioc'] = 6 - # t.max_width['hexstrings'] = 6 - # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) - # print t - - -#=== MAIN ===================================================================== - -def main(): - """ - Main function, called when olevba is run from the command line - """ - DEFAULT_LOG_LEVEL = "warning" # Default log level - LOG_LEVELS = { - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL - } - - usage = 'usage: %prog [options] <filename> [filename2 ...]' - parser = optparse.OptionParser(usage=usage) - # parser.add_option('-o', '--outfile', dest='outfile', - # help='output file') - # parser.add_option('-c', '--csv', dest='csv', - # help='export results to a CSV file') - parser.add_option("-r", action="store_true", dest="recursive", - help='find files recursively in subdirectories.') - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, - help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') - # output mode; could make this even simpler with add_option(type='choice') but that would make - # cmd line interface incompatible... 
- modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)') - modes.add_option("-t", '--triage', action="store_const", dest="output_mode", - const='triage', default='unspecified', - help='triage mode, display results as a summary table (default for multiple files)') - modes.add_option("-d", '--detailed', action="store_const", dest="output_mode", - const='detailed', default='unspecified', - help='detailed mode, display full results (default for single file)') - modes.add_option("-j", '--json', action="store_const", dest="output_mode", - const='json', default='unspecified', - help='json mode, detailed in json format (never default)') - parser.add_option_group(modes) - parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True, - help='display only analysis results, not the macro source code') - parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False, - help='display only VBA source code, do not analyze it') - parser.add_option("--decode", action="store_true", dest="show_decoded_strings", - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).') - parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True, - help='display the attribute lines at the beginning of VBA source code') - parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code", - help='display the macro source code after replacing all the obfuscated strings by their decoded content.') - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%default)") - parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False, - help="Attempt to deobfuscate VBA expressions (slow)") - parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False, - help="Do not raise errors if 
opening of substream fails") - - (options, args) = parser.parse_args() - - # Print help if no arguments are passed - if len(args) == 0: - print(__doc__) - parser.print_help() - sys.exit(RETURN_WRONG_ARGS) - - # provide info about tool and its version - if options.output_mode == 'json': - # prints opening [ - print_json(script_name='olevba', version=__version__, - url='http://decalage.info/python/oletools', - type='MetaInformation') - else: - print('olevba %s - http://decalage.info/python/oletools' % __version__) - - logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') - # enable logging in the modules: - log.setLevel(logging.NOTSET) - - # Old display with number of items detected: - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr') - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7) - - # with the option --reveal, make sure --deobf is also enabled: - if options.show_deobfuscated_code and not options.deobfuscate: - log.info('set --deobf because --reveal was set') - options.deobfuscate = True - if options.output_mode == 'triage' and options.show_deobfuscated_code: - log.info('ignoring option --reveal in triage output mode') - - # Column headers (do not know how many files there will be yet, so if no output_mode - # was specified, we will print triage for first file --> need these headers) - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %-65s' % ('Flags', 'Filename')) - print('%-12s %-65s' % ('-' * 11, '-' * 65)) - - previous_container = None - count = 0 - container = filename = data = None - vba_parser = None - return_code = RETURN_OK - try: - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, - zip_password=options.zip_password, zip_fname=options.zip_fname): - # ignore directory names stored in zip files: - if container and filename.endswith('/'): - continue - - # handle errors from xglob 
- if isinstance(data, Exception): - if isinstance(data, PathNotFoundException): - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %s - File not found' % ('?', filename)) - elif options.output_mode != 'json': - log.error('Given path %r does not exist!' % filename) - return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ - else RETURN_SEVERAL_ERRS - else: - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container)) - elif options.output_mode != 'json': - log.error('Exception opening/reading %r from zip file %r: %s' - % (filename, container, data)) - return_code = RETURN_XGLOB_ERR if return_code == 0 \ - else RETURN_SEVERAL_ERRS - if options.output_mode == 'json': - print_json(file=filename, type='error', - error=type(data).__name__, message=str(data)) - continue - - try: - # Open the file - vba_parser = VBA_Parser_CLI(filename, data=data, container=container, - relaxed=options.relaxed) - - if options.output_mode == 'detailed': - # fully detailed output - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, - display_code=options.display_code, - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, - show_deobfuscated_code=options.show_deobfuscated_code, - deobfuscate=options.deobfuscate) - elif options.output_mode in ('triage', 'unspecified'): - # print container name when it changes: - if container != previous_container: - if container is not None: - print('\nFiles in %s:' % container) - previous_container = container - # summarized output for triage: - vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, - deobfuscate=options.deobfuscate) - elif options.output_mode == 'json': - print_json( - vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, - display_code=options.display_code, - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, - 
show_deobfuscated_code=options.show_deobfuscated_code, - deobfuscate=options.deobfuscate)) - else: # (should be impossible) - raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode)) - count += 1 - - except (SubstreamOpenError, UnexpectedDataError) as exc: - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %s - Error opening substream or unexpected ' \ - 'content' % ('?', filename)) - elif options.output_mode == 'json': - print_json(file=filename, type='error', - error=type(exc).__name__, message=str(exc)) - else: - log.exception('Error opening substream or unexpected ' - 'content in %s' % filename) - return_code = RETURN_OPEN_ERROR if return_code == 0 \ - else RETURN_SEVERAL_ERRS - except FileOpenError as exc: - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %s - File format not supported' % ('?', filename)) - elif options.output_mode == 'json': - print_json(file=filename, type='error', - error=type(exc).__name__, message=str(exc)) - else: - log.exception('Failed to open %s -- probably not supported!' % filename) - return_code = RETURN_OPEN_ERROR if return_code == 0 \ - else RETURN_SEVERAL_ERRS - except ProcessingError as exc: - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc)) - elif options.output_mode == 'json': - print_json(file=filename, type='error', - error=type(exc).__name__, - message=str(exc.orig_exc)) - else: - log.exception('Error processing file %s (%s)!' 
- % (filename, exc.orig_exc)) - return_code = RETURN_PARSE_ERROR if return_code == 0 \ - else RETURN_SEVERAL_ERRS - finally: - if vba_parser is not None: - vba_parser.close() - - if options.output_mode == 'triage': - print('\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \ - 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ - 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n') - - if count == 1 and options.output_mode == 'unspecified': - # if options -t, -d and -j were not specified and it's a single file, print details: - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, - display_code=options.display_code, - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, - show_deobfuscated_code=options.show_deobfuscated_code, - deobfuscate=options.deobfuscate) - - if options.output_mode == 'json': - # print last json entry (a last one without a comma) and closing ] - print_json(type='MetaInformation', return_code=return_code, - n_processed=count, _json_is_last=True) - - except Exception as exc: - # some unexpected error, maybe some of the types caught in except clauses - # above were not sufficient. This is very bad, so log complete trace at exception level - # and do not care about output mode - log.exception('Unhandled exception in main: %s' % exc, exc_info=True) - return_code = RETURN_UNEXPECTED # even if there were others before -- this is more important - # TODO: print msg with URL to report issues (except in JSON mode) - - # done. 
exit - log.debug('will exit now with code %s' % return_code) - sys.exit(return_code) - -if __name__ == '__main__': - main() - -# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness diff -Nru remnux-oletools-0.51a/oletools/olevba.py remnux-oletools-0.51a/oletools/olevba.py --- remnux-oletools-0.51a/oletools/olevba.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/olevba.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,3417 +0,0 @@ -#!/usr/bin/env python -""" -olevba.py - -olevba is a script to parse OLE and OpenXML files such as MS Office documents -(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate -and analyze malicious macros. - -Supported formats: -- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) -- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) -- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) -- Word 2003 XML (.xml) -- Word/Excel Single File Web Page / MHTML (.mht) -- Publisher (.pub) - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation - -olevba is part of the python-oletools package: -http://www.decalage.info/python/oletools - -olevba is based on source code from officeparser by John William Davison -https://github.com/unixfreak0037/officeparser -""" - -# === LICENSE ================================================================== - -# olevba is copyright (c) 2014-2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -# olevba contains modified source code from the officeparser project, published -# under the following MIT License (MIT): -# -# officeparser is copyright (c) 2014 John William Davison -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -from __future__ import print_function - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2014-08-05 v0.01 PL: - first version based on officeparser code -# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser -# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record -# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats -# and to find the VBA project root anywhere in the file -# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL -# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API -# - added detect_vba_macros -# 2014-12-10 v0.06 PL: - hide first lines with VB attributes -# - detect auto-executable macros -# - ignore empty macros -# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive -# 2014-12-15 v0.08 PL: - improved display for empty macros -# - added pattern extraction -# 2014-12-25 v0.09 PL: - added suspicious keywords detection -# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file -# - uses xglob to scan several files with wildcards -# - option -r to recurse subdirectories -# - option -z to scan files in password-protected zips -# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons -# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns -# - process_file: improved display, shows container file -# - improved list of executable file extensions -# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display -# 2015-01-08 v0.14 PL: - added hex strings detection and decoding -# - fixed issue #2, decoding VBA stream names using -# specified codepage and unicode stream names -# 2015-01-11 v0.15 
PL: - added new triage mode, options -t and -d -# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") -# - added several suspicious keywords -# - added option -i to analyze VBA source code directly -# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions -# - added scan_vba to run all detection algorithms -# - decoded hex strings are now also scanned + reversed -# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules -# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex -# strings and StrReverse -# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded -# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding -# - improved display, shows obfuscation name -# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename -# - added Base64 obfuscation decoding (contribution from -# @JamesHabben) -# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and -# Dridex strings -# - exception handling in detect_base64_strings -# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display -# - display exceptions with stack trace -# - added several suspicious keywords -# - improved Base64 detection and decoding -# - fixed triage mode not to scan attrib lines -# 2015-03-04 v0.25 PL: - added support for Word 2003 XML -# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and -# virtualisation detection -# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros -# (issue #10 reported by Greg from SpamStopsHere) -# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header -# (issue #11 reported by Thomas Chopitea) -# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account -# various data offsets (issue #12) -# - improved detection of MSO files, avoiding incorrect -# parsing errors (issue #7) -# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by 
@ozhermit, -# Davy Douhine (issue #9), issue #13 -# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc) -# 2015-06-19 PL: - added options -a, -c, --each, --attr -# 2015-06-21 v0.32 PL: - always display decoded strings which are printable -# - fix VBA_Scanner.scan to return raw strings, not repr() -# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues -# 2015-07-12 PL: - added Hex function decoding to VBA Parser -# 2015-07-13 PL: - added Base64 function decoding to VBA Parser -# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions -# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI -# - fixed issue when analysis was done twice -# 2015-09-15 PL: - remove duplicate IOCs from results -# 2015-09-16 PL: - join long VBA lines ending with underscore before scan -# - disabled unused option --each -# 2015-09-22 v0.41 PL: - added new option --reveal -# - added suspicious strings for PowerShell.exe options -# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method -# 2015-10-10 PL: - added support for text files with VBA source code -# 2015-11-17 PL: - fixed bug with --decode option -# 2015-12-16 PL: - fixed bug in main (no options input anymore) -# - improved logging, added -l option -# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht -# - fixed issue #32 by monkeypatching email.feedparser -# 2016-02-07 PL: - KeyboardInterrupt is now raised properly -# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr -# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords -# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis -# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck) -# 2016-03-16 CH: - added option --no-deobfuscate (temporary) -# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate -# - updated suspicious keywords -# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans -# 
2016-04-28 CH: - return an exit code depending on the results -# - improved error and exception handling -# - improved JSON output -# 2016-05-12 CH: - added support for PowerPoint 97-2003 files -# 2016-06-06 CH: - improved handling of unicode VBA module names -# 2016-06-07 CH: - added option --relaxed, stricter parsing by default -# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code -# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6 -# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding) -# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted -# - detect_autoexec now returns the exact keyword found -# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub) -# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6 -# 2016-09-12 PL: - enabled packrat to improve pyparsing performance -# 2016-10-25 PL: - fixed raise and print statements for Python 3 -# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW - -__version__ = '0.51a' - -#------------------------------------------------------------------------------ -# TODO: -# + setup logging (common with other oletools) -# + add xor bruteforcing like bbharvest -# + options -a and -c should imply -d - -# TODO later: -# + performance improvement: instead of searching each keyword separately, -# first split vba code into a list of words (per line), then check each -# word against a dict. (or put vba words into a set/dict?) -# + for regex, maybe combine them into a single re with named groups? -# + add Yara support, include sample rules? plugins like balbuzard? -# + add balbuzard support -# + output to file (replace print by file.write, sys.stdout by default) -# + look for VBA in embedded documents (e.g. 
Excel in Word) -# + support SRP streams (see Lenny's article + links and sample) -# - python 3.x support -# - check VBA macros in Visio, Access, Project, etc -# - extract_macros: convert to a class, split long function into smaller methods -# - extract_macros: read bytes from stream file objects instead of strings -# - extract_macros: use combined struct.unpack instead of many calls -# - all except clauses should target specific exceptions - -#------------------------------------------------------------------------------ -# REFERENCES: -# - [MS-OVBA]: Microsoft Office VBA File Format Structure -# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx -# - officeparser: https://github.com/unixfreak0037/officeparser - - -#--- IMPORTS ------------------------------------------------------------------ - -import sys, logging -import struct -import cStringIO -import math -import zipfile -import re -import optparse -import binascii -import base64 -import zlib -import email # for MHTML parsing -import string # for printable -import json # for json output mode (argument --json) - -# import lxml or ElementTree for XML parsing: -try: - # lxml: best performance for XML processing - import lxml.etree as ET -except ImportError: - try: - # Python 2.5+: batteries included - import xml.etree.cElementTree as ET - except ImportError: - try: - # Python <2.5: standalone ElementTree install - import elementtree.cElementTree as ET - except ImportError: - raise ImportError("lxml or ElementTree are not installed, " \ - + "see http://codespeak.net/lxml " \ - + "or http://effbot.org/zone/element-index.htm") - -import thirdparty.olefile as olefile -from thirdparty.prettytable import prettytable -from thirdparty.xglob import xglob, PathNotFoundException -from thirdparty.pyparsing.pyparsing import \ - CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \ - Optional, QuotedString,Regex, Suppress, Word, WordStart, \ - alphanums, alphas, hexnums,nums, opAssoc, 
srange, \ - infixNotation, ParserElement -import ppt_parser - -# monkeypatch email to fix issue #32: -# allow header lines without ":" -import email.feedparser -email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') - -if sys.version_info[0] <= 2: - # Python 2.x - if sys.version_info[1] <= 6: - # Python 2.6 - # use is_zipfile backported from Python 2.7: - from thirdparty.zipfile27 import is_zipfile - else: - # Python 2.7 - from zipfile import is_zipfile -else: - # Python 3.x+ - from zipfile import is_zipfile - -# === LOGGING ================================================================= - -class NullHandler(logging.Handler): - """ - Log Handler without output, to avoid printing messages if logging is not - configured by the main application. - Python 2.7 has logging.NullHandler, but this is necessary for 2.6: - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library - """ - def emit(self, record): - pass - -def get_logger(name, level=logging.CRITICAL+1): - """ - Create a suitable logger object for this module. - The goal is not to change settings of the root logger, to avoid getting - other modules' logs on the screen. - If a logger exists with same name, reuse it. (Else it would have duplicate - handlers and messages would be doubled.) - The level is set to CRITICAL+1 by default, to avoid any logging. - """ - # First, test if there is already a logger with the same name, else it - # will generate duplicate messages (due to duplicate handlers): - if name in logging.Logger.manager.loggerDict: - #NOTE: another less intrusive but more "hackish" solution would be to - # use getLogger then test if its effective level is not default. 
-        logger = logging.getLogger(name)
-        # make sure level is OK:
-        logger.setLevel(level)
-        return logger
-    # get a new logger:
-    logger = logging.getLogger(name)
-    # only add a NullHandler for this logger, it is up to the application
-    # to configure its own logging:
-    logger.addHandler(NullHandler())
-    logger.setLevel(level)
-    return logger
-
-# a global logger object used for debugging:
-log = get_logger('olevba')
-
-
-#=== EXCEPTIONS ==============================================================
-
-class OlevbaBaseException(Exception):
-    """ Base class for exceptions produced here for simpler except clauses """
-    def __init__(self, msg, filename=None, orig_exc=None, **kwargs):
-        if orig_exc:
-            super(OlevbaBaseException, self).__init__(msg +
-                                                      ' ({0})'.format(orig_exc),
-                                                      **kwargs)
-        else:
-            super(OlevbaBaseException, self).__init__(msg, **kwargs)
-        self.msg = msg
-        self.filename = filename
-        self.orig_exc = orig_exc
-
-
-class FileOpenError(OlevbaBaseException):
-    """ raised by VBA_Parser constructor if all open_... attempts failed
-
-    probably means the file type is not supported
-    """
-
-    def __init__(self, filename, orig_exc=None):
-        super(FileOpenError, self).__init__(
-            'Failed to open file %s' % filename, filename, orig_exc)
-
-
-class ProcessingError(OlevbaBaseException):
-    """ raised by VBA_Parser.process_file* functions """
-
-    def __init__(self, filename, orig_exc):
-        super(ProcessingError, self).__init__(
-            'Error processing file %s' % filename, filename, orig_exc)
-
-
-class MsoExtractionError(RuntimeError, OlevbaBaseException):
-    """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """
-
-    def __init__(self, msg):
-        # initialize both base classes explicitly:
-        RuntimeError.__init__(self, msg)
-        OlevbaBaseException.__init__(self, msg)
-
-
-class SubstreamOpenError(FileOpenError):
-    """ special kind of FileOpenError: file is a substream of original file """
-
-    def __init__(self, filename, subfilename, orig_exc=None):
-        super(SubstreamOpenError, self).__init__(
-            str(filename) + '/' + str(subfilename), orig_exc)
-        self.filename = filename   # overwrite setting in OlevbaBaseException
-        self.subfilename = subfilename
-
-
-class UnexpectedDataError(OlevbaBaseException):
-    """ raised when parsing is strict (=not relaxed) and data is unexpected """
-
-    def __init__(self, stream_path, variable, expected, value):
-        super(UnexpectedDataError, self).__init__(
-            'Unexpected value in {0} for variable {1}: '
-            'expected {2:04X} but found {3:04X}!'
- .format(stream_path, variable, expected, value)) - self.stream_path = stream_path - self.variable = variable - self.expected = expected - self.value = value - -#--- CONSTANTS ---------------------------------------------------------------- - -# return codes -RETURN_OK = 0 -RETURN_WARNINGS = 1 # (reserved, not used yet) -RETURN_WRONG_ARGS = 2 # (fixed, built into optparse) -RETURN_FILE_NOT_FOUND = 3 -RETURN_XGLOB_ERR = 4 -RETURN_OPEN_ERROR = 5 -RETURN_PARSE_ERROR = 6 -RETURN_SEVERAL_ERRS = 7 -RETURN_UNEXPECTED = 8 - -# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python) -MAC_CODEPAGES = { - 10000: 'mac-roman', - 10001: 'shiftjis', # not found: 'mac-shift-jis', - 10003: 'ascii', # nothing appropriate found: 'mac-hangul', - 10008: 'gb2321', # not found: 'mac-gb2312', - 10002: 'big5', # not found: 'mac-big5', - 10005: 'hebrew', # not found: 'mac-hebrew', - 10004: 'mac-arabic', - 10006: 'mac-greek', - 10081: 'mac-turkish', - 10021: 'thai', # not found: mac-thai', - 10029: 'maccentraleurope', # not found: 'mac-east europe', - 10007: 'ascii', # nothing appropriate found: 'mac-russian', -} - -# URL and message to report issues: -URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues' -MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES - -# Container types: -TYPE_OLE = 'OLE' -TYPE_OpenXML = 'OpenXML' -TYPE_Word2003_XML = 'Word2003_XML' -TYPE_MHTML = 'MHTML' -TYPE_TEXT = 'Text' -TYPE_PPT = 'PPT' - -# short tag to display file types in triage mode: -TYPE2TAG = { - TYPE_OLE: 'OLE:', - TYPE_OpenXML: 'OpX:', - TYPE_Word2003_XML: 'XML:', - TYPE_MHTML: 'MHT:', - TYPE_TEXT: 'TXT:', - TYPE_PPT: 'PPT', -} - - -# MSO files ActiveMime header magic -MSO_ACTIVEMIME_HEADER = 'ActiveMime' - -MODULE_EXTENSION = "bas" -CLASS_EXTENSION = "cls" -FORM_EXTENSION = "frm" - -# Namespaces and tags for Word2003 XML parsing: -NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' -# the tag contains the VBA macro 
code: -TAG_BINDATA = NS_W + 'binData' -ATTR_NAME = NS_W + 'name' - -# Keywords to detect auto-executable macros -AUTOEXEC_KEYWORDS = { - # MS Word: - 'Runs when the Word document is opened': - ('AutoExec', 'AutoOpen', 'DocumentOpen'), - 'Runs when the Word document is closed': - ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'), - 'Runs when the Word document is modified': - ('DocumentChange',), - 'Runs when a new Word document is created': - ('AutoNew', 'Document_New', 'NewDocument'), - - # MS Word and Publisher: - 'Runs when the Word or Publisher document is opened': - ('Document_Open',), - 'Runs when the Publisher document is closed': - ('Document_BeforeClose',), - - # MS Excel: - 'Runs when the Excel Workbook is opened': - ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'), - 'Runs when the Excel Workbook is closed': - ('Auto_Close', 'Workbook_Close'), - - # any MS Office application: - 'Runs when the file is opened (using InkPicture ActiveX object)': - # ref:https://twitter.com/joe4security/status/770691099988025345 - (r'\w+_Painted',), - 'Runs when the file is opened and ActiveX objects trigger events': - (r'\w+_(?:GotFocus|LostFocus|MouseHover)',), -} - -# Suspicious Keywords that may be used by malware -# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx -SUSPICIOUS_KEYWORDS = { - #TODO: use regex to support variable whitespaces - 'May read system environment variables': - ('Environ',), - 'May open a file': - ('Open',), - 'May write to a file (if combined with Open)': - #TODO: regex to find Open+Write on same line - ('Write', 'Put', 'Output', 'Print #'), - 'May read or write a binary file (if combined with Open)': - #TODO: regex to find Open+Binary on same line - ('Binary',), - 'May copy a file': - ('FileCopy', 'CopyFile'), - #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx - #CopyFile: 
http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx - 'May delete a file': - ('Kill',), - 'May create a text file': - ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'), - #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx - #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6 - 'May run an executable file or a system command': - ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus', - 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'), - #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx - #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6 - 'May run PowerShell commands': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc - # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/ - # TODO: add support for keywords starting with a non-alpha character, such as "-noexit" - # TODO: '-command', '-EncodedCommand', '-scriptblock' - ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand', - 'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'), - 'May run an executable file or a system command using PowerShell': - ('Start-Process',), - 'May hide the application': - ('Application.Visible', 'ShowWindow', 'SW_HIDE'), - 'May create a directory': - ('MkDir',), - 'May save the current workbook': - ('ActiveWorkbook.SaveAs',), - 'May change which directory contains files to open at startup': - #TODO: confirm the actual effect - ('Application.AltStartupPath',), - 'May create an OLE object': - ('CreateObject',), - 'May create an OLE object using PowerShell': - ('New-Object',), - 'May run an application (if combined with CreateObject)': - ('Shell.Application',), - 'May enumerate application windows (if combined 
with Shell.Application object)': - ('Windows', 'FindWindow'), - 'May run code from a DLL': - #TODO: regex to find declare+lib on same line - ('Lib',), - 'May inject code into another process': - ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload - 'VirtualAllocEx', 'RtlMoveMemory', - ), - 'May run a shellcode in memory': - ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016 - 'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx - 'May download files from the Internet': - #TODO: regex to find urlmon+URLDownloadToFileA on same line - ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP', - 'MSXML2.ServerXMLHTTP', # suggested in issue #13 - 'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z - ), - 'May download files from the Internet using PowerShell': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - ('Net.WebClient', 'DownloadFile', 'DownloadString'), - 'May control another application by simulating user keystrokes': - ('SendKeys', 'AppActivate'), - #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx - 'May attempt to obfuscate malicious function calls': - ('CallByName',), - #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx - 'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)': - #TODO: regex to find several Chr*, not just one - ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'), - #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx - 'May read or write registry keys': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'), - 'May read registry keys': - #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - ('RegQueryValueExA', 'RegQueryValueEx', - 
'RegRead', #with Wscript.Shell - ), - 'May detect virtualization': - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'), - 'May detect Anubis Sandbox': - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA - # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf - ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll - '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId', - '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 'popupkiller' - ), - 'May detect Sandboxie': - # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('SbieDll.dll', 'SandboxieControlWndClass'), - 'May detect Sunbelt Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - (r'C:\file.exe',), - 'May detect Norman Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('currentuser',), - 'May detect CW Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('Schmidti',), - 'May detect WinJail Sandbox': - # ref: http://www.cplusplus.com/forum/windows/96874/ - ('Afx:400000:0',), -} - -# Regular Expression for a URL: -# http://en.wikipedia.org/wiki/Uniform_resource_locator -# http://www.w3.org/Addressing/URL/uri-spec.html -#TODO: also support username:password@server -#TODO: other protocols (file, gopher, wais, ...?) -SCHEME = r'\b(?:http|ftp)s?' -# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains -TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})' -DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')' -#TODO: IPv6 - see https://www.debuggex.com/ -# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. 
[db8:0cec::99:123a] -NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])' -IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255 -# IPv4 must come before the DNS name because it is more specific -SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')' -PORT = r'(?:\:[0-9]{1,5})?' -SERVER_PORT = SERVER + PORT -URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"] -URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH -re_url = re.compile(URL_RE) - - -# Patterns to be extracted (IP addresses, URLs, etc) -# From patterns.py in balbuzard -RE_PATTERNS = ( - ('URL', re.compile(URL_RE)), - ('IPv4 address', re.compile(IPv4)), - # TODO: add IPv6 - ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + '\b')), - # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(? char -vba_chr = Suppress( - Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr') - + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$')) - + '(') + vba_expr_int + Suppress(')') - -def vba_chr_tostr(t): - try: - i = t[0] - # normal, non-unicode character: - if i>=0 and i<=255: - return VbaExpressionString(chr(i)) - else: - return VbaExpressionString(unichr(i).encode('utf-8', 'backslashreplace')) - except ValueError: - log.exception('ERROR: incorrect parameter value for chr(): %r' % i) - return VbaExpressionString('Chr(%r)' % i) - -vba_chr.setParseAction(vba_chr_tostr) - - -# --- ASC -------------------------------------------------------------------- - -# Asc(char) => int -#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW -vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')') -vba_asc.setParseAction(lambda t: ord(t[0])) - - -# --- VAL -------------------------------------------------------------------- - -# Val(string) => int -# TODO: make sure the behavior of VBA's val is fully covered -vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')') 
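To illustrate what the Chr()/Asc()/Val() parse actions above compute, here is a minimal standalone sketch (plain `re`, no pyparsing) that decodes the common `Chr(…) & Chr(…)` obfuscation pattern this grammar targets. `decode_chr_concat` is a hypothetical helper named for this example, not an olevba function, and it covers decimal integer literals only, whereas the real grammar also evaluates Asc(), Val(), StrReverse(), Environ() and nested arithmetic:

```python
import re

def decode_chr_concat(vba_expr):
    """Decode a VBA string expression built from Chr()/ChrB()/ChrW() calls
    joined with the '&' or '+' concatenation operators.
    Illustration only: decimal literals, no nested expressions.
    """
    chars = []
    # split on the string-concatenation operators & and +
    for part in re.split(r'[&+]', vba_expr):
        match = re.match(r'\s*Chr[BW]?\$?\s*\(\s*(\d+)\s*\)\s*$', part,
                         re.IGNORECASE)
        if not match:
            raise ValueError('unsupported token: %r' % part)
        chars.append(chr(int(match.group(1))))
    return ''.join(chars)

print(decode_chr_concat('Chr(104) & Chr(116) & Chr(116) & Chr(112)'))  # http
```

The pyparsing grammar reaches the same result compositionally: each `vba_chr` match is replaced by its decoded character, and `concat_strings_list` then joins the pieces across `&`/`+` operators.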
-vba_val.setParseAction(lambda t: int(t[0].strip())) - - -# --- StrReverse() -------------------------------------------------------------------- - -# StrReverse(string) => string -strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')') -strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1])) - - -# --- ENVIRON() -------------------------------------------------------------------- - -# Environ("name") => just translated to "%name%", that is enough for malware analysis -environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')') -environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0])) - - -# --- IDENTIFIER ------------------------------------------------------------- - -#TODO: see MS-VBAL 3.3.5 page 33 -# 3.3.5 Identifier Tokens -# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character -# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z -# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore -latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_') - -# --- HEX FUNCTION ----------------------------------------------------------- - -# match any custom function name with a hex string as argument: -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime - -# quoted string of at least two hexadecimal numbers of two digits: -quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"') -quoted_hex_string.setParseAction(lambda t: str(t[0])) - -hex_function_call = Suppress(latin_identifier) + Suppress('(') + \ - quoted_hex_string('hex_string') + Suppress(')') -hex_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string))) - - -# --- BASE64 FUNCTION ----------------------------------------------------------- - -# match any custom function name with a Base64 
string as argument: -# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime - -# quoted string of at least two hexadecimal numbers of two digits: -quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"') -quoted_base64_string.setParseAction(lambda t: str(t[0])) - -base64_function_call = Suppress(latin_identifier) + Suppress('(') + \ - quoted_base64_string('base64_string') + Suppress(')') -base64_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string))) - - -# ---STRING EXPRESSION ------------------------------------------------------- - -def concat_strings_list(tokens): - """ - parse action to concatenate strings in a VBA expression with operators '+' or '&' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of strings such as [a,'&',b,'&',c,...] - strings = tokens[0][::2] - return VbaExpressionString(''.join(strings)) - - -vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call) - -vba_expr_str <<= infixNotation(vba_expr_str_item, - [ - ("+", 2, opAssoc.LEFT, concat_strings_list), - ("&", 2, opAssoc.LEFT, concat_strings_list), - ]) - - -# --- INTEGER EXPRESSION ------------------------------------------------------- - -def sum_ints_list(tokens): - """ - parse action to sum integers in a VBA expression with operator '+' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] - integers = tokens[0][::2] - return sum(integers) - - -def subtract_ints_list(tokens): - """ - parse action to subtract integers in a VBA expression with operator '-' - """ - # extract argument from the tokens: - # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] 
-    integers = tokens[0][::2]
-    return reduce(lambda x, y: x - y, integers)
-
-
-def multiply_ints_list(tokens):
-    """
-    parse action to multiply integers in a VBA expression with operator '*'
-    """
-    # extract argument from the tokens:
-    # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...]
-    integers = tokens[0][::2]
-    return reduce(lambda x, y: x * y, integers)
-
-
-def divide_ints_list(tokens):
-    """
-    parse action to divide integers in a VBA expression with operator '/'
-    """
-    # extract argument from the tokens:
-    # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...]
-    integers = tokens[0][::2]
-    return reduce(lambda x, y: x / y, integers)
-
-
-vba_expr_int_item = (vba_asc | vba_val | integer)
-
-# operators associativity:
-# https://en.wikipedia.org/wiki/Operator_associativity
-
-vba_expr_int <<= infixNotation(vba_expr_int_item,
-    [
-        ("*", 2, opAssoc.LEFT, multiply_ints_list),
-        ("/", 2, opAssoc.LEFT, divide_ints_list),
-        ("-", 2, opAssoc.LEFT, subtract_ints_list),
-        ("+", 2, opAssoc.LEFT, sum_ints_list),
-    ])
-
-
-# see detect_vba_strings for the deobfuscation code using this grammar
-
-# === MSO/ActiveMime files parsing ===========================================
-
-def is_mso_file(data):
-    """
-    Check if the provided data is the content of a MSO/ActiveMime file, such as
-    the ones created by Outlook in some cases, or Word/Excel when saving a
-    file with the MHTML format or the Word 2003 XML format.
-    This function only checks the ActiveMime magic at the beginning of data.
- :param data: bytes string, MSO/ActiveMime file content - :return: bool, True if the file is MSO, False otherwise - """ - return data.startswith(MSO_ACTIVEMIME_HEADER) - - -# regex to find zlib block headers, starting with byte 0x78 = 'x' -re_zlib_header = re.compile(r'x') - - -def mso_file_extract(data): - """ - Extract the data stored into a MSO/ActiveMime file, such as - the ones created by Outlook in some cases, or Word/Excel when saving a - file with the MHTML format or the Word 2003 XML format. - - :param data: bytes string, MSO/ActiveMime file content - :return: bytes string, extracted data (uncompressed) - - raise a MsoExtractionError if the data cannot be extracted - """ - # check the magic: - assert is_mso_file(data) - - # In all the samples seen so far, Word always uses an offset of 0x32, - # and Excel 0x22A. But we read the offset from the header to be more - # generic. - offsets = [0x32, 0x22A] - - # First, attempt to get the compressed data offset from the header - # According to my tests, it should be an unsigned 16 bits integer, - # at offset 0x1E (little endian) + add 46: - try: - offset = struct.unpack_from('> bit_count - offset_mask = ~length_mask - maximum_length = (0xFFFF >> bit_count) + 3 - return length_mask, offset_mask, bit_count, maximum_length - - -def decompress_stream(compressed_container): - """ - Decompress a stream according to MS-OVBA section 2.4.1 - - compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm - return the decompressed container as a string (bytes) - """ - # 2.4.1.2 State Variables - - # The following state is maintained for the CompressedContainer (section 2.4.1.1.1): - # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1). - # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by - # decompression or to be written by compression. 
-
-    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
-    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
-    #                       CompressedContainer (section 2.4.1.1.1).
-
-    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
-    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
-    #                      decompression or to be read by compression.
-    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).
-
-    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
-    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
-    #                         DecompressedBuffer (section 2.4.1.1.2).
-
-    decompressed_container = ''  # result
-    compressed_current = 0
-
-    sig_byte = ord(compressed_container[compressed_current])
-    if sig_byte != 0x01:
-        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
-
-    compressed_current += 1
-
-    #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that
-    #      CompressedRecordEnd = len(compressed_container)
-    while compressed_current < len(compressed_container):
-        # 2.4.1.1.5
-        compressed_chunk_start = compressed_current
-        # chunk header = first 16 bits
-        compressed_chunk_header = \
-            struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
-        # chunk size = 12 first bits of header + 3
-        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
-        # chunk signature = next 3 bits, should always be 0b011
-        chunk_signature = (compressed_chunk_header >> 12) & 0x07
-        if chunk_signature != 0b011:
-            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
-        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
-        chunk_flag = (compressed_chunk_header >> 15) & 0x01
-        log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))
-
-        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
-        # The minimum size is 3 bytes
-        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
-        # in the chunk header before adding 3).
-        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
-        if chunk_flag == 1 and chunk_size > 4098:
-            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
-        if chunk_flag == 0 and chunk_size != 4098:
-            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')
-
-        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
-        #TODO: raise an exception?
-        if compressed_chunk_start + chunk_size > len(compressed_container):
-            log.warning('Chunk size is larger than remaining compressed data')
-        compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size])
-        # read after chunk header:
-        compressed_current = compressed_chunk_start + 2
-
-        if chunk_flag == 0:
-            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
-            # uncompressed chunk: read the next 4096 bytes as-is
-            #TODO: check if there are at least 4096 bytes left
-            decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
-            compressed_current += 4096
-        else:
-            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
-            # compressed chunk
-            decompressed_chunk_start = len(decompressed_container)
-            while compressed_current < compressed_end:
-                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
-                # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
-                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
-                # copy tokens (reference to a previous literal token)
-                flag_byte = ord(compressed_container[compressed_current])
-                compressed_current += 1
-                for bit_index in xrange(0, 8):
-                    # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
-                    if compressed_current >= compressed_end:
-                        break
-                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
-                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
-                    flag_bit = (flag_byte >> bit_index) & 1
-                    #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
-                    if flag_bit == 0:  # LiteralToken
-                        # copy one byte directly to output
-                        decompressed_container += compressed_container[compressed_current]
-                        compressed_current += 1
-                    else:  # CopyToken
-                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
-                        copy_token = \
-                            struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
-                        # MS-OVBA 2.4.1.3.19.1 CopyToken Help: extract Length and Offset fields
-                        length_mask, offset_mask, bit_count, _ = copytoken_help(
-                            len(decompressed_container), decompressed_chunk_start)
-                        length = (copy_token & length_mask) + 3
-                        temp1 = copy_token & offset_mask
-                        temp2 = 16 - bit_count
-                        offset = (temp1 >> temp2) + 1
-                        #log.debug('offset=%d length=%d' % (offset, length))
-                        copy_source = len(decompressed_container) - offset
-                        for index in
xrange(copy_source, copy_source + length): - decompressed_container += decompressed_container[index] - compressed_current += 2 - return decompressed_container - - -def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): - """ - Extract VBA macros from an OleFileIO object. - Internal function, do not call directly. - - vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream - vba_project: path to the PROJECT stream - :param relaxed: If True, only create info/debug log entry if data is not as expected - (e.g. opening substream fails); if False, raise an error in this case - This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream - """ - # Open the PROJECT stream: - project = ole.openstream(project_path) - log.debug('relaxed is %s' % relaxed) - - # sample content of the PROJECT stream: - - ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" - ## Document=ThisDocument/&H00000000 - ## Module=NewMacros - ## Name="Project" - ## HelpContextID="0" - ## VersionCompatible32="393222000" - ## CMG="F1F301E705E705E705E705" - ## DPB="8F8D7FE3831F2020202020" - ## GC="2D2FDD81E51EE61EE6E1" - ## - ## [Host Extender Info] - ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 - ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 - ## - ## [Workspace] - ## ThisDocument=22, 29, 339, 477, Z - ## NewMacros=-4, 42, 832, 510, C - - code_modules = {} - - for line in project: - line = line.strip() - if '=' in line: - # split line at the 1st equal sign: - name, value = line.split('=', 1) - # looking for code modules - # add the code module as a key in the dictionary - # the value will be the extension needed later - # The value is converted to lowercase, to allow case-insensitive matching (issue #3) - value = value.lower() - if name == 'Document': - # split value at the 1st slash, keep 1st part: - value = value.split('/', 1)[0] - code_modules[value] = CLASS_EXTENSION - 
elif name == 'Module': - code_modules[value] = MODULE_EXTENSION - elif name == 'Class': - code_modules[value] = CLASS_EXTENSION - elif name == 'BaseClass': - code_modules[value] = FORM_EXTENSION - - # read data from dir stream (compressed) - dir_compressed = ole.openstream(dir_path).read() - - def check_value(name, expected, value): - if expected != value: - if relaxed: - log.error("invalid value for {0} expected {1:04X} got {2:04X}" - .format(name, expected, value)) - else: - raise UnexpectedDataError(dir_path, name, expected, value) - - dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) - - # PROJECTSYSKIND Record - projectsyskind_id = struct.unpack(" 128: - log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) - projectname_projectname = dir_stream.read(projectname_sizeof_projectname) - unused = projectname_projectname - - # PROJECTDOCSTRING Record - projectdocstring_id = struct.unpack(" 2000: - log.error( - "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) - projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) - projectdocstring_reserved = struct.unpack(" 260: - log.error( - "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) - projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) - projecthelpfilepath_reserved = struct.unpack(" 1015: - log.error( - "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants)) - projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) - projectconstants_reserved = struct.unpack(" 0: - code_data = decompress_stream(code_data) - # case-insensitive search in the code_modules dict to find the file extension: - filext = code_modules.get(modulename_modulename.lower(), 'bin') - filename = '{0}.{1}'.format(modulename_modulename, filext) - 
#TODO: also yield the codepage so that callers can decode it properly - yield (code_path, filename, code_data) - # print '-'*79 - # print filename - # print '' - # print code_data - # print '' - log.debug('extracted file {0}'.format(filename)) - else: - log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) - except (UnexpectedDataError, SubstreamOpenError): - raise - except Exception as exc: - log.info('Error parsing module {0} of {1} in _extract_vba:' - .format(projectmodule_index, projectmodules_count), - exc_info=True) - if not relaxed: - raise - _ = unused # make pylint happy: now variable "unused" is being used ;-) - return - - -def vba_collapse_long_lines(vba_code): - """ - Parse a VBA module code to detect continuation line characters (underscore) and - collapse split lines. Continuation line characters are replaced by spaces. - - :param vba_code: str, VBA module code - :return: str, VBA module code with long lines collapsed - """ - # TODO: use a regex instead, to allow whitespaces after the underscore? - vba_code = vba_code.replace(' _\r\n', ' ') - vba_code = vba_code.replace(' _\r', ' ') - vba_code = vba_code.replace(' _\n', ' ') - return vba_code - - -def filter_vba(vba_code): - """ - Filter VBA source code to remove the first lines starting with "Attribute VB_", - which are automatically added by MS Office and not displayed in the VBA Editor. - This should only be used when displaying source code for human analysis. - - Note: lines are not filtered if they contain a colon, because it could be - used to hide malicious instructions. - - :param vba_code: str, VBA source code - :return: str, filtered VBA source code - """ - vba_lines = vba_code.splitlines() - start = 0 - for line in vba_lines: - if line.startswith("Attribute VB_") and not ':' in line: - start += 1 - else: - break - #TODO: also remove empty lines? 
- vba = '\n'.join(vba_lines[start:]) - return vba - - -def detect_autoexec(vba_code, obfuscation=None): - """ - Detect if the VBA code contains keywords corresponding to macros running - automatically when triggered by specific actions (e.g. when a document is - opened or closed). - - :param vba_code: str, VBA source code - :param obfuscation: None or str, name of obfuscation to be added to description - :return: list of str tuples (keyword, description) - """ - #TODO: merge code with detect_suspicious - # case-insensitive search - #vba_code = vba_code.lower() - results = [] - obf_text = '' - if obfuscation: - obf_text = ' (obfuscation: %s)' % obfuscation - for description, keywords in AUTOEXEC_KEYWORDS.items(): - for keyword in keywords: - #TODO: if keyword is already a compiled regex, use it as-is - # search using regex to detect word boundaries: - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) - if match: - #if keyword.lower() in vba_code: - found_keyword = match.group() - results.append((found_keyword, description + obf_text)) - return results - - -def detect_suspicious(vba_code, obfuscation=None): - """ - Detect if the VBA code contains suspicious keywords corresponding to - potential malware behaviour. 
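The keyword tables (`AUTOEXEC_KEYWORDS`, `SUSPICIOUS_KEYWORDS`) are defined earlier in the module; the matching logic above reduces to a case-insensitive, word-boundary regex search per keyword. A sketch with an illustrative one-entry table (the real table is much larger, and the description text here is invented for the example):

```python
import re

AUTOEXEC_KEYWORDS = {
    # illustrative subset; olevba defines many more entries
    'Runs when the Word document is opened': ('AutoOpen', 'Document_Open'),
}

def detect_autoexec(vba_code):
    results = []
    for description, keywords in AUTOEXEC_KEYWORDS.items():
        for keyword in keywords:
            # word-boundary, case-insensitive search, as in the code above
            match = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
            if match:
                results.append((match.group(), description))
    return results

hits = detect_autoexec('Sub AutoOpen()\nEnd Sub')
```

The `\b` boundaries avoid matching a keyword embedded inside a longer identifier, and `match.group()` preserves the original casing found in the sample.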
- - :param vba_code: str, VBA source code - :param obfuscation: None or str, name of obfuscation to be added to description - :return: list of str tuples (keyword, description) - """ - # case-insensitive search - #vba_code = vba_code.lower() - results = [] - obf_text = '' - if obfuscation: - obf_text = ' (obfuscation: %s)' % obfuscation - for description, keywords in SUSPICIOUS_KEYWORDS.items(): - for keyword in keywords: - # search using regex to detect word boundaries: - match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) - if match: - #if keyword.lower() in vba_code: - found_keyword = match.group() - results.append((found_keyword, description + obf_text)) - return results - - -def detect_patterns(vba_code, obfuscation=None): - """ - Detect if the VBA code contains specific patterns such as IP addresses, - URLs, e-mail addresses, executable file names, etc. - - :param vba_code: str, VBA source code - :return: list of str tuples (pattern type, value) - """ - results = [] - found = set() - obf_text = '' - if obfuscation: - obf_text = ' (obfuscation: %s)' % obfuscation - for pattern_type, pattern_re in RE_PATTERNS: - for match in pattern_re.finditer(vba_code): - value = match.group() - if value not in found: - results.append((pattern_type + obf_text, value)) - found.add(value) - return results - - -def detect_hex_strings(vba_code): - """ - Detect if the VBA code contains strings encoded in hexadecimal. - - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - results = [] - found = set() - for match in re_hex_string.finditer(vba_code): - value = match.group() - if value not in found: - decoded = binascii.unhexlify(value) - results.append((value, decoded)) - found.add(value) - return results - - -def detect_base64_strings(vba_code): - """ - Detect if the VBA code contains strings encoded in base64. 
- - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - #TODO: avoid matching simple hex strings as base64? - results = [] - found = set() - for match in re_base64_string.finditer(vba_code): - # extract the base64 string without quotes: - value = match.group().strip('"') - # check it is not just a hex string: - if not re_nothex_check.search(value): - continue - # only keep new values and not in the whitelist: - if value not in found and value.lower() not in BASE64_WHITELIST: - try: - decoded = base64.b64decode(value) - results.append((value, decoded)) - found.add(value) - except (TypeError, ValueError) as exc: - log.debug('Failed to base64-decode (%s)' % exc) - # if an exception occurs, it is likely not a base64-encoded string - return results - - -def detect_dridex_strings(vba_code): - """ - Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples. - - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode - - results = [] - found = set() - for match in re_dridex_string.finditer(vba_code): - value = match.group()[1:-1] - # check it is not just a hex string: - if not re_nothex_check.search(value): - continue - if value not in found: - try: - decoded = DridexUrlDecode(value) - results.append((value, decoded)) - found.add(value) - except Exception as exc: - log.debug('Failed to Dridex-decode (%s)' % exc) - # if an exception occurs, it is likely not a dridex-encoded string - return results - - -def detect_vba_strings(vba_code): - """ - Detect if the VBA code contains strings obfuscated with VBA expressions - using keywords such as Chr, Asc, Val, StrReverse, etc. 
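The hex, base64 and Dridex detectors above share one shape: scan with a regex, decode each new match, and de-duplicate through a `found` set. The same shape in isolation, using a hex pattern similar in spirit to the module's `re_hex_string` (the real regex is defined earlier in the file):

```python
import binascii
import re

# hedged approximation: runs of at least 8 hex digits (4 byte pairs)
re_hex = re.compile(r'(?:[0-9A-Fa-f]{2}){4,}')

def detect_hex_strings(vba_code):
    """Return (encoded, decoded) tuples for each distinct hex string found."""
    results = []
    found = set()
    for match in re_hex.finditer(vba_code):
        value = match.group()
        if value not in found:          # de-duplicate repeated strings
            results.append((value, binascii.unhexlify(value)))
            found.add(value)
    return results

hits = detect_hex_strings('x = "68747470"')   # hex encoding of "http"
```

The decoded bytes are later appended to the scanned text, which is how obfuscated IOCs such as URLs become visible to the plain keyword and pattern scans.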
- - :param vba_code: str, VBA source code - :return: list of str tuples (encoded string, decoded string) - """ - # TODO: handle exceptions - results = [] - found = set() - # IMPORTANT: to extract the actual VBA expressions found in the code, - # we must expand tabs to have the same string as pyparsing. - # Otherwise, start and end offsets are incorrect. - vba_code = vba_code.expandtabs() - for tokens, start, end in vba_expr_str.scanString(vba_code): - encoded = vba_code[start:end] - decoded = tokens[0] - if isinstance(decoded, VbaExpressionString): - # This is a VBA expression, not a simple string - # print 'VBA EXPRESSION: encoded=%r => decoded=%r' % (encoded, decoded) - # remove parentheses and quotes from original string: - # if encoded.startswith('(') and encoded.endswith(')'): - # encoded = encoded[1:-1] - # if encoded.startswith('"') and encoded.endswith('"'): - # encoded = encoded[1:-1] - # avoid duplicates and simple strings: - if encoded not in found and decoded != encoded: - results.append((encoded, decoded)) - found.add(encoded) - # else: - # print 'VBA STRING: encoded=%r => decoded=%r' % (encoded, decoded) - return results - - -def json2ascii(json_obj, encoding='utf8', errors='replace'): - """ ensure there is no unicode in json and all strings are safe to decode - - works recursively, decodes and re-encodes every string to/from unicode - to ensure there will be no trouble in loading the dumped json output - """ - if json_obj is None: - pass - elif isinstance(json_obj, (bool, int, float)): - pass - elif isinstance(json_obj, str): - # de-code and re-encode - dencoded = json_obj.decode(encoding, errors).encode(encoding, errors) - if dencoded != json_obj: - log.debug('json2ascii: replaced: {0} (len {1})' - .format(json_obj, len(json_obj))) - log.debug('json2ascii: with: {0} (len {1})' - .format(dencoded, len(dencoded))) - return dencoded - elif isinstance(json_obj, unicode): - log.debug('json2ascii: encode unicode: {0}' - .format(json_obj.encode(encoding, 
errors)))
-        # cannot put original into logger
-        # print 'original: ' json_obj
-        return json_obj.encode(encoding, errors)
-    elif isinstance(json_obj, dict):
-        for key in json_obj:
-            json_obj[key] = json2ascii(json_obj[key])
-    elif isinstance(json_obj, list):
-        # re-assign by index: a plain "for item in json_obj" would only rebind
-        # the loop variable and leave the list contents unchanged
-        for index, item in enumerate(json_obj):
-            json_obj[index] = json2ascii(item)
-    elif isinstance(json_obj, tuple):
-        # tuples are immutable, so return a converted copy instead
-        return tuple(json2ascii(item) for item in json_obj)
-    else:
-        log.debug('unexpected type in json2ascii: {0} -- leave as is'
-                  .format(type(json_obj)))
-    return json_obj
-
-
-_have_printed_json_start = False
-
-def print_json(json_dict=None, _json_is_last=False, **json_parts):
-    """ line-wise print of json.dumps(json2ascii(..)) with options and indent+1
-
-    can use in two ways:
-    (1) print_json(some_dict)
-    (2) print_json(key1=value1, key2=value2, ...)
-
-    :param bool _json_is_last: set to True only for the very last entry, to
-                               complete the top-level json list
-    """
-    global _have_printed_json_start
-
-    if json_dict and json_parts:
-        raise ValueError('Invalid json argument: want either single dict or '
-                         'key=value parts but got both')
-    elif (json_dict is not None) and (not isinstance(json_dict, dict)):
-        raise ValueError('Invalid json argument: want either single dict or '
-                         'key=value parts but got {0} instead of dict'
-                         .format(type(json_dict)))
-    if json_parts:
-        json_dict = json_parts
-
-    if not _have_printed_json_start:
-        print('[')
-        _have_printed_json_start = True
-
-    lines = json.dumps(json2ascii(json_dict), check_circular=False,
-                       indent=4, ensure_ascii=False).splitlines()
-    for line in lines[:-1]:
-        print('    {0}'.format(line))
-    if _json_is_last:
-        print('    {0}'.format(lines[-1]))   # print last line without comma
-        print(']')
-    else:
-        print('    {0},'.format(lines[-1]))  # print last line with comma
-
-
-class VBA_Scanner(object):
-    """
-    Class to scan the source code of a VBA module to find obfuscated strings,
-    suspicious keywords, IOCs, auto-executable macros, etc.
- """ - - def __init__(self, vba_code): - """ - VBA_Scanner constructor - - :param vba_code: str, VBA source code to be analyzed - """ - # join long lines ending with " _": - self.code = vba_collapse_long_lines(vba_code) - self.code_hex = '' - self.code_hex_rev = '' - self.code_rev_hex = '' - self.code_base64 = '' - self.code_dridex = '' - self.code_vba = '' - self.strReverse = None - # results = None before scanning, then a list of tuples after scanning - self.results = None - self.autoexec_keywords = None - self.suspicious_keywords = None - self.iocs = None - self.hex_strings = None - self.base64_strings = None - self.dridex_strings = None - self.vba_strings = None - - - def scan(self, include_decoded_strings=False, deobfuscate=False): - """ - Analyze the provided VBA code to detect suspicious keywords, - auto-executable macros, IOC patterns, obfuscation patterns - such as hex-encoded strings. - - :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. 
- :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - :return: list of tuples (type, keyword, description) - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') - """ - # First, detect and extract hex-encoded strings: - self.hex_strings = detect_hex_strings(self.code) - # detect if the code contains StrReverse: - self.strReverse = False - if 'strreverse' in self.code.lower(): self.strReverse = True - # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: - for encoded, decoded in self.hex_strings: - self.code_hex += '\n' + decoded - # if the code contains "StrReverse", also append the hex strings in reverse order: - if self.strReverse: - # StrReverse after hex decoding: - self.code_hex_rev += '\n' + decoded[::-1] - # StrReverse before hex decoding: - self.code_rev_hex += '\n' + binascii.unhexlify(encoded[::-1]) - #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ - #TODO: also append the full code reversed if StrReverse? (risk of false positives?) 
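The two StrReverse variants above differ only in ordering: reverse the decoded bytes, or reverse the hex text before decoding. Illustrated on a sample whose hex decodes to the reversed string:

```python
import binascii

encoded = '70747468'                     # unhexlifies to b'ptth'
decoded = binascii.unhexlify(encoded)

# "StrReverse after hex decoding": reverse the decoded bytes
rev_after = decoded[::-1]
# "StrReverse before hex decoding": reverse the hex text, then decode it
rev_before = binascii.unhexlify(encoded[::-1])
```

Both candidate decodings are appended to the scanned text, since a malicious macro may call `StrReverse` on either side of its own hex decoding.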
- # Detect Base64-encoded strings - self.base64_strings = detect_base64_strings(self.code) - for encoded, decoded in self.base64_strings: - self.code_base64 += '\n' + decoded - # Detect Dridex-encoded strings - self.dridex_strings = detect_dridex_strings(self.code) - for encoded, decoded in self.dridex_strings: - self.code_dridex += '\n' + decoded - # Detect obfuscated strings in VBA expressions - if deobfuscate: - self.vba_strings = detect_vba_strings(self.code) - else: - self.vba_strings = [] - for encoded, decoded in self.vba_strings: - self.code_vba += '\n' + decoded - results = [] - self.autoexec_keywords = [] - self.suspicious_keywords = [] - self.iocs = [] - - for code, obfuscation in ( - (self.code, None), - (self.code_hex, 'Hex'), - (self.code_hex_rev, 'Hex+StrReverse'), - (self.code_rev_hex, 'StrReverse+Hex'), - (self.code_base64, 'Base64'), - (self.code_dridex, 'Dridex'), - (self.code_vba, 'VBA expression'), - ): - self.autoexec_keywords += detect_autoexec(code, obfuscation) - self.suspicious_keywords += detect_suspicious(code, obfuscation) - self.iocs += detect_patterns(code, obfuscation) - - # If hex-encoded strings were discovered, add an item to suspicious keywords: - if self.hex_strings: - self.suspicious_keywords.append(('Hex Strings', - 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) - if self.base64_strings: - self.suspicious_keywords.append(('Base64 Strings', - 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) - if self.dridex_strings: - self.suspicious_keywords.append(('Dridex Strings', - 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) - if self.vba_strings: - self.suspicious_keywords.append(('VBA obfuscated Strings', - 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)')) - # use a set to avoid duplicate keywords - keyword_set = set() 
- for keyword, description in self.autoexec_keywords: - if keyword not in keyword_set: - results.append(('AutoExec', keyword, description)) - keyword_set.add(keyword) - keyword_set = set() - for keyword, description in self.suspicious_keywords: - if keyword not in keyword_set: - results.append(('Suspicious', keyword, description)) - keyword_set.add(keyword) - keyword_set = set() - for pattern_type, value in self.iocs: - if value not in keyword_set: - results.append(('IOC', value, pattern_type)) - keyword_set.add(value) - - # include decoded strings only if they are printable or if --decode option: - for encoded, decoded in self.hex_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('Hex String', decoded, encoded)) - for encoded, decoded in self.base64_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('Base64 String', decoded, encoded)) - for encoded, decoded in self.dridex_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('Dridex string', decoded, encoded)) - for encoded, decoded in self.vba_strings: - if include_decoded_strings or is_printable(decoded): - results.append(('VBA string', decoded, encoded)) - self.results = results - return results - - def scan_summary(self): - """ - Analyze the provided VBA code to detect suspicious keywords, - auto-executable macros, IOC patterns, obfuscation patterns - such as hex-encoded strings. 
- - :return: tuple with the number of items found for each category: - (autoexec, suspicious, IOCs, hex, base64, dridex, vba) - """ - # avoid scanning the same code twice: - if self.results is None: - self.scan() - return (len(self.autoexec_keywords), len(self.suspicious_keywords), - len(self.iocs), len(self.hex_strings), len(self.base64_strings), - len(self.dridex_strings), len(self.vba_strings)) - - -def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): - """ - Analyze the provided VBA code to detect suspicious keywords, - auto-executable macros, IOC patterns, obfuscation patterns - such as hex-encoded strings. - (shortcut for VBA_Scanner(vba_code).scan()) - - :param vba_code: str, VBA source code to be analyzed - :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - :return: list of tuples (type, keyword, description) - (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') - """ - return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) - - -#=== CLASSES ================================================================= - -class VBA_Parser(object): - """ - Class to parse MS Office files, to detect VBA macros and extract VBA source code - Supported file formats: - - Word 97-2003 (.doc, .dot) - - Word 2007+ (.docm, .dotm) - - Word 2003 XML (.xml) - - Word MHT - Single File Web Page / MHTML (.mht) - - Excel 97-2003 (.xls) - - Excel 2007+ (.xlsm, .xlsb) - - PowerPoint 97-2003 (.ppt) - - PowerPoint 2007+ (.pptm, .ppsm) - """ - - def __init__(self, filename, data=None, container=None, relaxed=False): - """ - Constructor for VBA_Parser - - :param filename: filename or path of file to parse, or file-like object - - :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). 
If data is provided as a bytes string, it will be parsed as the content
-            of the file in memory, and not read from disk.
-            Note: files must be read in binary mode, i.e. open(f, 'rb').
-
-        :param container: str, path and filename of container if the file is within
-            a zip archive, None otherwise.
-
-        :param relaxed: if True, treat malformed documents and missing streams more like MS Office:
-            do nothing; if False (default), raise errors in these cases
-
-        raises a FileOpenError if all attempts to interpret the data header failed
-        """
-        #TODO: filename should only be a string, data should be used for the file-like object
-        #TODO: filename should be mandatory, optional data is a string or file-like object
-        #TODO: also support olefile and zipfile as input
-        if data is None:
-            # open file from disk:
-            _file = filename
-        else:
-            # file already read in memory, make it a file-like object for zipfile:
-            _file = cStringIO.StringIO(data)
-        #self.file = _file
-        self.ole_file = None
-        self.ole_subfiles = []
-        self.filename = filename
-        self.container = container
-        self.relaxed = relaxed
-        self.type = None
-        self.vba_projects = None
-        self.vba_forms = None
-        self.contains_macros = None  # will be set to True or False by detect_macros
-        self.vba_code_all_modules = None  # to store the source code of all modules
-        # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
-        self.modules = None
-        # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner
-        self.analysis_results = None
-        # statistics for the scan summary and flags
-        self.nb_macros = 0
-        self.nb_autoexec = 0
-        self.nb_suspicious = 0
-        self.nb_iocs = 0
-        self.nb_hexstrings = 0
-        self.nb_base64strings = 0
-        self.nb_dridexstrings = 0
-        self.nb_vbastrings = 0
-
-        # if filename is None:
-        #     if isinstance(_file, basestring):
-        #         if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
-        #             self.filename = _file
-        #         else:
-        #             self.filename = ''
-        #     else:
-        #         self.filename = ''
-        if olefile.isOleFile(_file):
-            # This looks like an OLE file
-            self.open_ole(_file)
-
-            # if this worked, try whether it is a ppt file (special ole file)
-            self.open_ppt()
-        if self.type is None and is_zipfile(_file):
-            # Zip file, which may be an OpenXML document
-            self.open_openxml(_file)
-        if self.type is None:
-            # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
-            # or a plain text file containing VBA code
-            if data is None:
-                data = open(filename, 'rb').read()
-            # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
-            if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
-                self.open_word2003xml(data)
-            # store a lowercase version for the next tests:
-            data_lowercase = data.lower()
-            # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
-            # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
-            # BUT Word accepts a blank line or other MIME headers inserted before,
-            # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
-            # And the line is case insensitive.
-            # so we'll just check the presence of mime, version and multipart anywhere:
-            if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase \
-                    and 'multipart' in data_lowercase:
-                self.open_mht(data)
-        #TODO: handle exceptions
-        #TODO: Excel 2003 XML
-        # Check if this is a plain text VBA or VBScript file:
-        # To avoid scanning binary files, we simply check for some control chars:
-        if self.type is None and '\x00' not in data:
-            self.open_text(data)
-        if self.type is None:
-            # At this stage, could not match a known format:
-            msg = '%s is not a supported file type, cannot extract VBA Macros.'
% self.filename - log.info(msg) - raise FileOpenError(msg) - - def open_ole(self, _file): - """ - Open an OLE file - :param _file: filename or file contents in a file object - :return: nothing - """ - log.info('Opening OLE file %s' % self.filename) - try: - # Open and parse the OLE file, using unicode for path names: - self.ole_file = olefile.OleFileIO(_file, path_encoding=None) - # set type only if parsing succeeds - self.type = TYPE_OLE - except (IOError, TypeError, ValueError) as exc: - # TODO: handle OLE parsing exceptions - log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - - - def open_openxml(self, _file): - """ - Open an OpenXML file - :param _file: filename or file contents in a file object - :return: nothing - """ - # This looks like a zip file, need to look for vbaProject.bin inside - # It can be any OLE file inside the archive - #...because vbaProject.bin can be renamed: - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 - log.info('Opening ZIP/OpenXML file %s' % self.filename) - try: - z = zipfile.ZipFile(_file) - #TODO: check if this is actually an OpenXML file - #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically - # check each file within the zip if it is an OLE file, by reading its magic: - for subfile in z.namelist(): - magic = z.open(subfile).read(len(olefile.MAGIC)) - if magic == olefile.MAGIC: - log.debug('Opening OLE file %s within zip' % subfile) - ole_data = z.open(subfile).read() - try: - self.ole_subfiles.append( - VBA_Parser(filename=subfile, data=ole_data, - relaxed=self.relaxed)) - except OlevbaBaseException as exc: - if self.relaxed: - log.info('%s is not a valid OLE file (%s)' % (subfile, exc)) - log.debug('Trace:', exc_info=True) - continue - else: - raise SubstreamOpenError(self.filename, subfile, - exc) - z.close() - # set type only if parsing succeeds - self.type = TYPE_OpenXML 
- except OlevbaBaseException as exc: - if self.relaxed: - log.info('Error {0} caught in Zip/OpenXML parsing for file {1}' - .format(exc, self.filename)) - log.debug('Trace:', exc_info=True) - else: - raise - except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc: - # TODO: handle parsing exceptions - log.info('Failed Zip/OpenXML parsing for file %r (%s)' - % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - - def open_word2003xml(self, data): - """ - Open a Word 2003 XML file - :param data: file contents in a string or bytes - :return: nothing - """ - log.info('Opening Word 2003 XML file %s' % self.filename) - try: - # parse the XML content - # TODO: handle XML parsing exceptions - et = ET.fromstring(data) - # find all the binData elements: - for bindata in et.getiterator(TAG_BINDATA): - # the binData content is an OLE container for the VBA project, compressed - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. - # get the filename: - fname = bindata.get(ATTR_NAME, 'noname.mso') - # decode the base64 activemime - mso_data = binascii.a2b_base64(bindata.text) - if is_mso_file(mso_data): - # decompress the zlib data stored in the MSO file, which is the OLE container: - # TODO: handle different offsets => separate function - try: - ole_data = mso_file_extract(mso_data) - self.ole_subfiles.append( - VBA_Parser(filename=fname, data=ole_data, - relaxed=self.relaxed)) - except OlevbaBaseException as exc: - if self.relaxed: - log.info('Error parsing subfile {0}: {1}' - .format(fname, exc)) - log.debug('Trace:', exc_info=True) - else: - raise SubstreamOpenError(self.filename, fname, exc) - else: - log.info('%s is not a valid MSO file' % fname) - # set type only if parsing succeeds - self.type = TYPE_Word2003_XML - except OlevbaBaseException as exc: - if self.relaxed: - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - else: - raise - except Exception as exc: 
- # TODO: differentiate exceptions for each parsing stage - # (but ET is different libs, no good exception description in API) - # found: XMLSyntaxError - log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) - log.debug('Trace:', exc_info=True) - - def open_mht(self, data): - """ - Open a MHTML file - :param data: file contents in a string or bytes - :return: nothing - """ - log.info('Opening MHTML file %s' % self.filename) - try: - # parse the MIME content - # remove any leading whitespace or newline (workaround for issue in email package) - stripped_data = data.lstrip('\r\n\t ') - # strip any junk from the beginning of the file - # (issue #31 fix by Greg C - gdigreg) - # TODO: improve keywords to avoid false positives - mime_offset = stripped_data.find('MIME') - content_offset = stripped_data.find('Content') - # if "MIME" is found, and located before "Content": - if -1 < mime_offset <= content_offset: - stripped_data = stripped_data[mime_offset:] - # else if "Content" is found, and before "MIME" - # TODO: can it work without "MIME" at all? - elif content_offset > -1: - stripped_data = stripped_data[content_offset:] - # TODO: quick and dirty fix: insert a standard line with MIME-Version header? - mhtml = email.message_from_string(stripped_data) - # find all the attached files: - for part in mhtml.walk(): - content_type = part.get_content_type() # always returns a value - fname = part.get_filename(None) # returns None if it fails - # TODO: get content-location if no filename - log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type)) - part_data = part.get_payload(decode=True) - # VBA macros are stored in a binary file named "editdata.mso". - # the data content is an OLE container for the VBA project, compressed - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. 
- # decompress the zlib data starting at offset 0x32, which is the OLE container: - # check ActiveMime header: - if isinstance(part_data, str) and is_mso_file(part_data): - log.debug('Found ActiveMime header, decompressing MSO container') - try: - ole_data = mso_file_extract(part_data) - - # TODO: check if it is actually an OLE file - # TODO: get the MSO filename from content_location? - self.ole_subfiles.append( - VBA_Parser(filename=fname, data=ole_data, - relaxed=self.relaxed)) - except OlevbaBaseException as exc: - if self.relaxed: - log.info('%s does not contain a valid OLE file (%s)' - % (fname, exc)) - log.debug('Trace:', exc_info=True) - # TODO: bug here - need to split in smaller functions/classes? - else: - raise SubstreamOpenError(self.filename, fname, exc) - else: - log.debug('type(part_data) = %s' % type(part_data)) - try: - log.debug('part_data[0:20] = %r' % part_data[0:20]) - except TypeError as err: - log.debug('part_data has no __getitem__') - # set type only if parsing succeeds - self.type = TYPE_MHTML - except OlevbaBaseException: - raise - except Exception: - log.info('Failed MIME parsing for file %r - %s' - % (self.filename, MSG_OLEVBA_ISSUES)) - log.debug('Trace:', exc_info=True) - - def open_ppt(self): - """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser - - Although self.ole_file is a valid olefile.OleFileIO, we set - self.ole_file = None in here and instead set self.ole_subfiles to the - VBA ole streams found within the main ole file. 
That makes most of the - code below treat this like an OpenXML file and only look at the - ole_subfiles (except find_vba_* which needs to explicitly check for - self.type) - """ - - log.info('Check whether OLE file is PPT') - ppt_parser.enable_logging() - try: - ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) - for vba_data in ppt.iter_vba_data(): - self.ole_subfiles.append(VBA_Parser(None, vba_data, - container='PptParser')) - log.info('File is PPT') - self.ole_file.close() # just in case - self.ole_file = None # required to make other methods look at ole_subfiles - self.type = TYPE_PPT - except Exception as exc: - if self.container == 'PptParser': - # this is a subfile of a ppt --> to be expected that is no ppt - log.debug('PPT subfile is not a PPT file') - else: - log.debug("File appears not to be a ppt file (%s)" % exc) - - - def open_text(self, data): - """ - Open a text file containing VBA or VBScript source code - :param data: file contents in a string or bytes - :return: nothing - """ - log.info('Opening text file %s' % self.filename) - # directly store the source code: - self.vba_code_all_modules = data - self.contains_macros = True - # set type only if parsing succeeds - self.type = TYPE_TEXT - - - def find_vba_projects(self): - """ - Finds all the VBA projects stored in an OLE file. - - Return None if the file is not OLE but OpenXML. - Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. - vba_root is the path of the root OLE storage containing the VBA project, - including a trailing slash unless it is the root of the OLE file. - project_path is the path of the OLE stream named "PROJECT" within the VBA project. - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. - - If this function returns an empty list for one of the supported formats - (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros. 
- - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) - for each VBA project found if OLE file - """ - log.debug('VBA_Parser.find_vba_projects') - - # if the file is not OLE but OpenXML, return None: - if self.ole_file is None and self.type != TYPE_PPT: - return None - - # if this method has already been called, return previous result: - if self.vba_projects is not None: - return self.vba_projects - - # if this is a ppt file (PowerPoint 97-2003): - # self.ole_file is None but the ole_subfiles do contain vba_projects - # (like for OpenXML files). - if self.type == TYPE_PPT: - # TODO: so far, this function is never called for PPT files, but - # if that happens, the information is lost which ole file contains - # which storage! - log.warning('Returned info is not complete for PPT types!') - self.vba_projects = [] - for subfile in self.ole_subfiles: - self.vba_projects.extend(subfile.find_vba_projects()) - return self.vba_projects - - # Find the VBA project root (different in MS Word, Excel, etc): - # - Word 97-2003: Macros - # - Excel 97-2003: _VBA_PROJECT_CUR - # - PowerPoint 97-2003: PptParser has identified ole_subfiles - # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. 
- # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word - # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word - # - Visio 2007: not supported yet (different file structure) - - # According to MS-OVBA section 2.2.1: - # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream - # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream - # - all names are case-insensitive - - def check_vba_stream(ole, vba_root, stream_path): - full_path = vba_root + stream_path - if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: - log.debug('Found %s stream: %s' % (stream_path, full_path)) - return full_path - else: - log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) - return False - - # start with an empty list: - self.vba_projects = [] - # Look for any storage containing those storage/streams: - ole = self.ole_file - for storage in ole.listdir(streams=False, storages=True): - log.debug('Checking storage %r' % storage) - # Look for a storage ending with "VBA": - if storage[-1].upper() == 'VBA': - log.debug('Found VBA storage: %s' % ('/'.join(storage))) - vba_root = '/'.join(storage[:-1]) - # Add a trailing slash to vba_root, unless it is the root of the OLE file: - # (used later to append all the child streams/storages) - if vba_root != '': - vba_root += '/' - log.debug('Checking vba_root="%s"' % vba_root) - - # Check if the VBA root storage also contains a PROJECT stream: - project_path = check_vba_stream(ole, vba_root, 'PROJECT') - if not project_path: continue - # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: - vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') - if not vba_project_path: continue - # Check if the VBA root storage also contains a VBA/dir stream: - dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') - if not dir_path: continue - # Now we are pretty sure it is a VBA project 
structure - log.debug('VBA root storage: "%s"' % vba_root) - # append the results to the list as a tuple for later use: - self.vba_projects.append((vba_root, project_path, dir_path)) - return self.vba_projects - - def detect_vba_macros(self): - """ - Detect the potential presence of VBA macros in the file, by checking - if it contains VBA projects. Both OLE and OpenXML files are supported. - - Important: for now, results are accurate only for Word, Excel and PowerPoint - - Note: this method does NOT attempt to check the actual presence or validity - of VBA macro source code, so there might be false positives. - It may also detect VBA macros in files embedded within the main file, - for example an Excel workbook with macros embedded into a Word - document without macros may be detected, without distinction. - - :return: bool, True if at least one VBA project has been found, False otherwise - """ - #TODO: return None or raise exception if format not supported - #TODO: return the number of VBA projects found instead of True/False? 
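The MS-OVBA layout test that `find_vba_projects` applies (a storage named `VBA` with a sibling `PROJECT` stream and child `_VBA_PROJECT` and `dir` streams) can be distilled into a small pure helper over an olefile-style path listing. A minimal sketch — the helper name and the synthetic paths are illustrative, not part of oletools:

```python
def find_vba_roots(paths):
    """Return VBA root prefixes per MS-OVBA 2.2.1: a storage whose last
    component is 'VBA', with a sibling PROJECT stream and child
    _VBA_PROJECT and dir streams (all names case-insensitive)."""
    existing = {'/'.join(p).upper() for p in paths}
    roots = []
    for path in paths:
        if path and path[-1].upper() == 'VBA':
            parent = '/'.join(path[:-1])
            # trailing slash unless the VBA storage sits at the OLE root
            prefix = parent + '/' if parent else ''
            required = (prefix + 'PROJECT',
                        prefix + 'VBA/_VBA_PROJECT',
                        prefix + 'VBA/dir')
            if all(r.upper() in existing for r in required):
                roots.append(prefix)
    return roots
```

With a real file, `paths` would come from `olefile.OleFileIO(filename).listdir()`; for a Word 97-2003 document this typically yields `['Macros/']`.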
- # if this method was already called, return the previous result:
- if self.contains_macros is not None:
- return self.contains_macros
- # if OpenXML/PPT, check all the OLE subfiles:
- if self.ole_file is None:
- for ole_subfile in self.ole_subfiles:
- if ole_subfile.detect_vba_macros():
- self.contains_macros = True
- return True
- # otherwise, no macro found:
- self.contains_macros = False
- return False
- # otherwise it's an OLE file, find VBA projects:
- vba_projects = self.find_vba_projects()
- if len(vba_projects) == 0:
- self.contains_macros = False
- else:
- self.contains_macros = True
- # Also look for VBA code in any stream including orphans
- # (happens in some malformed files)
- ole = self.ole_file
- for sid in xrange(len(ole.direntries)):
- # check if id is already done above:
- log.debug('Checking DirEntry #%d' % sid)
- d = ole.direntries[sid]
- if d is None:
- # this direntry is not part of the tree: either unused or an orphan
- d = ole._load_direntry(sid)
- log.debug('This DirEntry is an orphan or unused')
- if d.entry_type == olefile.STGTY_STREAM:
- # read data
- log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
- try:
- data = ole._open(d.isectStart, d.size).read()
- log.debug('Read %d bytes' % len(data))
- if len(data) > 200:
- log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
- else:
- log.debug(repr(data))
- if 'Attribut' in data:
- log.debug('Found VBA compressed code')
- self.contains_macros = True
- except IOError as exc:
- if self.relaxed:
- log.info('Error when reading OLE Stream %r' % d.name)
- log.debug('Trace:', exc_info=True)
- else:
- raise SubstreamOpenError(self.filename, d.name, exc)
- return self.contains_macros
-
- def extract_macros(self):
- """
- Extract and decompress source code for each VBA macro found in the file
-
- Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
- If the file is OLE, filename is the path of the file.
- If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros - within the zip archive, e.g. word/vbaProject.bin. - If the file is PPT, result is as for OpenXML but filename is useless - """ - log.debug('extract_macros:') - if self.ole_file is None: - # This may be either an OpenXML/PPT or a text file: - if self.type == TYPE_TEXT: - # This is a text file, yield the full code: - yield (self.filename, '', self.filename, self.vba_code_all_modules) - else: - # OpenXML/PPT: recursively yield results from each OLE subfile: - for ole_subfile in self.ole_subfiles: - for results in ole_subfile.extract_macros(): - yield results - else: - # This is an OLE file: - self.find_vba_projects() - # set of stream ids - vba_stream_ids = set() - for vba_root, project_path, dir_path in self.vba_projects: - # extract all VBA macros from that VBA root storage: - for stream_path, vba_filename, vba_code in \ - _extract_vba(self.ole_file, vba_root, project_path, - dir_path, self.relaxed): - # store direntry ids in a set: - vba_stream_ids.add(self.ole_file._find(stream_path)) - yield (self.filename, stream_path, vba_filename, vba_code) - # Also look for VBA code in any stream including orphans - # (happens in some malformed files) - ole = self.ole_file - for sid in xrange(len(ole.direntries)): - # check if id is already done above: - log.debug('Checking DirEntry #%d' % sid) - if sid in vba_stream_ids: - log.debug('Already extracted') - continue - d = ole.direntries[sid] - if d is None: - # this direntry is not part of the tree: either unused or an orphan - d = ole._load_direntry(sid) - log.debug('This DirEntry is an orphan or unused') - if d.entry_type == olefile.STGTY_STREAM: - # read data - log.debug('Reading data from stream %r' % d.name) - data = ole._open(d.isectStart, d.size).read() - for match in re.finditer(r'\x00Attribut[^e]', data, flags=re.IGNORECASE): - start = match.start() - 3 - log.debug('Found VBA compressed code at index %X' % start) - 
compressed_code = data[start:] - try: - vba_code = decompress_stream(compressed_code) - yield (self.filename, d.name, d.name, vba_code) - except Exception as exc: - # display the exception with full stack trace for debugging - log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc)) - log.debug('Traceback:', exc_info=True) - # do not raise the error, as it is unlikely to be a compressed macro stream - - def extract_all_macros(self): - """ - Extract and decompress source code for each VBA macro found in the file - by calling extract_macros(), store the results as a list of tuples - (filename, stream_path, vba_filename, vba_code) in self.modules. - See extract_macros for details. - """ - if self.modules is None: - self.modules = [] - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_macros(): - self.modules.append((subfilename, stream_path, vba_filename, vba_code)) - self.nb_macros = len(self.modules) - return self.modules - - - - def analyze_macros(self, show_decoded_strings=False, deobfuscate=False): - """ - runs extract_macros and analyze the source code of all VBA macros - found in the file. - """ - if self.detect_vba_macros(): - # if the analysis was already done, avoid doing it twice: - if self.analysis_results is not None: - return self.analysis_results - # variable to merge source code from all modules: - if self.vba_code_all_modules is None: - self.vba_code_all_modules = '' - for (_, _, _, vba_code) in self.extract_all_macros(): - #TODO: filter code? 
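The orphan-stream scan in `extract_macros` keys off the `b'\x00Attribut'` marker that survives MS-OVBA run-length compression (the literal `Attribut` of `Attribute VB_Name`, with the trailing `e` emitted as a copy token rather than a literal). The signature search can be shown in isolation — the helper name and the synthetic byte strings are illustrative:

```python
import re

# olevba's heuristic: a flag byte, then the literal 'Attribut', and never a
# literal 'e' right after it (that byte is a copy token in compressed VBA).
SIGNATURE = re.compile(b'\x00Attribut[^e]', re.IGNORECASE)

def compressed_vba_offsets(data):
    """Return candidate start offsets of compressed VBA containers: the
    signature match begins 3 bytes into the container, so back up by 3."""
    return [m.start() - 3 for m in SIGNATURE.finditer(data)]
```

Each returned offset is where `decompress_stream` would be attempted; failures are expected and simply skipped, as in the loop above.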
(each module)
- self.vba_code_all_modules += vba_code + '\n'
- for (_, _, form_string) in self.extract_form_strings():
- self.vba_code_all_modules += form_string + '\n'
- # Analyze the whole code at once:
- scanner = VBA_Scanner(self.vba_code_all_modules)
- self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
- autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary()
- self.nb_autoexec += autoexec
- self.nb_suspicious += suspicious
- self.nb_iocs += iocs
- self.nb_hexstrings += hexstrings
- self.nb_base64strings += base64strings
- self.nb_dridexstrings += dridex
- self.nb_vbastrings += vbastrings
-
- return self.analysis_results
-
-
- def reveal(self):
- # we only want printable strings:
- analysis = self.analyze_macros(show_decoded_strings=False)
- # to avoid replacing short strings contained in longer strings, we sort the analysis results
- # based on the length of the encoded string, in reverse order:
- analysis = sorted(analysis, key=lambda type_decoded_encoded: len(type_decoded_encoded[2]), reverse=True)
- # normally now self.vba_code_all_modules contains source code from all modules
- deobf_code = self.vba_code_all_modules
- for kw_type, decoded, encoded in analysis:
- if kw_type == 'VBA string':
- #print '%3d occurrences: %r => %r' % (deobf_code.count(encoded), encoded, decoded)
- # need to add double quotes around the decoded strings
- # after escaping double-quotes as double-double-quotes for VBA:
- decoded = decoded.replace('"', '""')
- deobf_code = deobf_code.replace(encoded, '"%s"' % decoded)
- return deobf_code
- #TODO: re-run the analysis several times if hex or base64 strings are revealed
-
-
- def find_vba_forms(self):
- """
- Finds all the VBA forms stored in an OLE file.
-
- Return None if the file is not OLE but OpenXML.
- Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
- vba_root is the path of the root OLE storage containing the VBA project, - including a trailing slash unless it is the root of the OLE file. - project_path is the path of the OLE stream named "PROJECT" within the VBA project. - dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. - - If this function returns an empty list for one of the supported formats - (i.e. Word, Excel, Powerpoint), then the file does not contain VBA forms. - - :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) - for each VBA project found if OLE file - """ - log.debug('VBA_Parser.find_vba_forms') - - # if the file is not OLE but OpenXML, return None: - if self.ole_file is None and self.type != TYPE_PPT: - return None - - # if this method has already been called, return previous result: - # if self.vba_projects is not None: - # return self.vba_projects - - # According to MS-OFORMS section 2.1.2 Control Streams: - # - A parent control, that is, a control that can contain embedded controls, - # MUST be persisted as a storage that contains multiple streams. - # - All parent controls MUST contain a FormControl. The FormControl - # properties are persisted to a stream (1) as specified in section 2.1.1.2. - # The name of this stream (1) MUST be "f". - # - Embedded controls that cannot themselves contain other embedded - # controls are persisted sequentially as FormEmbeddedActiveXControls - # to a stream (1) contained in the same storage as the parent control. - # The name of this stream (1) MUST be "o". - # - all names are case-insensitive - - if self.type == TYPE_PPT: - # TODO: so far, this function is never called for PPT files, but - # if that happens, the information is lost which ole file contains - # which storage! 
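The MS-OFORMS rule quoted above — a parent control persists an `f` (FormControl) stream and an `o` (embedded controls) stream in the same storage — reduces to a pair lookup over the storage listing. A sketch with a synthetic listing (the helper name is illustrative):

```python
def find_form_storages(paths):
    """Return storages that hold both an 'o' and an 'f' stream,
    per MS-OFORMS 2.1.2 (names are case-insensitive)."""
    upper = {tuple(part.upper() for part in p) for p in paths}
    forms = []
    for p in paths:
        key = tuple(part.upper() for part in p)
        if key + ('O',) in upper and key + ('F',) in upper:
            forms.append(p)
    return forms
```

This mirrors the `ole.exists`/`ole.get_type` checks in `find_vba_forms`, minus the stream-type test that a real OLE file allows.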
- ole_files = self.ole_subfiles - log.warning('Returned info is not complete for PPT types!') - else: - ole_files = [self.ole_file, ] - - # start with an empty list: - self.vba_forms = [] - - # Loop over ole streams - for ole in ole_files: - # Look for any storage containing those storage/streams: - for storage in ole.listdir(streams=False, storages=True): - log.debug('Checking storage %r' % storage) - # Look for two streams named 'o' and 'f': - o_stream = storage + ['o'] - f_stream = storage + ['f'] - log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream)) - if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \ - and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM: - form_path = '/'.join(storage) - log.debug('Found VBA Form: %r' % form_path) - self.vba_forms.append(storage) - return self.vba_forms - - def extract_form_strings(self): - """ - Extract printable strings from each VBA Form found in the file - - Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found - If the file is OLE, filename is the path of the file. - If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros - within the zip archive, e.g. word/vbaProject.bin. 
- If the file is PPT, result is as for OpenXML but filename is useless - """ - if self.ole_file is None: - # This may be either an OpenXML/PPT or a text file: - if self.type == TYPE_TEXT: - # This is a text file, return no results: - return - else: - # OpenXML/PPT: recursively yield results from each OLE subfile: - for ole_subfile in self.ole_subfiles: - for results in ole_subfile.extract_form_strings(): - yield results - else: - # This is an OLE file: - self.find_vba_forms() - ole = self.ole_file - for form_storage in self.vba_forms: - o_stream = form_storage + ['o'] - log.debug('Opening form object stream %r' % '/'.join(o_stream)) - form_data = ole.openstream(o_stream).read() - # Extract printable strings from the form object stream "o": - for m in re_printable_string.finditer(form_data): - log.debug('Printable string found in form: %r' % m.group()) - yield (self.filename, '/'.join(o_stream), m.group()) - - - def close(self): - """ - Close all the open files. This method must be called after usage, if - the application is opening many files. - """ - if self.ole_file is None: - if self.ole_subfiles is not None: - for ole_subfile in self.ole_subfiles: - ole_subfile.close() - else: - self.ole_file.close() - - - -class VBA_Parser_CLI(VBA_Parser): - """ - VBA parser and analyzer, adding methods for the command line interface - of olevba. (see VBA_Parser) - """ - - def __init__(self, *args, **kwargs): - """ - Constructor for VBA_Parser_CLI. - Calls __init__ from VBA_Parser with all arguments --> see doc there - """ - super(VBA_Parser_CLI, self).__init__(*args, **kwargs) - - - def print_analysis(self, show_decoded_strings=False, deobfuscate=False): - """ - Analyze the provided VBA code, and print the results in a table - - :param vba_code: str, VBA source code to be analyzed - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. 
- :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - :return: None - """ - # print a waiting message only if the output is not redirected to a file: - if sys.stdout.isatty(): - print('Analysis...\r', end='') - sys.stdout.flush() - results = self.analyze_macros(show_decoded_strings, deobfuscate) - if results: - t = prettytable.PrettyTable(('Type', 'Keyword', 'Description')) - t.align = 'l' - t.max_width['Type'] = 10 - t.max_width['Keyword'] = 20 - t.max_width['Description'] = 39 - for kw_type, keyword, description in results: - # handle non printable strings: - if not is_printable(keyword): - keyword = repr(keyword) - if not is_printable(description): - description = repr(description) - t.add_row((kw_type, keyword, description)) - print(t) - else: - print('No suspicious keyword or IOC found.') - - def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False): - """ - Analyze the provided VBA code, and return the results in json format - - :param vba_code: str, VBA source code to be analyzed - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - - :return: dict - """ - # print a waiting message only if the output is not redirected to a file: - if sys.stdout.isatty(): - print('Analysis...\r', end='') - sys.stdout.flush() - return [dict(type=kw_type, keyword=keyword, description=description) - for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)] - - def process_file(self, show_decoded_strings=False, - display_code=True, hide_attributes=True, - vba_code_only=False, show_deobfuscated_code=False, - deobfuscate=False): - """ - Process a single file - - :param filename: str, path and filename of file on disk, or within the container. - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. 
- :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. - :param display_code: bool, if False VBA source code is not displayed (default True) - :param global_analysis: bool, if True all modules are merged for a single analysis (default), - otherwise each module is analyzed separately (old behaviour) - :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - """ - #TODO: replace print by writing to a provided output file (sys.stdout by default) - # fix conflicting parameters: - if vba_code_only and not display_code: - display_code = True - if self.container: - display_filename = '%s in %s' % (self.filename, self.container) - else: - display_filename = self.filename - print('=' * 79) - print('FILE: %s' % display_filename) - try: - #TODO: handle olefile errors, when an OLE file is malformed - print('Type: %s'% self.type) - if self.detect_vba_macros(): - #print 'Contains VBA Macros:' - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): - if hide_attributes: - # hide attribute lines: - vba_code_filtered = filter_vba(vba_code) - else: - vba_code_filtered = vba_code - print('-' * 79) - print('VBA MACRO %s ' % vba_filename) - print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))) - if display_code: - print('- ' * 39) - # detect empty macros: - if vba_code_filtered.strip() == '': - print('(empty macro)') - else: - print(vba_code_filtered) - for (subfilename, stream_path, form_string) in self.extract_form_strings(): - print('-' * 79) - print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)) - print('- ' * 39) - print(form_string) - if not vba_code_only: - # analyse the code from all modules at once: - self.print_analysis(show_decoded_strings, deobfuscate) - if show_deobfuscated_code: - print('MACRO SOURCE CODE WITH 
DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n') - print(self.reveal()) - else: - print('No VBA macros found.') - except OlevbaBaseException: - raise - except Exception as exc: - # display the exception with full stack trace for debugging - log.info('Error processing file %s (%s)' % (self.filename, exc)) - log.debug('Traceback:', exc_info=True) - raise ProcessingError(self.filename, exc) - print('') - - - def process_file_json(self, show_decoded_strings=False, - display_code=True, hide_attributes=True, - vba_code_only=False, show_deobfuscated_code=False, - deobfuscate=False): - """ - Process a single file - - every "show" or "print" here is to be translated as "add to json" - - :param filename: str, path and filename of file on disk, or within the container. - :param data: bytes, content of the file if it is in a container, None if it is a file on disk. - :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. - :param display_code: bool, if False VBA source code is not displayed (default True) - :param global_analysis: bool, if True all modules are merged for a single analysis (default), - otherwise each module is analyzed separately (old behaviour) - :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) - :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) - """ - #TODO: fix conflicting parameters (?) 
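The per-file record that `process_file_json` assembles serializes to one flat JSON object; roughly this shape (field names follow the keys assigned in the method, all values are illustrative):

```python
import json

# Illustrative record for one analyzed file in --json output mode.
record = {
    'container': None,                 # zip path when the file came from an archive
    'file': 'invoice.docm',            # hypothetical input filename
    'type': 'OpenXML',
    'json_conversion_successful': True,
    'do_deobfuscate': False,
    'code_deobfuscated': None,         # only filled with --reveal
    'analysis': [
        {'type': 'AutoExec', 'keyword': 'AutoOpen',
         'description': 'Runs when the Word document is opened'},
    ],
    'macros': [
        {'vba_filename': 'ThisDocument.cls',
         'subfilename': 'word/vbaProject.bin',
         'ole_stream': 'VBA/ThisDocument',
         'code': 'Sub AutoOpen()\n    ...\nEnd Sub'},
    ],
}

serialized = json.dumps(record, indent=2)
```

`analysis` stays `None` when `--code` suppresses analysis, and `code` is `None` when `-a` suppresses source display, matching the branches below.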
- - if vba_code_only and not display_code: - display_code = True - - result = {} - - if self.container: - result['container'] = self.container - else: - result['container'] = None - result['file'] = self.filename - result['json_conversion_successful'] = False - result['analysis'] = None - result['code_deobfuscated'] = None - result['do_deobfuscate'] = deobfuscate - - try: - #TODO: handle olefile errors, when an OLE file is malformed - result['type'] = self.type - macros = [] - if self.detect_vba_macros(): - for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): - curr_macro = {} - if hide_attributes: - # hide attribute lines: - vba_code_filtered = filter_vba(vba_code) - else: - vba_code_filtered = vba_code - - curr_macro['vba_filename'] = vba_filename - curr_macro['subfilename'] = subfilename - curr_macro['ole_stream'] = stream_path - if display_code: - curr_macro['code'] = vba_code_filtered.strip() - else: - curr_macro['code'] = None - macros.append(curr_macro) - if not vba_code_only: - # analyse the code from all modules at once: - result['analysis'] = self.print_analysis_json(show_decoded_strings, - deobfuscate) - if show_deobfuscated_code: - result['code_deobfuscated'] = self.reveal() - result['macros'] = macros - result['json_conversion_successful'] = True - except Exception as exc: - # display the exception with full stack trace for debugging - log.info('Error processing file %s (%s)' % (self.filename, exc)) - log.debug('Traceback:', exc_info=True) - raise ProcessingError(self.filename, exc) - - return result - - - def process_file_triage(self, show_decoded_strings=False, deobfuscate=False): - """ - Process a file in triage mode, showing only summary results on one line. 
- """ - #TODO: replace print by writing to a provided output file (sys.stdout by default) - try: - #TODO: handle olefile errors, when an OLE file is malformed - if self.detect_vba_macros(): - # print a waiting message only if the output is not redirected to a file: - if sys.stdout.isatty(): - print('Analysis...\r', end='') - sys.stdout.flush() - self.analyze_macros(show_decoded_strings=show_decoded_strings, - deobfuscate=deobfuscate) - flags = TYPE2TAG[self.type] - macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-' - if self.contains_macros: macros = 'M' - if self.nb_autoexec: autoexec = 'A' - if self.nb_suspicious: suspicious = 'S' - if self.nb_iocs: iocs = 'I' - if self.nb_hexstrings: hexstrings = 'H' - if self.nb_base64strings: base64obf = 'B' - if self.nb_dridexstrings: dridex = 'D' - if self.nb_vbastrings: vba_obf = 'V' - flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, - base64obf, dridex, vba_obf) - - line = '%-12s %s' % (flags, self.filename) - print(line) - - # old table display: - # macros = autoexec = suspicious = iocs = hexstrings = 'no' - # if nb_macros: macros = 'YES:%d' % nb_macros - # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec - # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious - # if nb_iocs: iocs = 'YES:%d' % nb_iocs - # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings - # # 2nd line = info - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings) - except Exception as exc: - # display the exception with full stack trace for debugging only - log.debug('Error processing file %s (%s)' % (self.filename, exc), - exc_info=True) - raise ProcessingError(self.filename, exc) - - - # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), - # header=False, border=False) - # t.align = 'l' - # t.max_width['filename'] = 30 - # t.max_width['type'] = 10 - # t.max_width['macros'] 
= 6 - # t.max_width['autoexec'] = 6 - # t.max_width['suspicious'] = 6 - # t.max_width['ioc'] = 6 - # t.max_width['hexstrings'] = 6 - # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) - # print t - - -#=== MAIN ===================================================================== - -def main(): - """ - Main function, called when olevba is run from the command line - """ - DEFAULT_LOG_LEVEL = "warning" # Default log level - LOG_LEVELS = { - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL - } - - usage = 'usage: %prog [options] [filename2 ...]' - parser = optparse.OptionParser(usage=usage) - # parser.add_option('-o', '--outfile', dest='outfile', - # help='output file') - # parser.add_option('-c', '--csv', dest='csv', - # help='export results to a CSV file') - parser.add_option("-r", action="store_true", dest="recursive", - help='find files recursively in subdirectories.') - parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, - help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') - parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') - # output mode; could make this even simpler with add_option(type='choice') but that would make - # cmd line interface incompatible... 
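The one-line triage format produced by `process_file_triage` packs a file-type tag plus one letter per detection category. A standalone sketch — the `'OpX:'` tag string and the helper name are assumptions for illustration:

```python
def triage_flags(type_tag, counts):
    """Build the flags column: the file-type tag followed by one letter per
    category (Macros, AutoExec, Suspicious, IOCs, Hex strings, Base64,
    Dridex, VBA strings), or '-' when that category's count is zero."""
    letters = 'MASIHBDV'
    return type_tag + ''.join(
        letter if count else '-' for letter, count in zip(letters, counts))

# e.g. an OpenXML file with macros, auto-exec triggers and suspicious keywords:
line = '%-12s %s' % (triage_flags('OpX:', (1, 2, 3, 0, 0, 0, 0, 0)), 'invoice.docm')
```

The flag legend printed at the end of triage mode (`M=Macros, A=Auto-executable, ...`) decodes this column for the reader.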
- modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)') - modes.add_option("-t", '--triage', action="store_const", dest="output_mode", - const='triage', default='unspecified', - help='triage mode, display results as a summary table (default for multiple files)') - modes.add_option("-d", '--detailed', action="store_const", dest="output_mode", - const='detailed', default='unspecified', - help='detailed mode, display full results (default for single file)') - modes.add_option("-j", '--json', action="store_const", dest="output_mode", - const='json', default='unspecified', - help='json mode, detailed in json format (never default)') - parser.add_option_group(modes) - parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True, - help='display only analysis results, not the macro source code') - parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False, - help='display only VBA source code, do not analyze it') - parser.add_option("--decode", action="store_true", dest="show_decoded_strings", - help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).') - parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True, - help='display the attribute lines at the beginning of VBA source code') - parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code", - help='display the macro source code after replacing all the obfuscated strings by their decoded content.') - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%default)") - parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False, - help="Attempt to deobfuscate VBA expressions (slow)") - parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False, - help="Do not raise errors if 
opening of substream fails") - - (options, args) = parser.parse_args() - - # Print help if no arguments are passed - if len(args) == 0: - print(__doc__) - parser.print_help() - sys.exit(RETURN_WRONG_ARGS) - - # provide info about tool and its version - if options.output_mode == 'json': - # prints opening [ - print_json(script_name='olevba', version=__version__, - url='http://decalage.info/python/oletools', - type='MetaInformation') - else: - print('olevba %s - http://decalage.info/python/oletools' % __version__) - - logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') - # enable logging in the modules: - log.setLevel(logging.NOTSET) - - # Old display with number of items detected: - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr') - # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7) - - # with the option --reveal, make sure --deobf is also enabled: - if options.show_deobfuscated_code and not options.deobfuscate: - log.info('set --deobf because --reveal was set') - options.deobfuscate = True - if options.output_mode == 'triage' and options.show_deobfuscated_code: - log.info('ignoring option --reveal in triage output mode') - - # Column headers (do not know how many files there will be yet, so if no output_mode - # was specified, we will print triage for first file --> need these headers) - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %-65s' % ('Flags', 'Filename')) - print('%-12s %-65s' % ('-' * 11, '-' * 65)) - - previous_container = None - count = 0 - container = filename = data = None - vba_parser = None - return_code = RETURN_OK - try: - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, - zip_password=options.zip_password, zip_fname=options.zip_fname): - # ignore directory names stored in zip files: - if container and filename.endswith('/'): - continue - - # handle errors from xglob 
- if isinstance(data, Exception): - if isinstance(data, PathNotFoundException): - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %s - File not found' % ('?', filename)) - elif options.output_mode != 'json': - log.error('Given path %r does not exist!' % filename) - return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ - else RETURN_SEVERAL_ERRS - else: - if options.output_mode in ('triage', 'unspecified'): - print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container)) - elif options.output_mode != 'json': - log.error('Exception opening/reading %r from zip file %r: %s' - % (filename, container, data)) - return_code = RETURN_XGLOB_ERR if return_code == 0 \ - else RETURN_SEVERAL_ERRS - if options.output_mode == 'json': - print_json(file=filename, type='error', - error=type(data).__name__, message=str(data)) - continue - - try: - # Open the file - vba_parser = VBA_Parser_CLI(filename, data=data, container=container, - relaxed=options.relaxed) - - if options.output_mode == 'detailed': - # fully detailed output - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, - display_code=options.display_code, - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, - show_deobfuscated_code=options.show_deobfuscated_code, - deobfuscate=options.deobfuscate) - elif options.output_mode in ('triage', 'unspecified'): - # print container name when it changes: - if container != previous_container: - if container is not None: - print('\nFiles in %s:' % container) - previous_container = container - # summarized output for triage: - vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, - deobfuscate=options.deobfuscate) - elif options.output_mode == 'json': - print_json( - vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, - display_code=options.display_code, - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, - 
show_deobfuscated_code=options.show_deobfuscated_code,
- deobfuscate=options.deobfuscate))
- else: # (should be impossible)
- raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode))
- count += 1
-
- except (SubstreamOpenError, UnexpectedDataError) as exc:
- if options.output_mode in ('triage', 'unspecified'):
- print('%-12s %s - Error opening substream or unexpected ' \
- 'content' % ('?', filename))
- elif options.output_mode == 'json':
- print_json(file=filename, type='error',
- error=type(exc).__name__, message=str(exc))
- else:
- log.exception('Error opening substream or unexpected '
- 'content in %s' % filename)
- return_code = RETURN_OPEN_ERROR if return_code == 0 \
- else RETURN_SEVERAL_ERRS
- except FileOpenError as exc:
- if options.output_mode in ('triage', 'unspecified'):
- print('%-12s %s - File format not supported' % ('?', filename))
- elif options.output_mode == 'json':
- print_json(file=filename, type='error',
- error=type(exc).__name__, message=str(exc))
- else:
- log.exception('Failed to open %s -- probably not supported!' % filename)
- return_code = RETURN_OPEN_ERROR if return_code == 0 \
- else RETURN_SEVERAL_ERRS
- except ProcessingError as exc:
- if options.output_mode in ('triage', 'unspecified'):
- print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc))
- elif options.output_mode == 'json':
- print_json(file=filename, type='error',
- error=type(exc).__name__,
- message=str(exc.orig_exc))
- else:
- log.exception('Error processing file %s (%s)!'
- % (filename, exc.orig_exc)) - return_code = RETURN_PARSE_ERROR if return_code == 0 \ - else RETURN_SEVERAL_ERRS - finally: - if vba_parser is not None: - vba_parser.close() - - if options.output_mode == 'triage': - print('\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \ - 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ - 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n') - - if count == 1 and options.output_mode == 'unspecified': - # if options -t, -d and -j were not specified and it's a single file, print details: - vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, - display_code=options.display_code, - hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, - show_deobfuscated_code=options.show_deobfuscated_code, - deobfuscate=options.deobfuscate) - - if options.output_mode == 'json': - # print last json entry (a last one without a comma) and closing ] - print_json(type='MetaInformation', return_code=return_code, - n_processed=count, _json_is_last=True) - - except Exception as exc: - # some unexpected error, maybe some of the types caught in except clauses - # above were not sufficient. This is very bad, so log complete trace at exception level - # and do not care about output mode - log.exception('Unhandled exception in main: %s' % exc, exc_info=True) - return_code = RETURN_UNEXPECTED # even if there were others before -- this is more important - # TODO: print msg with URL to report issues (except in JSON mode) - - # done. 
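The return-code bookkeeping repeated in the `except` clauses of `main` follows one rule: keep the first error's code, and collapse any further error into `RETURN_SEVERAL_ERRS`. Isolated as a helper (the function name and constant values are illustrative, not olevba's actual codes):

```python
RETURN_OK = 0
RETURN_OPEN_ERROR = 5        # illustrative values only
RETURN_SEVERAL_ERRS = 9

def merge_return_code(current, new_error):
    """First error wins; a second error of any kind downgrades the
    process exit status to the aggregate 'several errors' code."""
    return new_error if current == RETURN_OK else RETURN_SEVERAL_ERRS
```

Note that even two occurrences of the same error collapse to the aggregate code, which is exactly what the inline `... if return_code == 0 else RETURN_SEVERAL_ERRS` expressions do.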
exit - log.debug('will exit now with code %s' % return_code) - sys.exit(return_code) - -if __name__ == '__main__': - main() - -# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness diff -Nru remnux-oletools-0.51a/oletools/ppt_parser.py remnux-oletools-0.51a/oletools/ppt_parser.py --- remnux-oletools-0.51a/oletools/ppt_parser.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/ppt_parser.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1583 +0,0 @@ -""" Parse a ppt (MS PowerPoint 97-2003) file - -Based on olefile, parse the ppt-specific info - -Code much influenced by olevba._extract_vba but much more object-oriented -(possibly slightly excessively so) - -Currently quite narrowly focused on extracting VBA from ppt files, no slides or -stuff, but built to be extended to parsing more/all of the file - -References: -* https://msdn.microsoft.com/en-us/library/dd921564%28v=office.12%29.aspx - and links there-in -""" - -# === LICENSE ================================================================= -# TODO -#------------------------------------------------------------------------------ -# TODO: -# - make stream optional in PptUnexpectedData -# - can speed-up by using less bigger struct.parse calls? -# - license -# - make buffered stream from output of iterative_decompress -# - maybe can merge the 2 decorators into 1? 
(with_opened_main_stream) -# -# CHANGELOG: -# 2016-05-04 v0.01 CH: - start parsing "Current User" stream -# 2016-07-20 v0.50 SL: - added Python 3 support -# 2016-09-13 PL: - fixed olefile import for Python 2+3 -# - fixed format strings for Python 2.6 (issue #75) - -__version__ = '0.50' - - -# --- IMPORTS ------------------------------------------------------------------ - -import sys -import logging -import struct -import traceback -import os - -try: - # absolute import when oletools is installed - import oletools.thirdparty.olefile as olefile -except: - # relative import otherwise - import thirdparty.olefile as olefile - -import zlib - - -# a global logger object used for debugging: -log = olefile.get_logger('ppt') - - -def enable_logging(): - """ - Enable logging for this module (disabled by default). - This will set the module-specific logger level to NOTSET, which - means the main application controls the actual logging level. - """ - log.setLevel(logging.NOTSET) - - -#--- CONSTANTS ---------------------------------------------------------------- - -# name of main stream -MAIN_STREAM_NAME = 'PowerPoint Document' - -# URL and message to report issues: -URL_OLEVBA_ISSUES = 'https://bitbucket.org/decalage/oletools/issues' -MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES - - -# === EXCEPTIONS ============================================================== - - -class PptUnexpectedData(Exception): - """ raise by PptParser if some field's value is not as expected """ - def __init__(self, stream, field_name, found_value, expected_value): - self.msg = \ - 'In stream "{0}" for field "{1}" found value "{2}" but expected {3}!' 
\ - .format(stream, field_name, found_value, expected_value) - super(PptUnexpectedData, self).__init__(self.msg) - - -# === HELPERS ================================================================= - -def read_1(stream): - """ read 1 byte from stream """ - return struct.unpack('= expect_upper: - is_err = True - - if is_err: - clz_name = self.__class__.__name__ - if expect_lower is None: - expect_str = '< {0:04X}'.format(expect_upper) - elif expect_upper is None: - expect_str = '> {0:04X}'.format(expect_lower) - else: - expect_str = 'within ({0:04X}, {1:04X})'.format(expect_lower, - expect_upper) - return [PptUnexpectedData(self.stream_name, clz_name + '.' + name, - '{0:04X}'.format(value), expect_str), ] - else: - return [] - - def check_rec_head(self, length=None): - """ to be called by check_validity to check the self.rec_head - - uses self.RECORD_... constants, (not quite that constant for DummyType) - """ - - errs = [] - errs.extend(self.check_value('rec_head.recVer', self.rec_head.rec_ver, - self.RECORD_VERSION)) - errs.extend(self.check_value('rec_head.recInstance', - self.rec_head.rec_instance, - self.RECORD_INSTANCE)) - if self.RECORD_TYPE is None: - raise NotImplementedError('RECORD_TYPE not specified!') - errs.extend(self.check_value('rec_head.recType', - self.rec_head.rec_type, - self.RECORD_TYPE)) - if length is not None: - errs.extend(self.check_value('rec_head.recLen', - self.rec_head.rec_len, length)) - return errs - - @classmethod - def generate_pattern(clz, rec_len=None): - """ call RecordHeader.generate with values for this type """ - return RecordHeader.generate(clz.RECORD_TYPE, rec_len, - clz.RECORD_INSTANCE, clz.RECORD_VERSION) - - -class CurrentUserAtom(PptType): - """ An atom record that specifies information about the last user to modify - the file and where the most recent user edit is located. This is the only - record in the Current User Stream (section 2.1.1). 
- - https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx - """ - - # allowed values for header_token - HEADER_TOKEN_ENCRYPT = 0xF3D1C4DF - HEADER_TOKEN_NOCRYPT = 0xE391C05F - - # allowed values for rel_version - REL_VERSION_CAN_USE = 0x00000008 - REL_VERSION_NO_USE = 0x00000009 - - # required values - RECORD_TYPE = 0x0FF6 - SIZE = 0x14 - DOC_FILE_VERSION = 0x03F4 - MAJOR_VERSION = 0x03 - MINOR_VERSION = 0x00 - - def __init__(self): - super(CurrentUserAtom, self).__init__(stream_name='Current User') - self.rec_head = None - self.size = None - self.header_token = None - self.offset_to_current_edit = None - self.len_user_name = None - self.doc_file_version = None - self.major_version = None - self.minor_version = None - self.ansi_user_name = None - self.unicode_user_name = None - self.rel_version = None - - def is_encrypted(self): - return self.header_token == self.HEADER_TOKEN_ENCRYPT - - @classmethod - def extract_from(clz, stream): - """ create instance with info from stream """ - - obj = clz() - - # parse record header - obj.rec_head = RecordHeader.extract_from(stream) - - obj.size, = struct.unpack('= offset: - errs.append(PptUnexpectedData( - 'PowerPoint Document', 'UserEditAtom.offsetLastEdit', - self.offset_last_edit, '< {0}'.format(offset))) - if self.offset_persist_directory >= offset or \ - self.offset_persist_directory <= self.offset_last_edit: - errs.append(PptUnexpectedData( - 'PowerPoint Document', - 'UserEditAtom.offsetPersistDirectory', - self.offset_last_edit, - 'in ({0}, {1})'.format(self.offset_last_edit, offset))) - errs.extend(self.check_value('docPersistIdRef', - self.doc_persist_id_ref, 1)) - return errs - - # TODO: offer to check persist_id_seed given PersistDirectoryAtom) - - -class DummyType(PptType): - """ a type that is found in ppt documents we are not interested in - - instead of parsing many uninteresting types, we just read their - RecordHeader and set the RECORD_... 
values on an instance- (instead of - class-) level - - used to skip over uninteresting types in e.g. DocumentContainer - """ - - def __init__(self, type_name, record_type, rec_ver=0, rec_instance=0, - rec_len=None): - super(DummyType, self).__init__() - self.type_name = type_name - self.RECORD_TYPE = record_type - self.RECORD_VERSION = rec_ver - self.RECORD_INSTANCE = rec_instance - self.record_length = rec_len - - def extract_from(self, stream): - """ extract record header and just skip as many bytes as header says - - Since this requires RECORD_... values set in constructor, this is NOT - a classmethod like all the other extract_from! - - Otherwise this tries to be compatible with other extract_from methods - (e.g. returns self) - """ - self.read_rec_head(stream) - log.debug('skipping over {0} Byte for type {1}' - .format(self.rec_head.rec_len, self.type_name)) - log.debug('start at pos {0}'.format(stream.tell())) - stream.seek(self.rec_head.rec_len, os.SEEK_CUR) - log.debug('now at pos {0}'.format(stream.tell())) - return self - - def check_validity(self): - return self.check_rec_head(self.record_length) - - -class PersistDirectoryAtom(PptType): - """ one part of a persist object directory with unique persist object id - - contains PersistDirectoryEntry objects - - https://msdn.microsoft.com/en-us/library/dd952680%28v=office.12%29.aspx - """ - - RECORD_TYPE = 0x1772 - - def __init__(self): - super(PersistDirectoryAtom, self).__init__() - self.rg_persist_dir_entry = None # actually, this will be an array - self.stream_offset = None - - @classmethod - def extract_from(clz, stream): - """ create and return object with data from given stream """ - - log.debug("Extracting a PersistDirectoryAtom from stream") - obj = clz() - - # remember own offset for checking validity - obj.stream_offset = stream.tell() - - # parse record header - obj.read_rec_head(stream) - - # read directory entries from list until reach size for this object - curr_pos = stream.tell() - stop_pos = 
curr_pos + obj.rec_head.rec_len - log.debug('start reading at pos {0}, read until {1}' - .format(curr_pos, stop_pos)) - obj.rg_persist_dir_entry = [] - - while curr_pos < stop_pos: - new_entry = PersistDirectoryEntry.extract_from(stream) - obj.rg_persist_dir_entry.append(new_entry) - curr_pos = stream.tell() - log.debug('at pos {0}'.format(curr_pos)) - return obj - - def check_validity(self, user_edit_last_offset=None): - errs = self.check_rec_head() - for entry in self.rg_persist_dir_entry: - errs.extend(entry.check_validity(user_edit_last_offset, - self.stream_offset)) - return errs - - -class PersistDirectoryEntry(object): - """ an entry contained in a PersistDirectoryAtom.rg_persist_dir_entry - - A structure that specifies a compressed table of sequential persist object - identifiers and stream offsets to associated persist objects. - - NOT a subclass of PptType because has no RecordHeader - - https://msdn.microsoft.com/en-us/library/dd947347%28v=office.12%29.aspx - """ - - def __init__(self): - self.persist_id = None - self.c_persist = None - self.rg_persist_offset = None - - @classmethod - def extract_from(clz, stream): - # take a 4-byte (=32bit) number, divide into 20bit and 12 bit) - log.debug("Extracting a PersistDirectoryEntry from stream") - obj = clz() - - # persistId (20 bits): An unsigned integer that specifies a starting - # persist object identifier. It MUST be less than or equal to 0xFFFFE. - # The first entry in rgPersistOffset is associated with persistId. The - # next entry, if present, is associated with persistId plus 1. Each - # entry in rgPersistOffset is associated with a persist object - # identifier in this manner, with the final entry associated with - # persistId + cPersist - 1. - - # cPersist (12 bits): An unsigned integer that specifies the count of - # items in rgPersistOffset. It MUST be greater than or equal to 0x001. 
- temp, = struct.unpack(' id is {1}, reading {2} offsets' - .format(temp, obj.persist_id, obj.c_persist)) - - # rgPersistOffset (variable): An array of PersistOffsetEntry (section - # 2.3.6) that specifies stream offsets to persist objects. The count of - # items in the array is specified by cPersist. The value of each item - # MUST be greater than or equal to offsetLastEdit in the corresponding - # user edit and MUST be less than the offset, in bytes, of the - # corresponding persist object directory. - # PersistOffsetEntry: An unsigned 4-byte integer that specifies an - # offset, in bytes, from the beginning of the PowerPoint Document - # Stream (section 2.1.2) to a persist object. - obj.rg_persist_offset = [struct.unpack(' 0xFFFFE: # (--> == 0xFFFFF since 20bit) - errs.append(PptUnexpectedData( - MAIN_STREAM_NAME, 'PersistDirectoryEntry.persist_id', - self.persist_id, '< 0xFFFFE (dec: {0})'.format(0xFFFFE))) - if self.c_persist == 0: - errs.append(PptUnexpectedData( - MAIN_STREAM_NAME, 'PersistDirectoryEntry.c_persist', - self.c_persist, '> 0')) - if user_edit_last_offset is not None \ - and min(self.rg_persist_offset) < user_edit_last_offset: - errs.append(PptUnexpectedData( - MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset', - min(self.rg_persist_offset), - '> UserEdit.offsetLastEdit ({0})' - .format(user_edit_last_offset))) - if persist_obj_dir_offset is not None \ - and max(self.rg_persist_offset) > persist_obj_dir_offset: - errs.append(PptUnexpectedData( - MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset', - max(self.rg_persist_offset), - '> PersistObjectDirectory offset ({0})' - .format(persist_obj_dir_offset))) - return errs - - -class DocInfoListSubContainerOrAtom(PptType): - """ one of various types found in a DocInfoListContainer - - https://msdn.microsoft.com/en-us/library/dd921705%28v=office.12%29.aspx - - actual type of this object is defined by the recVersion field in its Record - Head - - Similar to DummyType, RECORD_TYPE 
varies from instance to instance for this - type - """ - - # RECORD_TYPE varies, is specified only in extract_from - VALID_RECORD_TYPES = [0x1388, # self.RECORD_TYPE_PROG_TAGS, \ - 0x0414, # self.RECORD_TYPE_NORMAL_VIEW_SET_INFO_9, \ - 0x0413, # self.RECORD_TYPE_NOTES_TEXT_VIEW_INFO_9, \ - 0x0407, # self.RECORD_TYPE_OUTLINE_VIEW_INFO, \ - 0x03FA, # self.RECORD_TYPE_SLIDE_VIEW_INFO, \ - 0x0408] # self.RECORD_TYPE_SORTER_VIEW_INFO - - def __init__(self): - super(DocInfoListSubContainerOrAtom, self).__init__() - - @classmethod - def extract_from(clz, stream): - """ build instance with info read from stream """ - - log.debug('Parsing DocInfoListSubContainerOrAtom from stream') - - obj = clz() - obj.read_rec_head(stream) - if obj.rec_head.rec_type == VBAInfoContainer.RECORD_TYPE: - obj = VBAInfoContainer.extract_from(stream, obj.rec_head) - else: - log.debug('skipping over {0} Byte in DocInfoListSubContainerOrAtom' - .format(obj.rec_head.rec_len)) - log.debug('start at pos {0}'.format(stream.tell())) - stream.seek(obj.rec_head.rec_len, os.SEEK_CUR) - log.debug('now at pos {0}'.format(stream.tell())) - return obj - - def check_validity(self): - """ can be any of multiple types """ - return self.check_value('rh.recType', self.rec_head.rec_type, - self.VALID_RECORD_TYPES) - - -class DocInfoListContainer(PptType): - """ information about the document and document display settings - - https://msdn.microsoft.com/en-us/library/dd926767%28v=office.12%29.aspx - """ - - RECORD_VERSION = 0xF - RECORD_TYPE = 0x07D0 - - def __init__(self): - super(DocInfoListContainer, self).__init__() - self.rg_child_rec = None - - @classmethod - def extract_from(clz, stream): - """ build instance with info read from stream """ - - log.debug('Parsing DocInfoListContainer from stream') - obj = clz() - obj.read_rec_head(stream) - - # rgChildRec (variable): An array of DocInfoListSubContainerOrAtom - # records (section 2.4.5) that specifies information about the document - # or how the document is displayed. 
The size, in bytes, of the array is - # specified by rh.recLen - curr_pos = stream.tell() - end_pos = curr_pos + obj.rec_head.rec_len - log.debug('start reading at pos {0}, will read until {1}' - .format(curr_pos, end_pos)) - obj.rg_child_rec = [] - - while curr_pos < end_pos: - new_obj = DocInfoListSubContainerOrAtom().extract_from(stream) - obj.rg_child_rec.append(new_obj) - curr_pos = stream.tell() - log.debug('now at pos {0}'.format(curr_pos)) - - log.debug('reached end pos {0} ({1}). stop reading DocInfoListContainer' - .format(end_pos, curr_pos)) - - return obj - - def check_validity(self): - errs = self.check_rec_head() - for obj in self.rg_child_rec: - errs.extend(obj.check_validity()) - return errs - - -class DocumentContainer(PptType): - """ a DocumentContainer record - - https://msdn.microsoft.com/en-us/library/dd947357%28v=office.12%29.aspx - """ - - RECORD_TYPE = 0x03E8 - - def __init__(self): - super(DocumentContainer, self).__init__() - self.document_atom = None - self.ex_obj_list = None - self.document_text_info = None - self.sound_collection = None - self.drawing_group = None - self.master_list = None - self.doc_info_list = None - self.slide_hf = None - self.notes_hf = None - self.slide_list = None - self.notes_list = None - self.slide_show_doc_info = None - self.named_shows = None - self.summary = None - self.doc_routing_slip = None - self.print_options = None - self.rt_custom_table_styles_1 = None - self.end_document = None - self.rt_custom_table_styles_2 = None - - @classmethod - def extract_from(clz, stream): - """ create object with values from given stream - - stream is assumed to be positioned correctly - - this container contains lots of data we are not interested in. 
- """ - - log.debug('Parsing DocumentContainer from stream') - obj = clz() - - # parse record header - obj.read_rec_head(stream) - log.info('validity: {0} errs'.format(len(obj.check_rec_head()))) - - # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that - # specifies size information for presentation slides and notes slides. - obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1, - rec_len=0x28).extract_from(stream) - log.info('validity: {0} errs' - .format(len(obj.document_atom.check_validity()))) - - # exObjList (variable): An optional ExObjListContainer record (section - # 2.10.1) that specifies the list of external objects in the document. - obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\ - .extract_from(stream) - log.info('validity: {0} errs' - .format(len(obj.ex_obj_list.check_validity()))) - - # documentTextInfo (variable): A DocumentTextInfoContainer record - # (section 2.9.1) that specifies the default text styles for the - # document. - obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2, - rec_ver=0xF).extract_from(stream) - log.info('validity: {0} errs' - .format(len(obj.document_text_info.check_validity()))) - - # soundCollection (variable): An optional SoundCollectionContainer - # record (section 2.4.16.1) that specifies the list of sounds in the - # file. - obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4, - rec_ver=0xF, rec_instance=0x005)\ - .extract_from(stream) - log.info('validity: {0} errs' - .format(len(obj.sound_collection.check_validity()))) - - # drawingGroup (variable): A DrawingGroupContainer record (section - # 2.4.3) that specifies drawing information for the document. 
- obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B, - rec_ver=0xF).extract_from(stream) - log.info('validity: {0} errs' - .format(len(obj.drawing_group.check_validity()))) - - # masterList (variable): A MasterListWithTextContainer record (section - # 2.4.14.1) that specifies the list of main master slides and title - # master slides. - obj.master_list = DummyType('MasterListWithContainer', 0x0FF0, - rec_ver=0xF).extract_from(stream) - log.info('validity: {0} errs' - .format(len(obj.master_list.check_validity()))) - - # docInfoList (variable): An optional DocInfoListContainer record - # (section 2.4.4) that specifies additional document information. - # this is the variable we are interested in! - obj.doc_info_list = DocInfoListContainer.extract_from(stream) - - # slideHF (variable): An optional SlideHeadersFootersContainer record - # (section 2.4.15.1) that specifies the default header and footer - # information for presentation slides. - #obj.slide_hf = None - - # notesHF (variable): An optional NotesHeadersFootersContainer record - # (section 2.4.15.6) that specifies the default header and footer - # information for notes slides. - #obj.notes_hf = None - - # slideList (variable): An optional SlideListWithTextContainer record - # (section 2.4.14.3) that specifies the list of presentation slides. - #obj.slide_list = None - - # notesList (variable): An optional NotesListWithTextContainer record - # (section 2.4.14.6) that specifies the list of notes slides. - #obj.notes_list = None - - # slideShowDocInfoAtom (88 bytes): An optional SlideShowDocInfoAtom - # record (section 2.6.1) that specifies slide show information for the - # document. - #obj.slide_show_doc_info = None - - # namedShows (variable): An optional NamedShowsContainer record - # (section 2.6.2) that specifies named shows in the document. - #obj.named_shows = None - - # summary (variable): An optional SummaryContainer record (section - # 2.4.22.3) that specifies bookmarks for the document. 
- #obj.summary = None - - # docRoutingSlipAtom (variable): An optional DocRoutingSlipAtom record - # (section 2.11.1) that specifies document routing information. - #obj.doc_routing_slip = None - - # printOptionsAtom (13 bytes): An optional PrintOptionsAtom record - # (section 2.4.12) that specifies default print options. - #obj.print_options = None - - # rtCustomTableStylesAtom1 (variable): An optional - # RoundTripCustomTableStyles12Atom record (section 2.11.13) that - # specifies round-trip information for custom table styles. - #obj.rt_custom_table_styles_1 = None - - # endDocumentAtom (8 bytes): An EndDocumentAtom record (section 2.4.13) - # that specifies the end of the information for the document. - #obj.end_document = None - - # rtCustomTableStylesAtom2 (variable): An optional - # RoundTripCustomTableStyles12Atom record that specifies round-trip - # information for custom table styles. It MUST NOT exist if - # rtCustomTableStylesAtom1 exists. - #obj.rt_custom_table_styles_2 = None - - return obj - - - def check_validity(self): - """ check all values in object for valid values """ - errs = self.check_rec_head() - errs.extend(self.document_atom.check_validity()) - errs.extend(self.ex_obj_list.check_validity()) - errs.extend(self.document_text_info.check_validity()) - errs.extend(self.sound_collection.check_validity()) - errs.extend(self.drawing_group.check_validity()) - errs.extend(self.master_list.check_validity()) - errs.extend(self.doc_info_list.check_validity()) - return errs - - -class VBAInfoContainer(PptType): - """ A container record that specifies VBA information for the document. 
- - https://msdn.microsoft.com/en-us/library/dd952168%28v=office.12%29.aspx - """ - - RECORD_TYPE = 0x03FF - RECORD_VERSION = 0xF - RECORD_INSTANCE = 0x001 - RECORD_LENGTH = 0x14 - - def __init__(self): - super(VBAInfoContainer, self).__init__() - self.vba_info_atom = None - - @classmethod - def extract_from(clz, stream, rec_head=None): - """ since can determine this type only after reading header, it is arg - """ - log.debug('parsing VBAInfoContainer') - obj = clz() - if rec_head is None: - obj.read_rec_head(stream) - else: - log.debug('skip parsing of RecordHeader') - obj.rec_head = rec_head - obj.vba_info_atom = VBAInfoAtom.extract_from(stream) - return obj - - def check_validity(self): - errs = self.check_rec_head(length=self.RECORD_LENGTH) - errs.extend(self.vba_info_atom.check_validity()) - return errs - - -class VBAInfoAtom(PptType): - """ An atom record that specifies a reference to the VBA project storage. - - https://msdn.microsoft.com/en-us/library/dd948874%28v=office.12%29.aspx - """ - - RECORD_TYPE = 0x0400 - RECORD_VERSION = 0x2 - RECORD_LENGTH = 0x0C - - def __init__(self): - super(VBAInfoAtom, self).__init__() - self.persist_id_ref = None - self.f_has_macros = None - self.version = None - - @classmethod - def extract_from(clz, stream): - log.debug('parsing VBAInfoAtom') - obj = clz() - obj.read_rec_head(stream) - - # persistIdRef (4 bytes): A PersistIdRef (section 2.2.21) that - # specifies the value to look up in the persist object directory to - # find the offset of a VbaProjectStg record (section 2.10.40). - obj.persist_id_ref = read_4(stream) - - # fHasMacros (4 bytes): An unsigned integer that specifies whether the - # VBA project storage contains data. It MUST be 0 (empty vba storage) - # or 1 (vba storage contains data) - obj.f_has_macros = read_4(stream) - - # version (4 bytes): An unsigned integer that specifies the VBA runtime - # version that generated the VBA project storage. It MUST be - # 0x00000002. 
- obj.version = read_4(stream) - - return obj - - def check_validity(self): - - errs = self.check_rec_head(length=self.RECORD_LENGTH) - - # must be 0 or 1: - errs.extend(self.check_range('fHasMacros', self.f_has_macros, None, 2)) - errs.extend(self.check_value('version', self.version, 2)) - return errs - - -class ExternalObjectStorage(PptType): - """ storage for compressed/uncompressed OLE/VBA/ActiveX control data - - Matches types ExOleObjStgCompressedAtom, ExOleObjStgUncompressedAtom, - VbaProjectStgCompressedAtom, VbaProjectStgUncompressedAtom, - ExControlStgUncompressedAtom, ExControlStgCompressedAtom - - Difference between compressed and uncompressed: RecordHeader.rec_instance - is 0 or 1, first variable after RecordHeader is decompressed_size - - Data is not read at first, only its offset in the stream and size is saved - - e.g. - https://msdn.microsoft.com/en-us/library/dd952169%28v=office.12%29.aspx - """ - - RECORD_TYPE = 0x1011 - RECORD_INSTANCE_COMPRESSED = 1 - RECORD_INSTANCE_UNCOMPRESSED = 0 - - def __init__(self, is_compressed=None): - super(ExternalObjectStorage, self).__init__() - if is_compressed is None: - self.RECORD_INSTANCE = None # otherwise defaults to 0 - elif is_compressed: - self.RECORD_INSTANCE = self.RECORD_INSTANCE_COMPRESSED - self.is_compressed = True - else: - self.RECORD_INSTANCE = self.RECORD_INSTANCE_UNCOMPRESSED - self.is_compressed = False - self.uncompressed_size = None - self.data_offset = None - self.data_size = None - - def extract_from(self, stream): - """ not a classmethod because of is_compressed attrib - - see also: DummyType - """ - log.debug('Parsing ExternalObjectStorage (compressed={0}) from stream' - .format(self.is_compressed)) - self.read_rec_head(stream) - self.data_size = self.rec_head.rec_len - if self.is_compressed: - self.uncompressed_size = read_4(stream) - self.data_size -= 4 - self.data_offset = stream.tell() - - def check_validity(self): - return self.check_rec_head() - - -class 
ExternalObjectStorageUncompressed(ExternalObjectStorage): - """ subclass of ExternalObjectStorage for uncompressed objects """ - RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_UNCOMPRESSED - - def __init__(self): - super(ExternalObjectStorageUncompressed, self).__init__(False) - - @classmethod - def extract_from(clz, stream): - """ note the usage of super here: call instance method of super class! - """ - obj = clz() - super(ExternalObjectStorageUncompressed, obj).extract_from(stream) - return obj - - -class ExternalObjectStorageCompressed(ExternalObjectStorage): - """ subclass of ExternalObjectStorage for compressed objects """ - RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_COMPRESSED - - def __init__(self): - super(ExternalObjectStorageCompressed, self).__init__(True) - - @classmethod - def extract_from(clz, stream): - """ note the usage of super here: call instance method of super class! - """ - obj = clz() - super(ExternalObjectStorageCompressed, obj).extract_from(stream) - return obj - - -# === PptParser =============================================================== - -def with_opened_main_stream(func): - """ a decorator that can open and close the default stream for func - - to be applied only to functions in PptParser that read from default stream - (:py:data:`MAIN_STREAM_NAME`) - - Decorated functions need to accept args (self, stream, ...) 
- """ - - def wrapped(self, *args, **kwargs): - # remember who opened the stream so that function also closes it - stream_opened_by_me = False - try: - # open stream if required - if self._open_main_stream is None: - log.debug('opening stream {0!r} for {1}' - .format(MAIN_STREAM_NAME, func.__name__)) - self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) - stream_opened_by_me = True - - # run wrapped function - return func(self, self._open_main_stream, *args, **kwargs) - - # error handling - except Exception: - if self.fast_fail: - raise - else: - self._log_exception() - finally: - # ensure stream is closed by the one who opened it (even if error) - if stream_opened_by_me: - log.debug('closing stream {0!r} after {1}' - .format(MAIN_STREAM_NAME, func.__name__)) - self._open_main_stream.close() - self._open_main_stream = None - return wrapped - - -def generator_with_opened_main_stream(func): - """ same as with_opened_main_stream but with yield instead of return """ - - def wrapped(self, *args, **kwargs): - # remember who opened the stream so that function also closes it - stream_opened_by_me = False - try: - # open stream if required - if self._open_main_stream is None: - log.debug('opening stream {0!r} for {1}' - .format(MAIN_STREAM_NAME, func.__name__)) - self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) - stream_opened_by_me = True - - # run actual function - for result in func(self, self._open_main_stream, *args, **kwargs): - yield result - - # error handling - except Exception: - if self.fast_fail: - raise - else: - self._log_exception() - finally: - # ensure stream is closed by the one who opened it (even if error) - if stream_opened_by_me: - log.debug('closing stream {0!r} after {1}' - .format(MAIN_STREAM_NAME, func.__name__)) - self._open_main_stream.close() - self._open_main_stream = None - return wrapped - - -class PptParser(object): - """ Parser for PowerPoint 97-2003 specific data structures - - requires an OleFileIO - """ - - def 
__init__(self, ole, fast_fail=False): - """ constructor - - :param ole: OleFileIO or anything that OleFileIO constructor accepts - :param bool fast_fail: if True, all unexpected data will raise a - PptUnexpectedData; if False will only log error - """ - if isinstance(ole, olefile.OleFileIO): - log.debug('using open OleFileIO') - self.ole = ole - else: - log.debug('Opening file ' + ole) - self.ole = olefile.OleFileIO(ole) - - self.fast_fail = fast_fail - - self.current_user_atom = None - self.newest_user_edit = None - self.document_persist_obj = None - self.persist_object_directory = None - - # basic compatibility check: root directory structure is - # [['\x05DocumentSummaryInformation'], - # ['\x05SummaryInformation'], - # ['Current User'], - # ['PowerPoint Document']] - root_streams = self.ole.listdir() - #for stream in root_streams: - # log.debug('found root stream {0!r}'.format(stream)) - if any(len(stream) != 1 for stream in root_streams): - self._fail('root', 'listdir', root_streams, 'len = 1') - root_streams = [stream[0].lower() for stream in root_streams] - if not 'current user' in root_streams: - self._fail('root', 'listdir', root_streams, 'Current User') - if not MAIN_STREAM_NAME.lower() in root_streams: - self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME) - - self._open_main_stream = None - - def _log_exception(self, msg=None): - """ log an exception instead of raising it - - call in one of 2 ways: - try: - if fail(): - self._log_exception('this is the message') - except: - self._log_exception() # only possible in except clause - """ - if msg is not None: - stack = traceback.extract_stack()[:-1] - else: - _, exc, trace = sys.exc_info() - stack = traceback.extract_tb(trace) - msg = str(exc) - log.error(msg) - - for i_entry, entry in enumerate(traceback.format_list(stack)): - for line in entry.splitlines(): - log.debug('trace {0}: {1}'.format(i_entry, line)) - - def _fail(self, *args): - """ depending on self.fast_fail raise PptUnexpectedData or 
just log err - - args as for PptUnexpectedData - """ - if self.fast_fail: - raise PptUnexpectedData(*args) - else: - self._log_exception(PptUnexpectedData(*args).msg) - - def parse_current_user(self): - """ parse the CurrentUserAtom record from stream 'Current User' - - Structure described in - https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx - """ - - if self.current_user_atom is not None: - log.warning('re-reading and overwriting ' - 'previously read current_user_atom') - - log.debug('parsing "Current User"') - - stream = None - try: - log.debug('opening stream "Current User"') - stream = self.ole.openstream('Current User') - self.current_user_atom = CurrentUserAtom.extract_from(stream) - except Exception: - if self.fast_fail: - raise - else: - self._log_exception() - finally: - if stream is not None: - log.debug('closing stream "Current User"') - stream.close() - - @with_opened_main_stream - def parse_persist_object_directory(self, stream): - """ Part 1: Construct the persist object directory """ - - if self.persist_object_directory is not None: - log.warning('re-reading and overwriting ' - 'previously read persist_object_directory') - - # Step 1: Read the CurrentUserAtom record (section 2.3.2) from the - # Current User Stream (section 2.1.1). All seek operations in the steps - # that follow this step are in the PowerPoint Document Stream. - if self.current_user_atom is None: - self.parse_current_user() - - offset = self.current_user_atom.offset_to_current_edit - is_encrypted = self.current_user_atom.is_encrypted() - self.persist_object_directory = {} - self.newest_user_edit = None - - # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. - while offset != 0: - - # Step 2: Seek, in the PowerPoint Document Stream, to the - # offset specified by the offsetToCurrentEdit field of the - # CurrentUserAtom record identified in step 1. - stream.seek(offset, os.SEEK_SET) - - # Step 3: Read the UserEditAtom record at the current offset. 
- # Let this record be a live record. - user_edit = UserEditAtom.extract_from(stream, is_encrypted) - if self.newest_user_edit is None: - self.newest_user_edit = user_edit - - log.debug('checking validity') - errs = user_edit.check_validity() - if errs: - log.warning('check_validity found {0} issues' - .format(len(errs))) - for err in errs: - log.warning('UserEditAtom.check_validity: {0}'.format(err)) - if errs and self.fast_fail: - raise errs[0] - - # Step 4: Seek to the offset specified by the - # offsetPersistDirectory field of the UserEditAtom record - # identified in step 3. - log.debug('seeking to pos {0}' - .format(user_edit.offset_persist_directory)) - stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) - - # Step 5: Read the PersistDirectoryAtom record at the current - # offset. Let this record be a live record. - persist_dir_atom = PersistDirectoryAtom.extract_from(stream) - - log.debug('checking validity') - errs = persist_dir_atom.check_validity(offset) - if errs: - log.warning('check_validity found {0} issues' - .format(len(errs))) - for err in errs: - log.warning('PersistDirectoryAtom.check_validity: {0}' - .format(err)) - if errs and self.fast_fail: - raise errs[0] - - - # Construct the complete persist object directory for this file - # as follows: - # - For each PersistDirectoryAtom record previously identified - # in step 5, add the persist object identifier and persist - # object stream offset pairs to the persist object directory - # starting with the PersistDirectoryAtom record last - # identified, that is, the one closest to the beginning of the - # stream. - # - Continue adding these pairs to the persist object directory - # for each PersistDirectoryAtom record in the reverse order - # that they were identified in step 5; that is, the pairs from - # the PersistDirectoryAtom record closest to the end of the - # stream are added last. 
- # - When adding a new pair to the persist object directory, if - # the persist object identifier already exists in the persist - # object directory, the persist object stream offset from the - # new pair replaces the existing persist object stream offset - # for that persist object identifier. - for entry in persist_dir_atom.rg_persist_dir_entry: - last_id = entry.persist_id+len(entry.rg_persist_offset)-1 - log.debug('for persist IDs {0}-{1}, save offsets {2}' - .format(entry.persist_id, last_id, - entry.rg_persist_offset)) - for count, offset in enumerate(entry.rg_persist_offset): - self.persist_object_directory[entry.persist_id+count] \ - = offset - - # check for more - # Step 6: Seek to the offset specified by the offsetLastEdit - # field in the UserEditAtom record identified in step 3. - offset = user_edit.offset_last_edit - - @with_opened_main_stream - def parse_document_persist_object(self, stream): - """ Part 2: Identify the document persist object """ - if self.document_persist_obj is not None: - log.warning('re-reading and overwriting ' - 'previously read document_persist_object') - - # Step 1: Read the docPersistIdRef field of the UserEditAtom record - # first identified in step 3 of Part 1, that is, the UserEditAtom - # record closest to the end of the stream. - if self.persist_object_directory is None: - self.parse_persist_object_directory() - - # Step 2: Lookup the value of the docPersistIdRef field in the persist - # object directory constructed in step 8 of Part 1 to find the stream - # offset of a persist object. - newest_ref = self.newest_user_edit.doc_persist_id_ref - offset = self.persist_object_directory[newest_ref] - log.debug('newest user edit ID is {0}, offset is {1}' - .format(newest_ref, offset)) - - # Step 3: Seek to the stream offset specified in step 2. - log.debug('seek to {0}'.format(offset)) - stream.seek(offset, os.SEEK_SET) - - # Step 4: Read the DocumentContainer record at the current offset. - # Let this record be a live record. 
- self.document_persist_obj = DocumentContainer.extract_from(stream) - - log.debug('checking validity') - errs = self.document_persist_obj.check_validity() - if errs: - log.warning('check_validity found {0} issues'.format(len(errs))) - for err in errs: - log.warning('check_validity(document_persist_obj): {0}' - .format(err)) - if errs and self.fast_fail: - raise errs[0] - - #-------------------------------------------------------------------------- - # 2nd attempt: do not parse whole structure but search through stream and - # yield results as they become available - # Keep in mind that after every yield the stream position may be anything! - - @generator_with_opened_main_stream - def search_pattern(self, stream, pattern): - """ search for pattern in stream, return indices """ - - BUF_SIZE = 1024 - - pattern_len = len(pattern) - log.debug('pattern length is {0}'.format(pattern_len)) - if pattern_len > BUF_SIZE: - raise ValueError('need buf > pattern to search!') - - n_reads = 0 - while True: - start_pos = stream.tell() - n_reads += 1 - #log.debug('read {0} starting from {1}' - # .format(BUF_SIZE, start_pos)) - buf = stream.read(BUF_SIZE) - idx = buf.find(pattern) - while idx != -1: - log.debug('found pattern at index {0}'.format(start_pos+idx)) - yield start_pos + idx - idx = buf.find(pattern, idx+1) - - if len(buf) == BUF_SIZE: - # move back a bit to avoid splitting of pattern through buf - stream.seek(start_pos + BUF_SIZE - pattern_len, os.SEEK_SET) - else: - log.debug('reached end of buf (read {0}<{1}) after {2} reads' - .format(len(buf), BUF_SIZE, n_reads)) - break - - @generator_with_opened_main_stream - def search_vba_info(self, stream): - """ search through stream for VBAInfoContainer, alternative to parse... - - quick-and-dirty: do not parse everything, just look for right bytes - - "quick" here means quick to program. Runtime now is linear is document - size (--> for big documents the other method might be faster) - - .. 
seealso:: search_vba_storage - """ - - logging.debug('looking for VBA info containers') - - pattern = VBAInfoContainer.generate_pattern( - rec_len=VBAInfoContainer.RECORD_LENGTH) \ - + VBAInfoAtom.generate_pattern( - rec_len=VBAInfoAtom.RECORD_LENGTH) - - # try parse - for idx in self.search_pattern(pattern): - # assume that in stream at idx there is a VBAInfoContainer - stream.seek(idx) - log.debug('extracting at idx {0}'.format(idx)) - try: - container = VBAInfoContainer.extract_from(stream) - except Exception: - self._log_exception() - continue - - errs = container.check_validity() - if errs: - log.warning('check_validity found {0} issues' - .format(len(errs))) - else: - log.debug('container is ok') - atom = container.vba_info_atom - log.debug('persist id ref is {0}, has_macros {1}, version {2}' - .format(atom.persist_id_ref, atom.f_has_macros, - atom.version)) - yield container - for err in errs: - log.warning('check_validity(VBAInfoContainer): {0}' - .format(err)) - if errs and self.fast_fail: - raise errs[0] - - @generator_with_opened_main_stream - def search_vba_storage(self, stream): - """ search through stream for VBAProjectStg, alternative to parse... - - quick-and-dirty: do not parse everything, just look for right bytes - - "quick" here means quick to program. Runtime now is linear is document - size (--> for big documents the other method might be faster) - - The storages found could also contain (instead of VBA data): ActiveX - data or general OLE data - - yields results as it finds them - - .. 
seealso:: :py:meth:`search_vba_info` - """ - - logging.debug('looking for VBA storage objects') - for obj_type in (ExternalObjectStorageUncompressed, - ExternalObjectStorageCompressed): - # re-position stream at start - stream.seek(0, os.SEEK_SET) - - pattern = obj_type.generate_pattern() - - # try parse - for idx in self.search_pattern(pattern): - # assume a ExternalObjectStorage in stream at idx - stream.seek(idx) - log.debug('extracting at idx {0}'.format(idx)) - try: - storage = obj_type.extract_from(stream) - except Exception: - self._log_exception() - continue - - errs = storage.check_validity() - if errs: - log.warning('check_validity found {0} issues' - .format(len(errs))) - else: - log.debug('storage is ok; compressed={0}, size={1}, ' - 'size_decomp={2}' - .format(storage.is_compressed, - storage.rec_head.rec_len, - storage.uncompressed_size)) - yield storage - for err in errs: - log.warning('check_validity({0}): {1}' - .format(obj_type.__name__, err)) - if errs and self.fast_fail: - raise errs[0] - - @with_opened_main_stream - def decompress_vba_storage(self, stream, storage): - """ return decompressed data from search_vba_storage """ - - log.debug('decompressing storage for VBA OLE data stream ') - - # decompress iteratively; a zlib.decompress of all data - # failed with Error -5 (incomplete or truncated stream) - stream.seek(storage.data_offset, os.SEEK_SET) - decomp, n_read, err = \ - iterative_decompress(stream, storage.data_size) - log.debug('decompressed {0} to {1} bytes; found err: {2}' - .format(n_read, len(decomp), err)) - if err and self.fast_fail: - raise err - # otherwise try to continue with partial data - - return decomp - - ## create OleFileIO from decompressed data - #ole = olefile.OleFileIO(decomp) - #root_streams = [entry[0].lower() for entry in ole.listdir()] - #for required in 'project', 'projectwm', 'vba': - # if required not in root_streams: - # raise ValueError('storage seems to not be a VBA storage ' - # '({0} not found in root 
streams)' - # .format(required)) - #log.debug('tests succeeded') - #return ole - - @with_opened_main_stream - def read_vba_storage_data(self, stream, storage): - """ return data pointed to by uncompressed storage """ - - log.debug('reading uncompressed VBA OLE data stream: ' - '{0} bytes starting at {1}' - .format(storage.data_size, storage.data_offset)) - stream.seek(storage.data_offset, os.SEEK_SET) - data = stream.read(storage.data_size) - return data - - @generator_with_opened_main_stream - def iter_vba_data(self, stream): - """ search vba infos and storages, yield uncompressed storage data """ - - n_infos = 0 - n_macros = 0 - for info in self.search_vba_info(): - n_infos += 1 - if info.vba_info_atom.f_has_macros > 0: - n_macros += 1 - # TODO: does it make sense at all to continue if n_macros == 0? - # --> no vba-info, so all storages probably ActiveX or other OLE - n_storages = 0 - n_compressed = 0 - for storage in self.search_vba_storage(): - n_storages += 1 - if storage.is_compressed: - n_compressed += 1 - yield self.decompress_vba_storage(storage) - else: - yield self.read_vba_storage_data(storage) - - log.info('found {0} infos ({1} with macros) and {2} storages ' - '({3} compressed)' - .format(n_infos, n_macros, n_storages, n_compressed)) - - -def iterative_decompress(stream, size, chunk_size=4096): - """ decompress data from stream chunk-wise """ - - decompressor = zlib.decompressobj() - n_read = 0 - decomp = '' - return_err = None - - try: - while n_read < size: - n_new = min(size-n_read, chunk_size) - decomp += decompressor.decompress(stream.read(n_new)) - n_read += n_new - except zlib.error as err: - return_err = err - - return decomp, n_read, return_err - - -if __name__ == '__main__': - print('nothing here to run!') diff -Nru remnux-oletools-0.51a/oletools/pyxswf.py remnux-oletools-0.51a/oletools/pyxswf.py --- remnux-oletools-0.51a/oletools/pyxswf.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/pyxswf.py 1970-01-01 
00:00:00.000000000 +0000 @@ -1,150 +0,0 @@ -#!/usr/bin/env python -""" -pyxswf.py - -pyxswf is a script to detect, extract and analyze Flash objects (SWF) that may -be embedded in files such as MS Office documents (e.g. Word, Excel), -which is especially useful for malware analysis. - -pyxswf is an extension to xxxswf.py published by Alexander Hanel on -http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html -Compared to xxxswf, it can extract streams from MS Office documents by parsing -their OLE structure properly (-o option), which is necessary when streams are -fragmented. -Stream fragmentation is a known obfuscation technique, as explained on -http://www.breakingpointsystems.com/resources/blog/evasion-with-ole2-fragmentation/ - -It can also extract Flash objects from RTF documents, by parsing embedded -objects encoded in hexadecimal format (-f option). - -pyxswf project website: http://www.decalage.info/python/pyxswf - -pyxswf is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -#=== LICENSE ================================================================= - -# pyxswf is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2012-09-17 v0.01 PL: - first version -# 2012-11-09 v0.02 PL: - added RTF embedded objects extraction -# 2014-11-29 v0.03 PL: - use olefile instead of OleFileIO_PL -# - improved usage display with -h -# 2016-09-06 v0.50 PL: - updated to match the rtfobj API -# 2016-10-25 PL: - fixed print for Python 3 -# 2016-11-01 PL: - replaced StringIO by BytesIO for Python 3 - -__version__ = '0.50' - -#------------------------------------------------------------------------------ -# TODO: -# + update xxxswf to latest version -# + add support for LZMA-compressed flash files (ZWS header) -# references: http://blog.malwaretracker.com/2014/01/cve-2013-5331-evaded-av-by-using.html -# http://code.metager.de/source/xref/adobe/flash/crossbridge/tools/swf-info.py -# http://room32.dyndns.org/forums/showthread.php?766-SWFCompression -# sample code: http://room32.dyndns.org/SWFCompression.py -# - check if file is OLE -# - support -r - - -#=== IMPORTS ================================================================= - -import optparse, sys, os, rtfobj -from io import BytesIO -from thirdparty.xxxswf import xxxswf -import thirdparty.olefile as olefile - - -#=== MAIN ================================================================= - -def main(): - # print banner with version - print ('pyxswf %s - http://decalage.info/python/oletools' % __version__) - print ('Please report 
any issue at https://github.com/decalage2/oletools/issues') - print ('') - # Scenarios: - # Scan file for SWF(s) - # Scan file for SWF(s) and extract them - # Scan file for SWF(s) and scan them with Yara - # Scan file for SWF(s), extract them and scan with Yara - # Scan directory recursively for files that contain SWF(s) - # Scan directory recursively for files that contain SWF(s) and extract them - - usage = 'usage: %prog [options] ' - parser = optparse.OptionParser(usage=__doc__ + '\n' + usage) - parser.add_option('-x', '--extract', action='store_true', dest='extract', help='Extracts the embedded SWF(s), names it MD5HASH.swf & saves it in the working dir. No addition args needed') - parser.add_option('-y', '--yara', action='store_true', dest='yara', help='Scans the SWF(s) with yara. If the SWF(s) is compressed it will be deflated. No addition args needed') - parser.add_option('-s', '--md5scan', action='store_true', dest='md5scan', help='Scans the SWF(s) for MD5 signatures. Please see func checkMD5 to define hashes. No addition args needed') - parser.add_option('-H', '--header', action='store_true', dest='header', help='Displays the SWFs file header. No addition args needed') - parser.add_option('-d', '--decompress', action='store_true', dest='decompress', help='Deflates compressed SWFS(s)') - parser.add_option('-r', '--recdir', dest='PATH', type='string', help='Will recursively scan a directory for files that contain SWFs. Must provide path in quotes') - parser.add_option('-c', '--compress', action='store_true', dest='compress', help='Compresses the SWF using Zlib') - - parser.add_option('-o', '--ole', action='store_true', dest='ole', help='Parse an OLE file (e.g. 
Word, Excel) to look for SWF in each stream') - parser.add_option('-f', '--rtf', action='store_true', dest='rtf', help='Parse an RTF file to look for SWF in each embedded object') - - - (options, args) = parser.parse_args() - - # Print help if no arguments are passed - if len(args) == 0: - parser.print_help() - return - - # OLE MODE: - if options.ole: - for filename in args: - ole = olefile.OleFileIO(filename) - for direntry in ole.direntries: - if direntry is not None and direntry.entry_type == olefile.STGTY_STREAM: - f = ole._open(direntry.isectStart, direntry.size) - # check if data contains the SWF magic: FWS or CWS - data = f.getvalue() - if b'FWS' in data or b'CWS' in data: - print('OLE stream: %s' % repr(direntry.name)) - # call xxxswf to scan or extract Flash files: - xxxswf.disneyland(f, direntry.name, options) - f.close() - ole.close() - - # RTF MODE: - elif options.rtf: - for filename in args: - for index, orig_len, data in rtfobj.rtf_iter_objects(filename): - if b'FWS' in data or b'CWS' in data: - print('RTF embedded object size %d at index %08X' % (len(data), index)) - f = BytesIO(data) - name = 'RTF_embedded_object_%08X' % index - # call xxxswf to scan or extract Flash files: - xxxswf.disneyland(f, name, options) - - else: - xxxswf.main() - -if __name__ == '__main__': - main() diff -Nru remnux-oletools-0.51a/oletools/README.html remnux-oletools-0.51a/oletools/README.html --- remnux-oletools-0.51a/oletools/README.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/README.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,93 +0,0 @@ - - - - - - - - - - -
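The pyxswf code deleted above decides whether an OLE stream or RTF object deserves a closer look by searching its raw bytes for the SWF signatures (`b'FWS'` for uncompressed, `b'CWS'` for zlib-compressed Flash). A minimal, stdlib-only sketch of that triage test; the helper name and sample buffer are invented here for illustration, not part of pyxswf:

```python
# Hedged sketch of pyxswf's triage test: a buffer is "interesting" if it
# contains one of the SWF magic sequences. Offsets are returned so a carver
# could later seek to each candidate header.
SWF_MAGICS = (b'FWS', b'CWS')

def find_swf_offsets(data):
    """Return the offset of every SWF magic occurrence in *data*."""
    offsets = []
    for magic in SWF_MAGICS:
        pos = data.find(magic)
        while pos != -1:
            offsets.append(pos)
            pos = data.find(magic, pos + 1)
    return sorted(offsets)

# toy buffer with one fake zlib-compressed SWF header at offset 4
sample = b'\x00' * 4 + b'CWS\x0d' + b'\x00' * 8
print(find_swf_offsets(sample))  # → [4]
```

A real carver would additionally validate the version byte and length field that follow the magic, as xxxswf does; the substring test alone is only a first filter.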

python-oletools

-

oletools is a package of python tools to analyze Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office documents or Outlook messages, mainly for malware analysis, forensics and debugging. It is based on the olefile parser. See http://www.decalage.info/python/oletools for more info.

-

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the Author - Repository - Updates on Twitter

-

Note: python-oletools is not related to OLETools published by BeCubed Software.

-

News

-
    -
  • 2016-11-01 v0.50: all oletools now support Python 2 and 3. -
      -
    • olevba: several bugfixes and improvements.
    • -
    • mraptor: improved detection, added mraptor_milter for Sendmail/Postfix integration.
    • -
    • rtfobj: brand new RTF parser, obfuscation-aware, improved display, detect executable files in OLE Package objects.
    • -
    • setup: now creates handy command-line scripts to run oletools from any directory.
    • -
  • -
  • 2016-06-10 v0.47: olevba added PPT97 macros support, improved handling of malformed/incomplete documents, improved error handling and JSON output, now returns an exit code based on analysis results, new --relaxed option. rtfobj: improved parsing to handle obfuscated RTF documents, added -d option to set output dir. Moved repository and documentation to GitHub.
  • -
  • 2016-04-19 v0.46: olevba does not deobfuscate VBA expressions by default (much faster), new option --deobf to enable it. Fixed color display bug on Windows for several tools.
  • -
  • 2016-04-12 v0.45: improved rtfobj to handle several anti-analysis tricks, improved olevba to export results in JSON format.
  • -
  • 2016-03-11 v0.44: improved olevba to extract and analyse strings from VBA Forms.
  • -
  • 2016-03-04 v0.43: added new tool MacroRaptor (mraptor) to detect malicious macros, bugfix and slight improvements in olevba.
  • -
  • 2016-02-07 v0.42: added two new tools oledir and olemap, better handling of malformed files and several bugfixes in olevba, improved display for olemeta.
  • -
  • 2015-09-22 v0.41: added new --reveal option to olevba, to show the macro code with VBA strings deobfuscated.
  • -
  • 2015-09-17 v0.40: Improved macro deobfuscation in olevba, to decode Hex and Base64 within VBA expressions. Display printable deobfuscated strings by default. Improved the VBA_Parser API. Improved performance. Fixed issue #23 with sys.stderr.
  • -
  • 2015-06-19 v0.12: olevba can now deobfuscate VBA expressions with any combination of Chr, Asc, Val, StrReverse, Environ, +, &, using a VBA parser built with pyparsing. New options to display only the analysis results or only the macros source code. The analysis is now done on all the VBA modules at once.
  • -
  • 2015-05-29 v0.11: Improved parsing of MHTML and ActiveMime/MSO files in olevba, added several suspicious keywords to VBA scanner (thanks to @ozhermit and Davy Douhine for the suggestions)
  • -
  • 2015-05-06 v0.10: olevba now supports Word MHTML files with macros, aka "Single File Web Page" (.mht) - see issue #10 for more info
  • -
  • 2015-03-23 v0.09: olevba now supports Word 2003 XML files, added anti-sandboxing/VM detection
  • -
  • 2015-02-08 v0.08: olevba can now decode strings obfuscated with Hex/StrReverse/Base64/Dridex and extract IOCs. Added new triage mode, support for non-western codepages with olefile 0.42, improved API and display, several bugfixes.
  • -
  • 2015-01-05 v0.07: improved olevba to detect suspicious keywords and IOCs in VBA macros, can now scan several files and open password-protected zip archives, added a Python API, upgraded OleFileIO_PL to olefile v0.41
  • -
  • 2014-08-28 v0.06: added olevba, a new tool to extract VBA Macro source code from MS Office documents (97-2003 and 2007+). Improved documentation
  • -
  • 2013-07-24 v0.05: added new tools olemeta and oletimes
  • -
  • 2013-04-18 v0.04: fixed bug in rtfobj, added documentation for rtfobj
  • -
  • 2012-11-09 v0.03: Improved pyxswf to extract Flash objects from RTF
  • -
  • 2012-10-29 v0.02: Added oleid
  • -
  • 2012-10-09 v0.01: Initial version of olebrowse and pyxswf
  • -
  • see changelog in source code for more info.
  • -
-

Tools in python-oletools:

-
    -
  • olebrowse: A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to view and extract individual data streams.
  • -
  • oleid: to analyze OLE files to detect specific characteristics usually found in malicious files.
  • -
  • olemeta: to extract all standard properties (metadata) from OLE files.
  • -
  • oletimes: to extract creation and modification timestamps of all streams and storages.
  • -
  • oledir: to display all the directory entries of an OLE file, including free and orphaned entries.
  • -
  • olemap: to display a map of all the sectors in an OLE file.
  • -
  • olevba: to extract and analyze VBA Macro source code from MS Office documents (OLE and OpenXML).
  • -
  • MacroRaptor: to detect malicious VBA Macros.
  • -
  • pyxswf: to detect, extract and analyze Flash objects (SWF) that may be embedded in files such as MS Office documents (e.g. Word, Excel) and RTF, which is especially useful for malware analysis.
  • -
  • oleobj: to extract embedded objects from OLE files.
  • -
  • rtfobj: to extract embedded objects from RTF files.
  • -
  • and a few others (coming soon)
  • -
-
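Several of the tools listed above (oleid, olemeta, oledir, olemap) share the same first step: confirming the input really is an OLE2 container by checking the 8-byte Compound File Binary signature. A hedged, stdlib-only sketch of that check; the helper name is ours, not part of the oletools API:

```python
# The 8-byte magic that opens every OLE2 / Compound File Binary file.
OLE2_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def looks_like_ole2(header_bytes):
    """Cheap triage test: does the buffer start with the CFBF signature?"""
    return header_bytes[:8] == OLE2_MAGIC

print(looks_like_ole2(OLE2_MAGIC + b'\x00' * 8))  # → True
print(looks_like_ole2(b'PK\x03\x04'))             # → False (zip/OOXML, not OLE2)
```

In practice the tools delegate this to the olefile parser, which performs the same signature check before reading the header sectors.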

Projects using oletools:

-

oletools are used by a number of projects and online malware analysis services, including Viper, REMnux, Hybrid-analysis.com, Joe Sandbox, Deepviz, Laika BOSS, Cuckoo Sandbox, Anlyz.io, pcodedmp and probably VirusTotal. (Please contact me if you have or know a project using oletools)

-

Download and Install:

-

To use python-oletools from the command line as analysis tools, you may simply download the latest release archive and extract the files into the directory of your choice.

-

You may also download the latest development version with the most recent features.

-

Another possibility is to use a git client to clone the repository (https://github.com/decalage2/oletools.git) into a folder. You can then update it easily in the future.

-

If you plan to use python-oletools with other Python applications or your own scripts, then the simplest solution is to use "pip install oletools" or "easy_install oletools" to download and install in one go. Otherwise you may download/extract the zip archive and run "setup.py install".

-

Important: to update oletools if it is already installed, you must run "pip install -U oletools", otherwise pip will not update it.
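The update caveat above can also be scripted. A sketch that builds the exact `pip install -U oletools` invocation, bound to the current interpreter so the right environment gets updated; the helper name is invented here, and the actual call is left commented so the sketch stays side-effect free:

```python
# Hedged sketch: construct the upgrade command the text describes.
# Uncomment the check_call line to actually run it.
import subprocess
import sys

def pip_upgrade_command(package):
    """Return the argv list for upgrading *package* with this interpreter's pip."""
    return [sys.executable, "-m", "pip", "install", "-U", package]

cmd = pip_upgrade_command("oletools")
print(cmd[1:])  # → ['-m', 'pip', 'install', '-U', 'oletools']
# subprocess.check_call(cmd)
```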

-

Documentation:

-

The latest version of the documentation can be found online, otherwise a copy is provided in the doc subfolder of the package.

-

How to Suggest Improvements, Report Issues or Contribute:

-

This is a personal open-source project, developed in my spare time. Any contribution, suggestion, feedback or bug report is welcome.

-

To suggest improvements, report a bug or any issue, please use the issue reporting page, providing all the information and files to reproduce the problem.

-

You may also contact the author directly to provide feedback.

-

The code is available in a GitHub repository. You may use it to submit enhancements using forks and pull requests.

-

License

-

This license applies to the python-oletools package, apart from the thirdparty folder which contains third-party files published with their own license.

-

The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec (http://www.decalage.info)

-

All rights reserved.

-

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

-
    -
  • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  • -
  • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  • -
-

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-
-

olevba contains modified source code from the officeparser project, published under the following MIT License (MIT):

-

officeparser is copyright (c) 2014 John William Davison

-

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

-

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

-

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

- - diff -Nru remnux-oletools-0.51a/oletools/README.rst remnux-oletools-0.51a/oletools/README.rst --- remnux-oletools-0.51a/oletools/README.rst 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/README.rst 1970-01-01 00:00:00.000000000 +0000 @@ -1,289 +0,0 @@ -python-oletools -=============== - -`oletools `__ is a package of -python tools to analyze `Microsoft OLE2 -files `__ -(also called Structured Storage, Compound File Binary Format or Compound -Document File Format), such as Microsoft Office documents or Outlook -messages, mainly for malware analysis, forensics and debugging. It is -based on the `olefile `__ parser. See -http://www.decalage.info/python/oletools for more info. - -**Quick links:** `Home -page `__ - -`Download/Install `__ -- `Documentation `__ - -`Report -Issues/Suggestions/Questions `__ -- `Contact the Author `__ - -`Repository `__ - `Updates on -Twitter `__ - -Note: python-oletools is not related to OLETools published by BeCubed -Software. - -News ----- - -- **2016-11-01 v0.50**: all oletools now support python 2 and 3. - - - olevba: several bugfixes and improvements. - - mraptor: improved detection, added mraptor\_milter for - Sendmail/Postfix integration. - - rtfobj: brand new RTF parser, obfuscation-aware, improved display, - detect executable files in OLE Package objects. - - setup: now creates handy command-line scripts to run oletools from - any directory. - -- 2016-06-10 v0.47: - `olevba `__ added - PPT97 macros support, improved handling of malformed/incomplete - documents, improved error handling and JSON output, now returns an - exit code based on analysis results, new --relaxed option. - `rtfobj `__: - improved parsing to handle obfuscated RTF documents, added -d option - to set output dir. Moved repository and documentation to GitHub. -- 2016-04-19 v0.46: - `olevba `__ does - not deobfuscate VBA expressions by default (much faster), new option - --deobf to enable it. 
Fixed color display bug on Windows for several - tools. -- 2016-04-12 v0.45: improved - `rtfobj `__ to - handle several `anti-analysis - tricks `__, improved - `olevba `__ to - export results in JSON format. -- 2016-03-11 v0.44: improved - `olevba `__ to - extract and analyse strings from VBA Forms. -- 2016-03-04 v0.43: added new tool - `MacroRaptor `__ - (mraptor) to detect malicious macros, bugfix and slight improvements - in `olevba `__. -- 2016-02-07 v0.42: added two new tools oledir and olemap, better - handling of malformed files and several bugfixes in - `olevba `__, - improved display for - `olemeta `__. -- 2015-09-22 v0.41: added new --reveal option to - `olevba `__, to - show the macro code with VBA strings deobfuscated. -- 2015-09-17 v0.40: Improved macro deobfuscation in - `olevba `__, to - decode Hex and Base64 within VBA expressions. Display printable - deobfuscated strings by default. Improved the VBA\_Parser API. - Improved performance. Fixed `issue - #23 `__ with - sys.stderr. -- 2015-06-19 v0.12: - `olevba `__ can - now deobfuscate VBA expressions with any combination of Chr, Asc, - Val, StrReverse, Environ, +, &, using a VBA parser built with - `pyparsing `__. New options to - display only the analysis results or only the macros source code. The - analysis is now done on all the VBA modules at once. -- 2015-05-29 v0.11: Improved parsing of MHTML and ActiveMime/MSO files - in `olevba `__, - added several suspicious keywords to VBA scanner (thanks to @ozhermit - and Davy Douhine for the suggestions) -- 2015-05-06 v0.10: - `olevba `__ now - supports Word MHTML files with macros, aka "Single File Web Page" - (.mht) - see `issue - #10 `__ for more - info -- 2015-03-23 v0.09: - `olevba `__ now - supports Word 2003 XML files, added anti-sandboxing/VM detection -- 2015-02-08 v0.08: - `olevba `__ can - now decode strings obfuscated with Hex/StrReverse/Base64/Dridex and - extract IOCs. 
Added new triage mode, support for non-western - codepages with olefile 0.42, improved API and display, several - bugfixes. -- 2015-01-05 v0.07: improved - `olevba `__ to - detect suspicious keywords and IOCs in VBA macros, can now scan - several files and open password-protected zip archives, added a - Python API, upgraded OleFileIO\_PL to olefile v0.41 -- 2014-08-28 v0.06: added - `olevba `__, a new - tool to extract VBA Macro source code from MS Office documents - (97-2003 and 2007+). Improved - `documentation `__ -- 2013-07-24 v0.05: added new tools - `olemeta `__ and - `oletimes `__ -- 2013-04-18 v0.04: fixed bug in rtfobj, added documentation for - `rtfobj `__ -- 2012-11-09 v0.03: Improved - `pyxswf `__ to - extract Flash objects from RTF -- 2012-10-29 v0.02: Added - `oleid `__ -- 2012-10-09 v0.01: Initial version of - `olebrowse `__ - and pyxswf -- see changelog in source code for more info. - -Tools in python-oletools: -------------------------- - -- `olebrowse `__: - A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint - documents), to view and extract individual data streams. -- `oleid `__: to - analyze OLE files to detect specific characteristics usually found in - malicious files. -- `olemeta `__: to - extract all standard properties (metadata) from OLE files. -- `oletimes `__: - to extract creation and modification timestamps of all streams and - storages. -- `oledir `__: to - display all the directory entries of an OLE file, including free and - orphaned entries. -- `olemap `__: to - display a map of all the sectors in an OLE file. -- `olevba `__: to - extract and analyze VBA Macro source code from MS Office documents - (OLE and OpenXML). -- `MacroRaptor `__: - to detect malicious VBA Macros -- `pyxswf `__: to - detect, extract and analyze Flash objects (SWF) that may be embedded - in files such as MS Office documents (e.g. Word, Excel) and RTF, - which is especially useful for malware analysis. 
-- `oleobj `__: to
-  extract embedded objects from OLE files.
-- `rtfobj `__: to
-  extract embedded objects from RTF files.
-- and a few others (coming soon)
-
-Projects using oletools:
-------------------------
-
-oletools are used by a number of projects and online malware analysis
-services, including `Viper `__,
-`REMnux `__,
-`Hybrid-analysis.com `__, `Joe
-Sandbox `__,
-`Deepviz `__, `Laika
-BOSS `__, `Cuckoo
-Sandbox `__,
-`Anlyz.io `__,
-`pcodedmp `__ and probably
-`VirusTotal `__. (Please `contact
-me <http://decalage.info/contact>`__ if you have or know a project
-using oletools.)
-
-Download and Install:
----------------------
-
-To use python-oletools from the command line as analysis tools, you may
-simply `download the latest release
-archive `__ and extract
-the files into the directory of your choice.
-
-You may also download the `latest development
-version `__
-with the most recent features.
-
-Another possibility is to use a git client to clone the repository
-(https://github.com/decalage2/oletools.git) into a folder. You can then
-update it easily in the future.
-
-If you plan to use python-oletools with other Python applications or
-your own scripts, then the simplest solution is to use "**pip install
-oletools**\ " or "**easy\_install oletools**\ " to download and install
-in one go. Otherwise you may download/extract the zip archive and run
-"**setup.py install**\ ".
-
-**Important: to update oletools** if it is already installed, you must
-run **"pip install -U oletools"**, otherwise pip will not update it.
-
-Documentation:
---------------
-
-The latest version of the documentation can be found
-`online `__, otherwise a
-copy is provided in the doc subfolder of the package.
-
-How to Suggest Improvements, Report Issues or Contribute:
---------------------------------------------------------
-
-This is a personal open-source project, developed in my spare time. Any
-contribution, suggestion, feedback or bug report is welcome.
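For quick reference, the pip commands from the Download and Install section above can be sketched as follows (package name as published on PyPI; assumes Python and pip are already available):

```shell
# Sketch of the install/update flow described in the README above.
pip install oletools       # download and install in one go
pip install -U oletools    # -U is required to update an existing install
```

The `-U` flag matters: as the README warns, a plain `pip install oletools` will not upgrade a copy that is already installed.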
- -To suggest improvements, report a bug or any issue, please use the -`issue reporting page `__, -providing all the information and files to reproduce the problem. - -You may also `contact the author `__ -directly to provide feedback. - -The code is available in `a GitHub -repository `__. You may use it to -submit enhancements using forks and pull requests. - -License -------- - -This license applies to the python-oletools package, apart from the -thirdparty folder which contains third-party files published with their -own license. - -The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec -(http://www.decalage.info) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -- Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -- Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-
---------------
-
-olevba contains modified source code from the officeparser project,
-published under the following MIT License (MIT):
-
-officeparser is copyright (c) 2014 John William Davison
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff -Nru remnux-oletools-0.51a/oletools/rtfobj.py remnux-oletools-0.51a/oletools/rtfobj.py
--- remnux-oletools-0.51a/oletools/rtfobj.py	2016-11-04 21:28:21.000000000 +0000
+++ remnux-oletools-0.51a/oletools/rtfobj.py	1970-01-01 00:00:00.000000000 +0000
@@ -1,806 +0,0 @@
-#!/usr/bin/env python
-from __future__ import print_function
-
-"""
-rtfobj.py
-
-rtfobj is a Python module to extract embedded objects from RTF files, such as
-OLE objects. It can be used as a Python library or a command-line tool.
- -Usage: rtfobj.py - -rtfobj project website: http://www.decalage.info/python/rtfobj - -rtfobj is part of the python-oletools package: -http://www.decalage.info/python/oletools -""" - -#=== LICENSE ================================================================= - -# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
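The module docstring above says that rtfobj extracts embedded objects (such as OLE objects) that RTF files carry as hexadecimal text. A rough, stdlib-only illustration of that core idea, using a made-up fragment (the real rtfobj parser in this file is far more robust against whitespace tricks, nested groups and other obfuscation):

```python
import binascii
import re

# Hypothetical RTF fragment: an "objdata" destination holding hex-encoded
# bytes, with whitespace scattered inside the hex (which MS Word allows):
rtf_fragment = b'{\\object\\objemb{\\*\\objdata 4f4c 45\r\n31 30}}'

# Locate the objdata payload and keep only the hex digits:
m = re.search(rb'\\objdata\s+([0-9A-Fa-f\s]+)', rtf_fragment)
hexdata = re.sub(rb'\s+', b'', m.group(1))
if len(hexdata) & 1:
    # Word tolerates an extra trailing hex digit; trim it before decoding
    hexdata = hexdata[:-1]
payload = binascii.unhexlify(hexdata)
print(payload)  # b'OLE10'
```

In the actual module, the hex block may also be interleaved with nested RTF groups and control words, which is why rtfobj builds the much larger regular expressions defined below instead of the simple pattern used here.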
- - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2012-11-09 v0.01 PL: - first version -# 2013-04-02 v0.02 PL: - fixed bug in main -# 2015-12-09 v0.03 PL: - configurable logging, CLI options -# - extract OLE 1.0 objects -# - extract files from OLE Package objects -# 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr -# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks -# 2016-05-06 v0.47 TJ: - added option -d to set the output directory -# (contribution by Thomas Jarosch) -# TJ: - sanitize filenames to avoid special characters -# 2016-05-29 PL: - improved parsing, fixed issue #42 -# 2016-07-13 v0.50 PL: - new RtfParser and RtfObjParser classes -# 2016-07-18 SL: - added Python 3.5 support -# 2016-07-19 PL: - fixed Python 2.6-2.7 support -# 2016-07-30 PL: - new API with class RtfObject -# - backward-compatible API rtf_iter_objects (fixed issue #70) -# 2016-07-31 PL: - table output with tablestream -# 2016-08-01 PL: - detect executable filenames in OLE Package -# 2016-08-08 PL: - added option -s to save objects to files -# 2016-08-09 PL: - fixed issue #78, improved regex -# 2016-09-06 PL: - fixed issue #83, backward compatible API - -__version__ = '0.50' - -# ------------------------------------------------------------------------------ -# TODO: -# - allow semicolon within hex, as found in this sample: -# http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html -# TODO: use OleObject and OleNativeStream in RtfObject instead of copying each attribute -# TODO: option -e to extract an object, -e all for all objects -# TODO: option to choose which destinations to include (objdata by default) -# TODO: option to display SHA256 or MD5 hashes of objects in table - - -# === IMPORTS ================================================================= - -import re, os, sys, binascii, logging, optparse -import os.path - -from thirdparty.xglob import xglob -from 
oleobj import OleObject, OleNativeStream -import oleobj - -from thirdparty.tablestream import tablestream - - -# === LOGGING ================================================================= - -class NullHandler(logging.Handler): - """ - Log Handler without output, to avoid printing messages if logging is not - configured by the main application. - Python 2.7 has logging.NullHandler, but this is necessary for 2.6: - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library - """ - def emit(self, record): - pass - -def get_logger(name, level=logging.CRITICAL+1): - """ - Create a suitable logger object for this module. - The goal is not to change settings of the root logger, to avoid getting - other modules' logs on the screen. - If a logger exists with same name, reuse it. (Else it would have duplicate - handlers and messages would be doubled.) - The level is set to CRITICAL+1 by default, to avoid any logging. - """ - # First, test if there is already a logger with the same name, else it - # will generate duplicate messages (due to duplicate handlers): - if name in logging.Logger.manager.loggerDict: - #NOTE: another less intrusive but more "hackish" solution would be to - # use getLogger then test if its effective level is not default. 
- logger = logging.getLogger(name) - # make sure level is OK: - logger.setLevel(level) - return logger - # get a new logger: - logger = logging.getLogger(name) - # only add a NullHandler for this logger, it is up to the application - # to configure its own logging: - logger.addHandler(NullHandler()) - logger.setLevel(level) - return logger - -# a global logger object used for debugging: -log = get_logger('rtfobj') - - -#=== CONSTANTS================================================================= - -# REGEX pattern to extract embedded OLE objects in hexadecimal format: - -# alphanum digit: [0-9A-Fa-f] -HEX_DIGIT = b'[0-9A-Fa-f]' - -# hex char = two alphanum digits: [0-9A-Fa-f]{2} -# HEX_CHAR = r'[0-9A-Fa-f]{2}' -# in fact MS Word allows whitespaces in between the hex digits! -# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' -# Even worse, MS Word also allows ANY RTF-style tag {*} in between!! -# AND the tags can be nested... -#SINGLE_RTF_TAG = r'[{][^{}]*[}]' -# Actually RTF tags may contain braces escaped with backslash (\{ \}): -SINGLE_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\])*[}]' - -# Nested tags, two levels (because Python's re does not support nested matching): -# NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' -NESTED_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\]|'+SINGLE_RTF_TAG+b')*[}]' - -# AND it is also allowed to insert ANY control word or control symbol (ignored) -# According to Rich Text Format (RTF) Specification Version 1.9.1, -# section "Control Word": -# control word = \ -# delimiter = space, OR signed integer followed by any non-digit, -# OR any character except letter and digit -# examples of valid control words: -# "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{" -# control symbol = \ (followed by anything) - -ASCII_NAME = b'([a-zA-Z]{1,250})' - -# using Python's re lookahead assumption: -# (?=...) Matches if ... matches next, but doesn't consume any of the string. -# This is called a lookahead assertion. 
For example, Isaac (?=Asimov) will -# match 'Isaac ' only if it's followed by 'Asimov'. - -# TODO: Find the actual limit on the number of digits for Word -# SIGNED_INTEGER = r'(-?\d{1,250})' -SIGNED_INTEGER = b'(-?\\d+)' - -# Note for issue #78: need to match "\A-" not followed by digits -CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])))' - -re_control_word = re.compile(CONTROL_WORD) - -# Note for issue #78: need to match "\" followed by digit (any non-alpha) -CONTROL_SYMBOL = b'(?:\\\\[^a-zA-Z])' -re_control_symbol = re.compile(CONTROL_SYMBOL) - -# Text that is not a control word/symbol or a group: -TEXT = b'[^{}\\\\]+' -re_text = re.compile(TEXT) - -# ignored whitespaces and tags within a hex block: -IGNORED = b'(?:\\s|'+NESTED_RTF_TAG+b'|'+CONTROL_SYMBOL+b'|'+CONTROL_WORD+b')*' -#IGNORED = r'\s*' - -# HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT - -# several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} -# + word boundaries -# HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' -# at least 1 hex char: -# HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+' -# at least 1 hex char, followed by whitespace or CR/LF: -# HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' -# + word boundaries around hex block -# HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' -# at least one block of hex and whitespace chars, followed by closing curly bracket: -# HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' -# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE - -#TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' -# PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b' -PATTERN = b'\\b(?:' + HEX_DIGIT + IGNORED + b'){7,}' + HEX_DIGIT + b'\\b' - -# at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* -# PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' -# improved pattern, allowing semicolons within 
hex: -#PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' - -re_hexblock = re.compile(PATTERN) -re_embedded_tags = re.compile(IGNORED) -re_decimal = re.compile(b'\\d+') - -re_delimiter = re.compile(b'[ \\t\\r\\n\\f\\v]') - -DELIMITER = b'[ \\t\\r\\n\\f\\v]' -DELIMITERS_ZeroOrMore = b'[ \\t\\r\\n\\f\\v]*' -BACKSLASH_BIN = b'\\\\bin' -# According to my tests, Word accepts up to 250 digits (leading zeroes) -DECIMAL_GROUP = b'(\d{1,250})' - -re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN - + DECIMAL_GROUP + DELIMITER) -re_delim_hexblock = re.compile(DELIMITER + PATTERN) - -# TODO: use a frozenset instead of a regex? -re_executable_extensions = re.compile( - r"(?i)\.(EXE|COM|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b") - -# Destination Control Words, according to MS RTF Specifications v1.9.1: -DESTINATION_CONTROL_WORDS = frozenset(( - b"aftncn", b"aftnsep", b"aftnsepc", b"annotation", b"atnauthor", b"atndate", b"atnicn", b"atnid", b"atnparent", b"atnref", - b"atntime", b"atrfend", b"atrfstart", b"author", b"background", b"bkmkend", b"bkmkstart", b"blipuid", b"buptim", b"category", - b"colorschememapping", b"colortbl", b"comment", b"company", b"creatim", b"datafield", b"datastore", b"defchp", b"defpap", - b"do", b"doccomm", b"docvar", b"dptxbxtext", b"ebcend", b"ebcstart", b"factoidname", b"falt", b"fchars", b"ffdeftext", - b"ffentrymcr", b"ffexitmcr", b"ffformat", b"ffhelptext", b"ffl", b"ffname",b"ffstattext", b"field", b"file", b"filetbl", - b"fldinst", b"fldrslt", b"fldtype", b"fname", b"fontemb", b"fontfile", b"fonttbl", b"footer", b"footerf", b"footerl", - b"footerr", b"footnote", b"formfield", b"ftncn", b"ftnsep", b"ftnsepc", b"g", b"generator", b"gridtbl", b"header", b"headerf", - b"headerl", b"headerr", b"hl", b"hlfr", b"hlinkbase", b"hlloc", b"hlsrc", b"hsv", b"htmltag", b"info", b"keycode", b"keywords", - b"latentstyles", 
b"lchars", b"levelnumbers", b"leveltext", b"lfolevel", b"linkval", b"list", b"listlevel", b"listname", - b"listoverride", b"listoverridetable", b"listpicture", b"liststylename", b"listtable", b"listtext", b"lsdlockedexcept", - b"macc", b"maccPr", b"mailmerge", b"maln",b"malnScr", b"manager", b"margPr", b"mbar", b"mbarPr", b"mbaseJc", b"mbegChr", - b"mborderBox", b"mborderBoxPr", b"mbox", b"mboxPr", b"mchr", b"mcount", b"mctrlPr", b"md", b"mdeg", b"mdegHide", b"mden", - b"mdiff", b"mdPr", b"me", b"mendChr", b"meqArr", b"meqArrPr", b"mf", b"mfName", b"mfPr", b"mfunc", b"mfuncPr",b"mgroupChr", - b"mgroupChrPr",b"mgrow", b"mhideBot", b"mhideLeft", b"mhideRight", b"mhideTop", b"mhtmltag", b"mlim", b"mlimloc", b"mlimlow", - b"mlimlowPr", b"mlimupp", b"mlimuppPr", b"mm", b"mmaddfieldname", b"mmath", b"mmathPict", b"mmathPr",b"mmaxdist", b"mmc", - b"mmcJc", b"mmconnectstr", b"mmconnectstrdata", b"mmcPr", b"mmcs", b"mmdatasource", b"mmheadersource", b"mmmailsubject", - b"mmodso", b"mmodsofilter", b"mmodsofldmpdata", b"mmodsomappedname", b"mmodsoname", b"mmodsorecipdata", b"mmodsosort", - b"mmodsosrc", b"mmodsotable", b"mmodsoudl", b"mmodsoudldata", b"mmodsouniquetag", b"mmPr", b"mmquery", b"mmr", b"mnary", - b"mnaryPr", b"mnoBreak", b"mnum", b"mobjDist", b"moMath", b"moMathPara", b"moMathParaPr", b"mopEmu", b"mphant", b"mphantPr", - b"mplcHide", b"mpos", b"mr", b"mrad", b"mradPr", b"mrPr", b"msepChr", b"mshow", b"mshp", b"msPre", b"msPrePr", b"msSub", - b"msSubPr", b"msSubSup", b"msSubSupPr", b"msSup", b"msSupPr", b"mstrikeBLTR", b"mstrikeH", b"mstrikeTLBR", b"mstrikeV", - b"msub", b"msubHide", b"msup", b"msupHide", b"mtransp", b"mtype", b"mvertJc", b"mvfmf", b"mvfml", b"mvtof", b"mvtol", - b"mzeroAsc", b"mzeroDesc", b"mzeroWid", b"nesttableprops", b"nexctfile", b"nonesttables", b"objalias", b"objclass", - b"objdata", b"object", b"objname", b"objsect", b"objtime", b"oldcprops", b"oldpprops", b"oldsprops", b"oldtprops", - b"oleclsid", b"operator", b"panose", b"password", 
b"passwordhash", b"pgp", b"pgptbl", b"picprop", b"pict", b"pn", b"pnseclvl", - b"pntext", b"pntxta", b"pntxtb", b"printim", b"private", b"propname", b"protend", b"protstart", b"protusertbl", b"pxe", - b"result", b"revtbl", b"revtim", b"rsidtbl", b"rtf", b"rxe", b"shp", b"shpgrp", b"shpinst", b"shppict", b"shprslt", b"shptxt", - b"sn", b"sp", b"staticval", b"stylesheet", b"subject", b"sv", b"svb", b"tc", b"template", b"themedata", b"title", b"txe", b"ud", - b"upr", b"userprops", b"wgrffmtfilter", b"windowcaption", b"writereservation", b"writereservhash", b"xe", b"xform", - b"xmlattrname", b"xmlattrvalue", b"xmlclose", b"xmlname", b"xmlnstbl", b"xmlopen" - )) - - -# some str methods on Python 2.x return characters, -# while the equivalent bytes methods return integers on Python 3.x: -if sys.version_info[0] <= 2: - # Python 2.x - Characters (str) - BACKSLASH = '\\' - BRACE_OPEN = '{' - BRACE_CLOSE = '}' -else: - # Python 3.x - Integers - BACKSLASH = ord('\\') - BRACE_OPEN = ord('{') - BRACE_CLOSE = ord('}') - - -#=== CLASSES ================================================================= - -class Destination(object): - """ - Stores the data associated with a destination control word - """ - def __init__(self, cword=None): - self.cword = cword - self.data = b'' - self.start = None - self.end = None - self.group_level = 0 - - -# class Group(object): -# """ -# Stores the data associated with a group between braces {...} -# """ -# def __init__(self, cword=None): -# self.start = None -# self.end = None -# self.level = None - - - -class RtfParser(object): - """ - Very simple generic RTF parser - """ - - def __init__(self, data): - self.data = data - self.index = 0 - self.size = len(data) - self.group_level = 0 - # default destination for the document text: - document_destination = Destination() - self.destinations = [document_destination] - self.current_destination = document_destination - - def parse(self): - self.index = 0 - while self.index < self.size: - if 
self.data[self.index] == BRACE_OPEN: - self._open_group() - self.index += 1 - continue - if self.data[self.index] == BRACE_CLOSE: - self._close_group() - self.index += 1 - continue - if self.data[self.index] == BACKSLASH: - m = re_control_word.match(self.data, self.index) - if m: - cword = m.group(1) - param = None - if len(m.groups()) > 1: - param = m.group(2) - # log.debug('control word %r at index %Xh - cword=%r param=%r' % (m.group(), self.index, cword, param)) - self._control_word(m, cword, param) - self.index += len(m.group()) - # if it's \bin, call _bin after updating index - if cword == b'bin': - self._bin(m, param) - continue - m = re_control_symbol.match(self.data, self.index) - if m: - self.control_symbol(m) - self.index += len(m.group()) - continue - m = re_text.match(self.data, self.index) - if m: - self._text(m) - self.index += len(m.group()) - continue - raise RuntimeError('Should not have reached this point - index=%Xh' % self.index) - self.end_of_file() - - - def _open_group(self): - self.group_level += 1 - #log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level)) - # call user method AFTER increasing the level: - self.open_group() - - def open_group(self): - #log.debug('open group at index %Xh' % self.index) - pass - - def _close_group(self): - #log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level)) - # call user method BEFORE decreasing the level: - self.close_group() - # if the destination level is the same as the group level, close the destination: - if self.group_level == self.current_destination.group_level: - # log.debug('Current Destination %r level = %d => Close Destination' % ( - # self.current_destination.cword, self.current_destination.group_level)) - self._close_destination() - else: - # log.debug('Current Destination %r level = %d => Continue with same Destination' % ( - # self.current_destination.cword, self.current_destination.group_level)) - pass - self.group_level -= 1 - # 
log.debug('Decreased group level to %d' % self.group_level) - - def close_group(self): - #log.debug('close group at index %Xh' % self.index) - pass - - def _open_destination(self, matchobject, cword): - # if the current destination is at the same group level, close it first: - if self.current_destination.group_level == self.group_level: - self._close_destination() - new_dest = Destination(cword) - new_dest.group_level = self.group_level - self.destinations.append(new_dest) - self.current_destination = new_dest - # start of the destination is right after the control word: - new_dest.start = self.index + len(matchobject.group()) - # log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level)) - # call the corresponding user method for additional processing: - self.open_destination(self.current_destination) - - def open_destination(self, destination): - pass - - def _close_destination(self): - # log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword, - # self.index, self.current_destination.group_level)) - self.current_destination.end = self.index - # call the corresponding user method for additional processing: - self.close_destination(self.current_destination) - if len(self.destinations)>0: - # remove the current destination from the stack, and go back to the previous one: - self.destinations.pop() - if len(self.destinations) > 0: - self.current_destination = self.destinations[-1] - else: - # log.debug('All destinations are closed, keeping the document destination open') - pass - - def close_destination(self, destination): - pass - - def _control_word(self, matchobject, cword, param): - #log.debug('control word %r at index %Xh' % (matchobject.group(), self.index)) - if cword in DESTINATION_CONTROL_WORDS: - # log.debug('%r is a destination control word: starting a new destination' % cword) - self._open_destination(matchobject, cword) - # call the corresponding user method for additional 
processing: - self.control_word(matchobject, cword, param) - - def control_word(self, matchobject, cword, param): - pass - - def control_symbol(self, matchobject): - #log.debug('control symbol %r at index %Xh' % (matchobject.group(), self.index)) - pass - - def _text(self, matchobject): - text = matchobject.group() - self.current_destination.data += text - self.text(matchobject, text) - - def text(self, matchobject, text): - #log.debug('text %r at index %Xh' % (matchobject.group(), self.index)) - pass - - def _bin(self, matchobject, param): - binlen = int(param) - log.debug('\\bin: reading %d bytes of binary data' % binlen) - # TODO: handle optional space? - # TODO: handle negative length, and length greater than data - bindata = self.data[self.index:self.index + binlen] - self.index += binlen - self.bin(bindata) - - def bin(self, bindata): - pass - - def _end_of_file(self): - # log.debug('%Xh Reached End of File') - # close any group/destination that is still open: - while self.group_level > 0: - # log.debug('Group Level = %d, closing group' % self.group_level) - self._close_group() - self.end_of_file() - - def end_of_file(self): - pass - - -class RtfObject(object): - """ - An object or a file (OLE Package) embedded into an RTF document - """ - def __init__(self): - """ - RtfObject constructor - """ - # start and end index in the RTF file: - self.start = None - self.end = None - # raw object data encoded in hexadecimal, as found in the RTF file: - self.hexdata = None - # raw object data in binary form, decoded from hexadecimal - self.rawdata = None - # OLE object data (extracted from rawdata) - self.is_ole = False - self.oledata = None - self.format_id = None - self.class_name = None - self.oledata_size = None - # OLE Package data (extracted from oledata) - self.is_package = False - self.olepkgdata = None - self.filename = None - self.src_path = None - self.temp_path = None - - - -class RtfObjParser(RtfParser): - """ - Specialized RTF parser to extract OLE objects 
- """ - - def __init__(self, data): - super(RtfObjParser, self).__init__(data) - # list of RtfObjects found - self.objects = [] - - def open_destination(self, destination): - if destination.cword == b'objdata': - log.debug('*** Start object data at index %Xh' % destination.start) - - def close_destination(self, destination): - if destination.cword == b'objdata': - log.debug('*** Close object data at index %Xh' % self.index) - rtfobj = RtfObject() - self.objects.append(rtfobj) - rtfobj.start = destination.start - rtfobj.end = destination.end - # Filter out all whitespaces first (just ignored): - hexdata1 = destination.data.translate(None, b' \t\r\n\f\v') - # Then filter out any other non-hex character: - hexdata = re.sub(b'[^a-hA-H0-9]', b'', hexdata1) - if len(hexdata) < len(hexdata1): - # this is only for debugging: - nonhex = re.sub(b'[a-hA-H0-9]', b'', hexdata1) - log.debug('Found non-hex chars in hexdata: %r' % nonhex) - # MS Word accepts an extra hex digit, so we need to trim it if present: - if len(hexdata) & 1: - log.debug('Odd length, trimmed last byte.') - hexdata = hexdata[:-1] - rtfobj.hexdata = hexdata - object_data = binascii.unhexlify(hexdata) - rtfobj.rawdata = object_data - # TODO: check if all hex data is extracted properly - - obj = OleObject() - try: - obj.parse(object_data) - rtfobj.format_id = obj.format_id - rtfobj.class_name = obj.class_name - rtfobj.oledata_size = obj.data_size - rtfobj.oledata = obj.data - rtfobj.is_ole = True - if obj.class_name.lower() == 'package': - opkg = OleNativeStream(bindata=obj.data) - rtfobj.filename = opkg.filename - rtfobj.src_path = opkg.src_path - rtfobj.temp_path = opkg.temp_path - rtfobj.olepkgdata = opkg.data - rtfobj.is_package = True - except: - pass - log.debug('*** Not an OLE 1.0 Object') - - def bin(self, bindata): - if self.current_destination.cword == 'objdata': - # TODO: keep track of this, because it is unusual and indicates potential obfuscation - # trick: hexlify binary data, add it to hex data 
- self.current_destination.data += binascii.hexlify(bindata) - - def control_word(self, matchobject, cword, param): - # TODO: extract useful cwords such as objclass - # TODO: keep track of cwords inside objdata, because it is unusual and indicates potential obfuscation - # TODO: same with control symbols, and opening bracket - pass - - -#=== FUNCTIONS =============================================================== - -def rtf_iter_objects(filename, min_size=32): - """ - [DEPRECATED] Backward-compatible API, for applications using the old rtfobj: - Open a RTF file, extract each embedded object encoded in hexadecimal of - size > min_size, yield the index of the object in the RTF file, the original - length in the RTF file, and the decoded object data in binary format. - This is an iterator. - - :param filename: str, RTF file name/path to open on disk - :param min_size: ignored, kept for backward compatibility - :returns: iterator, yielding tuples (start index, original length, binary data) - """ - data = open(filename, 'rb').read() - rtfp = RtfObjParser(data) - rtfp.parse() - for obj in rtfp.objects: - orig_len = obj.end - obj.start - yield obj.start, orig_len, obj.rawdata - - - - - -def sanitize_filename(filename, replacement='_', max_length=200): - """compute basename of filename. Replaces all non-whitelisted characters. - The returned filename is always a basename of the file.""" - basepath = os.path.basename(filename).strip() - sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) - - while ".." 
in sane_fname:
-        sane_fname = sane_fname.replace('..', '.')
-
-    while "  " in sane_fname:
-        sane_fname = sane_fname.replace('  ', ' ')
-
-    if not len(filename):
-        sane_fname = 'NONAME'
-
-    # limit filename length
-    if max_length:
-        sane_fname = sane_fname[:max_length]
-
-    return sane_fname
-
-
-def process_file(container, filename, data, output_dir=None, save_object=False):
-    if output_dir:
-        if not os.path.isdir(output_dir):
-            log.info('creating output directory %s' % output_dir)
-            os.mkdir(output_dir)
-
-        fname_prefix = os.path.join(output_dir,
-                                    sanitize_filename(filename))
-    else:
-        base_dir = os.path.dirname(filename)
-        sane_fname = sanitize_filename(filename)
-        fname_prefix = os.path.join(base_dir, sane_fname)
-
-    # TODO: option to extract objects to files (false by default)
-    if data is None:
-        data = open(filename, 'rb').read()
-    print('='*79)
-    print('File: %r - size: %d bytes' % (filename, len(data)))
-    tstream = tablestream.TableStream(
-        column_width=(3, 10, 31, 31),
-        header_row=('id', 'index', 'OLE Object', 'OLE Package'),
-        style=tablestream.TableStyleSlim
-    )
-    rtfp = RtfObjParser(data)
-    rtfp.parse()
-    for rtfobj in rtfp.objects:
-        pkg_color = None
-        if rtfobj.is_ole:
-            ole_column = 'format_id: %d\n' % rtfobj.format_id
-            ole_column += 'class name: %r\n' % rtfobj.class_name
-            ole_column += 'data size: %d' % rtfobj.oledata_size
-            if rtfobj.is_package:
-                pkg_column = 'Filename: %r\n' % rtfobj.filename
-                pkg_column += 'Source path: %r\n' % rtfobj.src_path
-                pkg_column += 'Temp path: %r' % rtfobj.temp_path
-                pkg_color = 'yellow'
-                # check if the file extension is executable:
-                _, ext = os.path.splitext(rtfobj.filename)
-                log.debug('File extension: %r' % ext)
-                if re_executable_extensions.match(ext):
-                    pkg_color = 'red'
-                    pkg_column += '\nEXECUTABLE FILE'
-            else:
-                pkg_column = 'Not an OLE Package'
-        else:
-            pkg_column = ''
-            ole_column = 'Not a well-formed OLE object'
-        tstream.write_row((
-            rtfp.objects.index(rtfobj),
-            # filename,
-            '%08Xh' %
rtfobj.start, - ole_column, - pkg_column - ), colors=(None, None, None, pkg_color) - ) - tstream.write_sep() - if save_object: - if save_object == 'all': - objects = rtfp.objects - else: - try: - i = int(save_object) - objects = [ rtfp.objects[i] ] - except: - log.error('The -s option must be followed by an object index or all, such as "-s 2" or "-s all"') - return - for rtfobj in objects: - i = objects.index(rtfobj) - if rtfobj.is_package: - print('Saving file from OLE Package in object #%d:' % i) - print(' Filename = %r' % rtfobj.filename) - print(' Source path = %r' % rtfobj.src_path) - print(' Temp path = %r' % rtfobj.temp_path) - if rtfobj.filename: - fname = '%s_%s' % (fname_prefix, - sanitize_filename(rtfobj.filename)) - else: - fname = '%s_object_%08X.noname' % (fname_prefix, rtfobj.start) - print(' saving to file %s' % fname) - open(fname, 'wb').write(rtfobj.olepkgdata) - elif rtfobj.is_ole: - print('Saving file embedded in OLE object #%d:' % i) - print(' format_id = %d' % rtfobj.format_id) - print(' class name = %r' % rtfobj.class_name) - print(' data size = %d' % rtfobj.oledata_size) - # set a file extension according to the class name: - class_name = rtfobj.class_name.lower() - if class_name.startswith(b'word'): - ext = 'doc' - elif class_name.startswith(b'package'): - ext = 'package' - else: - ext = 'bin' - fname = '%s_object_%08X.%s' % (fname_prefix, rtfobj.start, ext) - print(' saving to file %s' % fname) - open(fname, 'wb').write(rtfobj.oledata) - else: - print('Saving raw data in object #%d:' % i) - fname = '%s_object_%08X.raw' % (fname_prefix, rtfobj.start) - print(' saving object to file %s' % fname) - open(fname, 'wb').write(rtfobj.rawdata) - - -#=== MAIN ================================================================= - -def main(): - # print banner with version - print ('rtfobj %s - http://decalage.info/python/oletools' % __version__) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at 
https://github.com/decalage2/oletools/issues')
-    print ('')
-
-    DEFAULT_LOG_LEVEL = "warning" # Default log level
-    LOG_LEVELS = {
-        'debug': logging.DEBUG,
-        'info': logging.INFO,
-        'warning': logging.WARNING,
-        'error': logging.ERROR,
-        'critical': logging.CRITICAL
-    }
-
-    usage = 'usage: %prog [options] <filename> [filename2 ...]'
-    parser = optparse.OptionParser(usage=usage)
-    # parser.add_option('-o', '--outfile', dest='outfile',
-    #     help='output file')
-    # parser.add_option('-c', '--csv', dest='csv',
-    #     help='export results to a CSV file')
-    parser.add_option("-r", action="store_true", dest="recursive",
-                      help='find files recursively in subdirectories.')
-    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
-                      help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
-    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
-                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
-    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
-                      help="logging level debug/info/warning/error/critical (default=%default)")
-    parser.add_option("-s", "--save", dest='save_object', type='str', default=None,
-                      help='Save the object corresponding to the provided number to a file, for example "-s 2".
Use "-s all" to save all objects at once.') - # parser.add_option("-o", "--outfile", dest='outfile', type='str', default=None, - # help='Filename to be used when saving an object to a file.') - parser.add_option("-d", type="str", dest="output_dir", - help='use specified directory to save output files.', default=None) - # parser.add_option("--pkg", action="store_true", dest="save_pkg", - # help='Save OLE Package binary data of extracted objects (file embedded into an OLE Package).') - # parser.add_option("--ole", action="store_true", dest="save_ole", - # help='Save OLE binary data of extracted objects (object data without the OLE container).') - # parser.add_option("--raw", action="store_true", dest="save_raw", - # help='Save raw binary data of extracted objects (decoded from hex, including the OLE container).') - # parser.add_option("--hex", action="store_true", dest="save_hex", - # help='Save raw hexadecimal data of extracted objects (including the OLE container).') - - - (options, args) = parser.parse_args() - - # Print help if no arguments are passed - if len(args) == 0: - print (__doc__) - parser.print_help() - sys.exit() - - # Setup logging to the console: - # here we use stdout instead of stderr by default, so that the output - # can be redirected properly. 
- logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout, - format='%(levelname)-8s %(message)s') - # enable logging in the modules: - log.setLevel(logging.NOTSET) - oleobj.log.setLevel(logging.NOTSET) - - for container, filename, data in xglob.iter_files(args, recursive=options.recursive, - zip_password=options.zip_password, zip_fname=options.zip_fname): - # ignore directory names stored in zip files: - if container and filename.endswith('/'): - continue - process_file(container, filename, data, output_dir=options.output_dir, - save_object=options.save_object) - - -if __name__ == '__main__': - main() - -# This code was developed while listening to The Mary Onettes "Lost" - diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/codes.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/codes.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/codes.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/codes.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,229 +0,0 @@ -"""Handles mapping between color names and ANSI codes and determining auto color codes.""" - -import sys -from collections import Mapping - -BASE_CODES = { - '/all': 0, 'b': 1, 'f': 2, 'i': 3, 'u': 4, 'flash': 5, 'outline': 6, 'negative': 7, 'invis': 8, 'strike': 9, - '/b': 22, '/f': 22, '/i': 23, '/u': 24, '/flash': 25, '/outline': 26, '/negative': 27, '/invis': 28, - '/strike': 29, '/fg': 39, '/bg': 49, - - 'black': 30, 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 'magenta': 35, 'cyan': 36, 'white': 37, - - 'bgblack': 40, 'bgred': 41, 'bggreen': 42, 'bgyellow': 43, 'bgblue': 44, 'bgmagenta': 45, 'bgcyan': 46, - 'bgwhite': 47, - - 'hiblack': 90, 'hired': 91, 'higreen': 92, 'hiyellow': 93, 'hiblue': 94, 'himagenta': 95, 'hicyan': 96, - 'hiwhite': 97, - - 'hibgblack': 100, 'hibgred': 101, 'hibggreen': 102, 'hibgyellow': 103, 'hibgblue': 104, 'hibgmagenta': 105, - 'hibgcyan': 106, 'hibgwhite': 107, - - 'autored': None, 
'autoblack': None, 'automagenta': None, 'autowhite': None, 'autoblue': None, 'autoyellow': None, - 'autogreen': None, 'autocyan': None, - - 'autobgred': None, 'autobgblack': None, 'autobgmagenta': None, 'autobgwhite': None, 'autobgblue': None, - 'autobgyellow': None, 'autobggreen': None, 'autobgcyan': None, - - '/black': 39, '/red': 39, '/green': 39, '/yellow': 39, '/blue': 39, '/magenta': 39, '/cyan': 39, '/white': 39, - '/hiblack': 39, '/hired': 39, '/higreen': 39, '/hiyellow': 39, '/hiblue': 39, '/himagenta': 39, '/hicyan': 39, - '/hiwhite': 39, - - '/bgblack': 49, '/bgred': 49, '/bggreen': 49, '/bgyellow': 49, '/bgblue': 49, '/bgmagenta': 49, '/bgcyan': 49, - '/bgwhite': 49, '/hibgblack': 49, '/hibgred': 49, '/hibggreen': 49, '/hibgyellow': 49, '/hibgblue': 49, - '/hibgmagenta': 49, '/hibgcyan': 49, '/hibgwhite': 49, - - '/autored': 39, '/autoblack': 39, '/automagenta': 39, '/autowhite': 39, '/autoblue': 39, '/autoyellow': 39, - '/autogreen': 39, '/autocyan': 39, - - '/autobgred': 49, '/autobgblack': 49, '/autobgmagenta': 49, '/autobgwhite': 49, '/autobgblue': 49, - '/autobgyellow': 49, '/autobggreen': 49, '/autobgcyan': 49, -} - - -class ANSICodeMapping(Mapping): - """Read-only dictionary, resolves closing tags and automatic colors. Iterates only used color tags. - - :cvar bool DISABLE_COLORS: Disable colors (strip color codes). - :cvar bool LIGHT_BACKGROUND: Use low intensity color codes. - """ - - DISABLE_COLORS = False - LIGHT_BACKGROUND = False - - def __init__(self, value_markup): - """Constructor. - - :param str value_markup: String with {color} tags. - """ - self.whitelist = [k for k in BASE_CODES if '{' + k + '}' in value_markup] - - def __getitem__(self, item): - """Return value for key or None if colors are disabled. - - :param str item: Key. - - :return: Color code integer. 
- :rtype: int
-        """
-        if item not in self.whitelist:
-            raise KeyError(item)
-        if self.DISABLE_COLORS:
-            return None
-        return getattr(self, item, BASE_CODES[item])
-
-    def __iter__(self):
-        """Iterate dictionary."""
-        return iter(self.whitelist)
-
-    def __len__(self):
-        """Dictionary length."""
-        return len(self.whitelist)
-
-    @classmethod
-    def disable_all_colors(cls):
-        """Disable all colors. Strips any color tags or codes."""
-        cls.DISABLE_COLORS = True
-
-    @classmethod
-    def enable_all_colors(cls):
-        """Enable all colors. Color tags are interpreted instead of stripped."""
-        cls.DISABLE_COLORS = False
-
-    @classmethod
-    def disable_if_no_tty(cls):
-        """Disable all colors only if there is no TTY available.
-
-        :return: True if colors are disabled, False if stderr or stdout is a TTY.
-        :rtype: bool
-        """
-        if sys.stdout.isatty() or sys.stderr.isatty():
-            return False
-        cls.disable_all_colors()
-        return True
-
-    @classmethod
-    def set_dark_background(cls):
-        """Choose light colors for all 'auto'-prefixed codes for readability on dark backgrounds."""
-        cls.LIGHT_BACKGROUND = False
-
-    @classmethod
-    def set_light_background(cls):
-        """Choose dark colors for all 'auto'-prefixed codes for readability on light backgrounds."""
-        cls.LIGHT_BACKGROUND = True
-
-    @property
-    def autoblack(self):
-        """Return automatic black foreground color depending on background color."""
-        return BASE_CODES['black' if ANSICodeMapping.LIGHT_BACKGROUND else 'hiblack']
-
-    @property
-    def autored(self):
-        """Return automatic red foreground color depending on background color."""
-        return BASE_CODES['red' if ANSICodeMapping.LIGHT_BACKGROUND else 'hired']
-
-    @property
-    def autogreen(self):
-        """Return automatic green foreground color depending on background color."""
-        return BASE_CODES['green' if ANSICodeMapping.LIGHT_BACKGROUND else 'higreen']
-
-    @property
-    def autoyellow(self):
-        """Return automatic yellow foreground color depending on background color."""
-        return BASE_CODES['yellow' if
ANSICodeMapping.LIGHT_BACKGROUND else 'hiyellow'] - - @property - def autoblue(self): - """Return automatic blue foreground color depending on background color.""" - return BASE_CODES['blue' if ANSICodeMapping.LIGHT_BACKGROUND else 'hiblue'] - - @property - def automagenta(self): - """Return automatic magenta foreground color depending on background color.""" - return BASE_CODES['magenta' if ANSICodeMapping.LIGHT_BACKGROUND else 'himagenta'] - - @property - def autocyan(self): - """Return automatic cyan foreground color depending on background color.""" - return BASE_CODES['cyan' if ANSICodeMapping.LIGHT_BACKGROUND else 'hicyan'] - - @property - def autowhite(self): - """Return automatic white foreground color depending on background color.""" - return BASE_CODES['white' if ANSICodeMapping.LIGHT_BACKGROUND else 'hiwhite'] - - @property - def autobgblack(self): - """Return automatic black background color depending on background color.""" - return BASE_CODES['bgblack' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgblack'] - - @property - def autobgred(self): - """Return automatic red background color depending on background color.""" - return BASE_CODES['bgred' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgred'] - - @property - def autobggreen(self): - """Return automatic green background color depending on background color.""" - return BASE_CODES['bggreen' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibggreen'] - - @property - def autobgyellow(self): - """Return automatic yellow background color depending on background color.""" - return BASE_CODES['bgyellow' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgyellow'] - - @property - def autobgblue(self): - """Return automatic blue background color depending on background color.""" - return BASE_CODES['bgblue' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgblue'] - - @property - def autobgmagenta(self): - """Return automatic magenta background color depending on background color.""" - return BASE_CODES['bgmagenta' if 
ANSICodeMapping.LIGHT_BACKGROUND else 'hibgmagenta'] - - @property - def autobgcyan(self): - """Return automatic cyan background color depending on background color.""" - return BASE_CODES['bgcyan' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgcyan'] - - @property - def autobgwhite(self): - """Return automatic white background color depending on background color.""" - return BASE_CODES['bgwhite' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgwhite'] - - -def list_tags(): - """List the available tags. - - :return: List of 4-item tuples: opening tag, closing tag, main ansi value, closing ansi value. - :rtype: list - """ - # Build reverse dictionary. Keys are closing tags, values are [closing ansi, opening tag, opening ansi]. - reverse_dict = dict() - for tag, ansi in sorted(BASE_CODES.items()): - if tag.startswith('/'): - reverse_dict[tag] = [ansi, None, None] - else: - reverse_dict['/' + tag][1:] = [tag, ansi] - - # Collapse - four_item_tuples = [(v[1], k, v[2], v[0]) for k, v in reverse_dict.items()] - - # Sort. - def sorter(four_item): - """Sort /all /fg /bg first, then b i u flash, then auto colors, then dark colors, finally light colors. - - :param iter four_item: [opening tag, closing tag, main ansi value, closing ansi value] - - :return Sorting weight. 
- :rtype: int - """ - if not four_item[2]: # /all /fg /bg - return four_item[3] - 200 - if four_item[2] < 10 or four_item[0].startswith('auto'): # b f i u or auto colors - return four_item[2] - 100 - return four_item[2] - four_item_tuples.sort(key=sorter) - - return four_item_tuples diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/color.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/color.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/color.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/color.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,220 +0,0 @@ -"""Color class used by library users.""" - -from colorclass.core import ColorStr - - -class Color(ColorStr): - """Unicode (str in Python3) subclass with ANSI terminal text color support. - - Example syntax: Color('{red}Sample Text{/red}') - - Example without parsing logic: Color('{red}Sample Text{/red}', keep_tags=True) - - For a list of codes, call: colorclass.list_tags() - """ - - @classmethod - def colorize(cls, color, string, auto=False): - """Color-code entire string using specified color. - - :param str color: Color of string. - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - tag = '{0}{1}'.format('auto' if auto else '', color) - return cls('{%s}%s{/%s}' % (tag, string, tag)) - - @classmethod - def black(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('black', string, auto=auto) - - @classmethod - def bgblack(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). 
- - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('bgblack', string, auto=auto) - - @classmethod - def red(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('red', string, auto=auto) - - @classmethod - def bgred(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('bgred', string, auto=auto) - - @classmethod - def green(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('green', string, auto=auto) - - @classmethod - def bggreen(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('bggreen', string, auto=auto) - - @classmethod - def yellow(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('yellow', string, auto=auto) - - @classmethod - def bgyellow(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. 
- :rtype: Color - """ - return cls.colorize('bgyellow', string, auto=auto) - - @classmethod - def blue(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('blue', string, auto=auto) - - @classmethod - def bgblue(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('bgblue', string, auto=auto) - - @classmethod - def magenta(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('magenta', string, auto=auto) - - @classmethod - def bgmagenta(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('bgmagenta', string, auto=auto) - - @classmethod - def cyan(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('cyan', string, auto=auto) - - @classmethod - def bgcyan(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. 
- :rtype: Color - """ - return cls.colorize('bgcyan', string, auto=auto) - - @classmethod - def white(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('white', string, auto=auto) - - @classmethod - def bgwhite(cls, string, auto=False): - """Color-code entire string. - - :param str string: String to colorize. - :param bool auto: Enable auto-color (dark/light terminal). - - :return: Class instance for colorized string. - :rtype: Color - """ - return cls.colorize('bgwhite', string, auto=auto) diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/core.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/core.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/core.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/core.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,342 +0,0 @@ -"""String subclass that handles ANSI color codes.""" - -from colorclass.codes import ANSICodeMapping -from colorclass.parse import parse_input, RE_SPLIT -from colorclass.search import build_color_index, find_char_color - -PARENT_CLASS = type(u'') - - -def apply_text(incoming, func): - """Call `func` on text portions of incoming color string. - - :param iter incoming: Incoming string/ColorStr/string-like object to iterate. - :param func: Function to call with string portion as first and only parameter. - - :return: Modified string, same class type as incoming string. 
- """ - split = RE_SPLIT.split(incoming) - for i, item in enumerate(split): - if not item or RE_SPLIT.match(item): - continue - split[i] = func(item) - return incoming.__class__().join(split) - - -class ColorBytes(bytes): - """Str (bytes in Python3) subclass, .decode() overridden to return unicode (str in Python3) subclass instance.""" - - def __new__(cls, *args, **kwargs): - """Save original class so decode() returns an instance of it.""" - original_class = kwargs.pop('original_class') - combined_args = [cls] + list(args) - instance = bytes.__new__(*combined_args, **kwargs) - instance.original_class = original_class - return instance - - def decode(self, encoding='utf-8', errors='strict'): - """Decode using the codec registered for encoding. Default encoding is 'utf-8'. - - errors may be given to set a different error handling scheme. Default is 'strict' meaning that encoding errors - raise a UnicodeDecodeError. Other possible values are 'ignore' and 'replace' as well as any other name - registered with codecs.register_error that is able to handle UnicodeDecodeErrors. - - :param str encoding: Codec. - :param str errors: Error handling scheme. - """ - original_class = getattr(self, 'original_class') - return original_class(super(ColorBytes, self).decode(encoding, errors)) - - -class ColorStr(PARENT_CLASS): - """Core color class.""" - - def __new__(cls, *args, **kwargs): - """Parse color markup and instantiate.""" - keep_tags = kwargs.pop('keep_tags', False) - - # Parse string. - value_markup = args[0] if args else PARENT_CLASS() # e.g. '{red}test{/red}' - value_colors, value_no_colors = parse_input(value_markup, ANSICodeMapping.DISABLE_COLORS, keep_tags) - color_index = build_color_index(value_colors) - - # Instantiate. - color_args = [cls, value_colors] + list(args[1:]) - instance = PARENT_CLASS.__new__(*color_args, **kwargs) - - # Add additional attributes and return. 
- instance.value_colors = value_colors - instance.value_no_colors = value_no_colors - instance.has_colors = value_colors != value_no_colors - instance.color_index = color_index - return instance - - def __add__(self, other): - """Concatenate.""" - return self.__class__(self.value_colors + other, keep_tags=True) - - def __getitem__(self, item): - """Retrieve character.""" - try: - color_pos = self.color_index[int(item)] - except TypeError: # slice - return super(ColorStr, self).__getitem__(item) - return self.__class__(find_char_color(self.value_colors, color_pos), keep_tags=True) - - def __iter__(self): - """Yield one color-coded character at a time.""" - for color_pos in self.color_index: - yield self.__class__(find_char_color(self.value_colors, color_pos)) - - def __len__(self): - """Length of string without color codes (what users expect).""" - return self.value_no_colors.__len__() - - def __mod__(self, other): - """String substitution (like printf).""" - return self.__class__(self.value_colors % other, keep_tags=True) - - def __mul__(self, other): - """Multiply string.""" - return self.__class__(self.value_colors * other, keep_tags=True) - - def __repr__(self): - """Representation of a class instance (like datetime.datetime.now()).""" - return '{name}({value})'.format(name=self.__class__.__name__, value=repr(self.value_colors)) - - def capitalize(self): - """Return a copy of the string with only its first character capitalized.""" - return apply_text(self, lambda s: s.capitalize()) - - def center(self, width, fillchar=None): - """Return centered in a string of length width. Padding is done using the specified fill character or space. - - :param int width: Length of output string. - :param str fillchar: Use this character instead of spaces. 
- """ - if fillchar is not None: - result = self.value_no_colors.center(width, fillchar) - else: - result = self.value_no_colors.center(width) - return self.__class__(result.replace(self.value_no_colors, self.value_colors), keep_tags=True) - - def count(self, sub, start=0, end=-1): - """Return the number of non-overlapping occurrences of substring sub in string[start:end]. - - Optional arguments start and end are interpreted as in slice notation. - - :param str sub: Substring to search. - :param int start: Beginning position. - :param int end: Stop comparison at this position. - """ - return self.value_no_colors.count(sub, start, end) - - def endswith(self, suffix, start=0, end=None): - """Return True if ends with the specified suffix, False otherwise. - - With optional start, test beginning at that position. With optional end, stop comparing at that position. - suffix can also be a tuple of strings to try. - - :param str suffix: Suffix to search. - :param int start: Beginning position. - :param int end: Stop comparison at this position. - """ - args = [suffix, start] + ([] if end is None else [end]) - return self.value_no_colors.endswith(*args) - - def encode(self, encoding=None, errors='strict'): - """Encode using the codec registered for encoding. encoding defaults to the default encoding. - - errors may be given to set a different error handling scheme. Default is 'strict' meaning that encoding errors - raise a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and 'xmlcharrefreplace' as well as any - other name registered with codecs.register_error that is able to handle UnicodeEncodeErrors. - - :param str encoding: Codec. - :param str errors: Error handling scheme. - """ - return ColorBytes(super(ColorStr, self).encode(encoding, errors), original_class=self.__class__) - - def decode(self, encoding=None, errors='strict'): - """Decode using the codec registered for encoding. encoding defaults to the default encoding. 
- - errors may be given to set a different error handling scheme. Default is 'strict' meaning that encoding errors - raise a UnicodeDecodeError. Other possible values are 'ignore' and 'replace' as well as any other name - registered with codecs.register_error that is able to handle UnicodeDecodeErrors. - - :param str encoding: Codec. - :param str errors: Error handling scheme. - """ - return self.__class__(super(ColorStr, self).decode(encoding, errors), keep_tags=True) - - def find(self, sub, start=None, end=None): - """Return the lowest index where substring sub is found, such that sub is contained within string[start:end]. - - Optional arguments start and end are interpreted as in slice notation. - - :param str sub: Substring to search. - :param int start: Beginning position. - :param int end: Stop comparison at this position. - """ - return self.value_no_colors.find(sub, start, end) - - def format(self, *args, **kwargs): - """Return a formatted version, using substitutions from args and kwargs. - - The substitutions are identified by braces ('{' and '}'). - """ - return self.__class__(super(ColorStr, self).format(*args, **kwargs), keep_tags=True) - - def index(self, sub, start=None, end=None): - """Like S.find() but raise ValueError when the substring is not found. - - :param str sub: Substring to search. - :param int start: Beginning position. - :param int end: Stop comparison at this position. 
- """ - return self.value_no_colors.index(sub, start, end) - - def isalnum(self): - """Return True if all characters in string are alphanumeric and there is at least one character in it.""" - return self.value_no_colors.isalnum() - - def isalpha(self): - """Return True if all characters in string are alphabetic and there is at least one character in it.""" - return self.value_no_colors.isalpha() - - def isdecimal(self): - """Return True if there are only decimal characters in string, False otherwise.""" - return self.value_no_colors.isdecimal() - - def isdigit(self): - """Return True if all characters in string are digits and there is at least one character in it.""" - return self.value_no_colors.isdigit() - - def isnumeric(self): - """Return True if there are only numeric characters in string, False otherwise.""" - return self.value_no_colors.isnumeric() - - def isspace(self): - """Return True if all characters in string are whitespace and there is at least one character in it.""" - return self.value_no_colors.isspace() - - def istitle(self): - """Return True if string is a titlecased string and there is at least one character in it. - - That is uppercase characters may only follow uncased characters and lowercase characters only cased ones. Return - False otherwise. - """ - return self.value_no_colors.istitle() - - def isupper(self): - """Return True if all cased characters are uppercase and there is at least one cased character in it.""" - return self.value_no_colors.isupper() - - def join(self, iterable): - """Return a string which is the concatenation of the strings in the iterable. - - :param iterable: Join items in this iterable. - """ - return self.__class__(super(ColorStr, self).join(iterable), keep_tags=True) - - def ljust(self, width, fillchar=None): - """Return left-justified string of length width. Padding is done using the specified fill character or space. - - :param int width: Length of output string. 
- :param str fillchar: Use this character instead of spaces. - """ - if fillchar is not None: - result = self.value_no_colors.ljust(width, fillchar) - else: - result = self.value_no_colors.ljust(width) - return self.__class__(result.replace(self.value_no_colors, self.value_colors), keep_tags=True) - - def rfind(self, sub, start=None, end=None): - """Return the highest index where substring sub is found, such that sub is contained within string[start:end]. - - Optional arguments start and end are interpreted as in slice notation. - - :param str sub: Substring to search. - :param int start: Beginning position. - :param int end: Stop comparison at this position. - """ - return self.value_no_colors.rfind(sub, start, end) - - def rindex(self, sub, start=None, end=None): - """Like .rfind() but raise ValueError when the substring is not found. - - :param str sub: Substring to search. - :param int start: Beginning position. - :param int end: Stop comparison at this position. - """ - return self.value_no_colors.rindex(sub, start, end) - - def rjust(self, width, fillchar=None): - """Return right-justified string of length width. Padding is done using the specified fill character or space. - - :param int width: Length of output string. - :param str fillchar: Use this character instead of spaces. - """ - if fillchar is not None: - result = self.value_no_colors.rjust(width, fillchar) - else: - result = self.value_no_colors.rjust(width) - return self.__class__(result.replace(self.value_no_colors, self.value_colors), keep_tags=True) - - def splitlines(self, keepends=False): - """Return a list of the lines in the string, breaking at line boundaries. - - Line breaks are not included in the resulting list unless keepends is given and True. - - :param bool keepends: Include linebreaks. 
- """ - return [self.__class__(l) for l in self.value_colors.splitlines(keepends)] - - def startswith(self, prefix, start=0, end=-1): - """Return True if string starts with the specified prefix, False otherwise. - - With optional start, test beginning at that position. With optional end, stop comparing at that position. prefix - can also be a tuple of strings to try. - - :param str prefix: Prefix to search. - :param int start: Beginning position. - :param int end: Stop comparison at this position. - """ - return self.value_no_colors.startswith(prefix, start, end) - - def swapcase(self): - """Return a copy of the string with uppercase characters converted to lowercase and vice versa.""" - return apply_text(self, lambda s: s.swapcase()) - - def title(self): - """Return a titlecased version of the string. - - That is words start with uppercase characters, all remaining cased characters have lowercase. - """ - return apply_text(self, lambda s: s.title()) - - def translate(self, table): - """Return a copy of the string, where all characters have been mapped through the given translation table. - - Table must be a mapping of Unicode ordinals to Unicode ordinals, strings, or None. Unmapped characters are left - untouched. Characters mapped to None are deleted. - - :param table: Translation table. - """ - return apply_text(self, lambda s: s.translate(table)) - - def upper(self): - """Return a copy of the string converted to uppercase.""" - return apply_text(self, lambda s: s.upper()) - - def zfill(self, width): - """Pad a numeric string with zeros on the left, to fill a field of the specified width. - - The string is never truncated. - - :param int width: Length of output string. 
- """ - if not self.value_no_colors: - result = self.value_no_colors.zfill(width) - else: - result = self.value_colors.replace(self.value_no_colors, self.value_no_colors.zfill(width)) - return self.__class__(result, keep_tags=True) diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/__init__.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/__init__.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/__init__.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -"""Colorful worry-free console applications for Linux, Mac OS X, and Windows. - -Supported natively on Linux and Mac OSX (Just Works), and on Windows it works the same if Windows.enable() is called. - -Gives you expected and sane results from methods like len() and .capitalize(). - -https://github.com/Robpol86/colorclass -https://pypi.python.org/pypi/colorclass -""" - -from colorclass.codes import list_tags # noqa -from colorclass.color import Color # noqa -from colorclass.toggles import disable_all_colors # noqa -from colorclass.toggles import disable_if_no_tty # noqa -from colorclass.toggles import enable_all_colors # noqa -from colorclass.toggles import is_enabled # noqa -from colorclass.toggles import is_light # noqa -from colorclass.toggles import set_dark_background # noqa -from colorclass.toggles import set_light_background # noqa -from colorclass.windows import Windows # noqa - - -__all__ = ( - 'Color', - 'disable_all_colors', - 'enable_all_colors', - 'is_enabled', - 'is_light', - 'list_tags', - 'set_dark_background', - 'set_light_background', - 'Windows', -) - - -__author__ = '@Robpol86' -__license__ = 'MIT' -__version__ = '2.2.0' diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/LICENSE.txt remnux-oletools-0.51a/oletools/thirdparty/colorclass/LICENSE.txt --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/LICENSE.txt 2016-11-04 21:28:21.000000000 
+0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2014 Robpol86 - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/__main__.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/__main__.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/__main__.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/__main__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,33 +0,0 @@ -"""Called by "python -m". Allows package to be used as a script. 
- -Example usage: -echo "{red}Red{/red}" |python -m colorclass -""" - -from __future__ import print_function - -import fileinput -import os - -from colorclass.color import Color -from colorclass.toggles import disable_all_colors -from colorclass.toggles import enable_all_colors -from colorclass.toggles import set_dark_background -from colorclass.toggles import set_light_background -from colorclass.windows import Windows - -TRUTHY = ('true', '1', 'yes', 'on') - - -if __name__ == '__main__': - if os.environ.get('COLOR_ENABLE', '').lower() in TRUTHY: - enable_all_colors() - elif os.environ.get('COLOR_DISABLE', '').lower() in TRUTHY: - disable_all_colors() - if os.environ.get('COLOR_LIGHT', '').lower() in TRUTHY: - set_light_background() - elif os.environ.get('COLOR_DARK', '').lower() in TRUTHY: - set_dark_background() - Windows.enable() - for LINE in fileinput.input(): - print(Color(LINE)) diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/parse.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/parse.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/parse.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/parse.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,96 +0,0 @@ -"""Parse color markup tags into ANSI escape sequences.""" - -import re - -from colorclass.codes import ANSICodeMapping, BASE_CODES - -CODE_GROUPS = ( - tuple(set(str(i) for i in BASE_CODES.values() if i and (40 <= i <= 49 or 100 <= i <= 109))), # bg colors - tuple(set(str(i) for i in BASE_CODES.values() if i and (30 <= i <= 39 or 90 <= i <= 99))), # fg colors - ('1', '22'), ('2', '22'), ('3', '23'), ('4', '24'), ('5', '25'), ('6', '26'), ('7', '27'), ('8', '28'), ('9', '29'), -) -RE_ANSI = re.compile(r'(\033\[([\d;]+)m)') -RE_COMBINE = re.compile(r'\033\[([\d;]+)m\033\[([\d;]+)m') -RE_SPLIT = re.compile(r'(\033\[[\d;]+m)') - - -def prune_overridden(ansi_string): - """Remove color codes that are rendered ineffective by subsequent 
codes in one escape sequence then sort codes. - - :param str ansi_string: Incoming ansi_string with ANSI color codes. - - :return: Color string with pruned color sequences. - :rtype: str - """ - multi_seqs = set(p for p in RE_ANSI.findall(ansi_string) if ';' in p[1]) # Sequences with multiple color codes. - - for escape, codes in multi_seqs: - r_codes = list(reversed(codes.split(';'))) - - # Nuke everything before {/all}. - try: - r_codes = r_codes[:r_codes.index('0') + 1] - except ValueError: - pass - - # Thin out groups. - for group in CODE_GROUPS: - for pos in reversed([i for i, n in enumerate(r_codes) if n in group][1:]): - r_codes.pop(pos) - - # Done. - reduced_codes = ';'.join(sorted(r_codes, key=int)) - if codes != reduced_codes: - ansi_string = ansi_string.replace(escape, '\033[' + reduced_codes + 'm') - - return ansi_string - - -def parse_input(tagged_string, disable_colors, keep_tags): - """Perform the actual conversion of tags to ANSI escaped codes. - - Provides a version of the input without any colors for len() and other methods. - - :param str tagged_string: The input unicode value. - :param bool disable_colors: Strip all colors in both outputs. - :param bool keep_tags: Skip parsing curly bracket tags into ANSI escape sequences. - - :return: 2-item tuple. First item is the parsed output. Second item is a version of the input without any colors. - :rtype: tuple - """ - codes = ANSICodeMapping(tagged_string) - output_colors = getattr(tagged_string, 'value_colors', tagged_string) - - # Convert: '{b}{red}' -> '\033[1m\033[31m' - if not keep_tags: - for tag, replacement in (('{' + k + '}', '' if v is None else '\033[%dm' % v) for k, v in codes.items()): - output_colors = output_colors.replace(tag, replacement) - - # Strip colors. 
- output_no_colors = RE_ANSI.sub('', output_colors) - if disable_colors: - return output_no_colors, output_no_colors - - # Combine: '\033[1m\033[31m' -> '\033[1;31m' - while True: - simplified = RE_COMBINE.sub(r'\033[\1;\2m', output_colors) - if simplified == output_colors: - break - output_colors = simplified - - # Prune: '\033[31;32;33;34;35m' -> '\033[35m' - output_colors = prune_overridden(output_colors) - - # Deduplicate: '\033[1;mT\033[1;mE\033[1;mS\033[1;mT' -> '\033[1;mTEST' - previous_escape = None - segments = list() - for item in (i for i in RE_SPLIT.split(output_colors) if i): - if RE_SPLIT.match(item): - if item != previous_escape: - segments.append(item) - previous_escape = item - else: - segments.append(item) - output_colors = ''.join(segments) - - return output_colors, output_no_colors diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/search.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/search.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/search.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/search.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -"""Determine color of characters that may or may not be adjacent to ANSI escape sequences.""" - -from colorclass.parse import RE_SPLIT - - -def build_color_index(ansi_string): - """Build an index between visible characters and a string with invisible color codes. - - :param str ansi_string: String with color codes (ANSI escape sequences). - - :return: Position of visible characters in color string (indexes match non-color string). - :rtype: tuple - """ - mapping = list() - color_offset = 0 - for item in (i for i in RE_SPLIT.split(ansi_string) if i): - if RE_SPLIT.match(item): - color_offset += len(item) - else: - for _ in range(len(item)): - mapping.append(color_offset) - color_offset += 1 - return tuple(mapping) - - -def find_char_color(ansi_string, pos): - """Determine what color a character is in the string. 
- - :param str ansi_string: String with color codes (ANSI escape sequences). - :param int pos: Position of the character in the ansi_string. - - :return: Character along with all surrounding color codes. - :rtype: str - """ - result = list() - position = 0 # Set to None when character is found. - for item in (i for i in RE_SPLIT.split(ansi_string) if i): - if RE_SPLIT.match(item): - result.append(item) - if position is not None: - position += len(item) - elif position is not None: - for char in item: - if position == pos: - result.append(char) - position = None - break - position += 1 - return ''.join(result) diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/toggles.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/toggles.py --- remnux-oletools-0.51a/oletools/thirdparty/colorclass/toggles.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/toggles.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -"""Convenience functions to enable/disable features.""" - -from colorclass.codes import ANSICodeMapping - - -def disable_all_colors(): - """Disable all colors. Strip any color tags or codes.""" - ANSICodeMapping.disable_all_colors() - - -def enable_all_colors(): - """Enable colors.""" - ANSICodeMapping.enable_all_colors() - - -def disable_if_no_tty(): - """Disable all colors if there is no TTY available. - - :return: True if colors are disabled, False if stderr or stdout is a TTY. 
- :rtype: bool
- """
- return ANSICodeMapping.disable_if_no_tty()
-
-
-def is_enabled():
- """Are colors enabled."""
- return not ANSICodeMapping.DISABLE_COLORS
-
-
-def set_light_background():
- """Choose dark colors for all 'auto'-prefixed codes for readability on light backgrounds."""
- ANSICodeMapping.set_light_background()
-
-
-def set_dark_background():
- """Choose light colors for all 'auto'-prefixed codes for readability on dark backgrounds."""
- ANSICodeMapping.set_dark_background()
-
-
-def is_light():
- """Are background colors for light backgrounds."""
- return ANSICodeMapping.LIGHT_BACKGROUND
diff -Nru remnux-oletools-0.51a/oletools/thirdparty/colorclass/windows.py remnux-oletools-0.51a/oletools/thirdparty/colorclass/windows.py
--- remnux-oletools-0.51a/oletools/thirdparty/colorclass/windows.py 2016-11-04 21:28:21.000000000 +0000
+++ remnux-oletools-0.51a/oletools/thirdparty/colorclass/windows.py 1970-01-01 00:00:00.000000000 +0000
@@ -1,388 +0,0 @@
-"""Windows console screen buffer handlers."""
-
-from __future__ import print_function
-
-import atexit
-import ctypes
-import re
-import sys
-
-from colorclass.codes import ANSICodeMapping, BASE_CODES
-from colorclass.core import RE_SPLIT
-
-ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
-INVALID_HANDLE_VALUE = -1
-IS_WINDOWS = sys.platform == 'win32'
-RE_NUMBER_SEARCH = re.compile(r'\033\[([\d;]+)m')
-STD_ERROR_HANDLE = -12
-STD_OUTPUT_HANDLE = -11
-WINDOWS_CODES = {
- '/all': -33, '/fg': -39, '/bg': -49,
-
- 'black': 0, 'red': 4, 'green': 2, 'yellow': 6, 'blue': 1, 'magenta': 5, 'cyan': 3, 'white': 7,
-
- 'bgblack': -8, 'bgred': 64, 'bggreen': 32, 'bgyellow': 96, 'bgblue': 16, 'bgmagenta': 80, 'bgcyan': 48,
- 'bgwhite': 112,
-
- 'hiblack': 8, 'hired': 12, 'higreen': 10, 'hiyellow': 14, 'hiblue': 9, 'himagenta': 13, 'hicyan': 11, 'hiwhite': 15,
-
- 'hibgblack': 128, 'hibgred': 192, 'hibggreen': 160, 'hibgyellow': 224, 'hibgblue': 144, 'hibgmagenta': 208,
- 'hibgcyan': 176, 'hibgwhite': 240,
-
- '/black':
-39, '/red': -39, '/green': -39, '/yellow': -39, '/blue': -39, '/magenta': -39, '/cyan': -39, - '/white': -39, '/hiblack': -39, '/hired': -39, '/higreen': -39, '/hiyellow': -39, '/hiblue': -39, '/himagenta': -39, - '/hicyan': -39, '/hiwhite': -39, - - '/bgblack': -49, '/bgred': -49, '/bggreen': -49, '/bgyellow': -49, '/bgblue': -49, '/bgmagenta': -49, - '/bgcyan': -49, '/bgwhite': -49, '/hibgblack': -49, '/hibgred': -49, '/hibggreen': -49, '/hibgyellow': -49, - '/hibgblue': -49, '/hibgmagenta': -49, '/hibgcyan': -49, '/hibgwhite': -49, -} - - -class COORD(ctypes.Structure): - """COORD structure. http://msdn.microsoft.com/en-us/library/windows/desktop/ms682119.""" - - _fields_ = [ - ('X', ctypes.c_short), - ('Y', ctypes.c_short), - ] - - -class SmallRECT(ctypes.Structure): - """SMALL_RECT structure. http://msdn.microsoft.com/en-us/library/windows/desktop/ms686311.""" - - _fields_ = [ - ('Left', ctypes.c_short), - ('Top', ctypes.c_short), - ('Right', ctypes.c_short), - ('Bottom', ctypes.c_short), - ] - - -class ConsoleScreenBufferInfo(ctypes.Structure): - """CONSOLE_SCREEN_BUFFER_INFO structure. http://msdn.microsoft.com/en-us/library/windows/desktop/ms682093.""" - - _fields_ = [ - ('dwSize', COORD), - ('dwCursorPosition', COORD), - ('wAttributes', ctypes.c_ushort), - ('srWindow', SmallRECT), - ('dwMaximumWindowSize', COORD) - ] - - -def init_kernel32(kernel32=None): - """Load a unique instance of WinDLL into memory, set arg/return types, and get stdout/err handles. - - 1. Since we are setting DLL function argument types and return types, we need to maintain our own instance of - kernel32 to prevent overriding (or being overwritten by) user's own changes to ctypes.windll.kernel32. - 2. While we're doing all this we might as well get the handles to STDOUT and STDERR streams. - 3. If either stream has already been replaced set return value to INVALID_HANDLE_VALUE to indicate it shouldn't be - replaced. - - :raise AttributeError: When called on a non-Windows platform. 
- - :param kernel32: Optional mock kernel32 object. For testing. - - :return: Loaded kernel32 instance, stderr handle (int), stdout handle (int). - :rtype: tuple - """ - if not kernel32: - kernel32 = ctypes.LibraryLoader(ctypes.WinDLL).kernel32 # Load our own instance. Unique memory address. - kernel32.GetStdHandle.argtypes = [ctypes.c_ulong] - kernel32.GetStdHandle.restype = ctypes.c_void_p - kernel32.GetConsoleScreenBufferInfo.argtypes = [ - ctypes.c_void_p, - ctypes.POINTER(ConsoleScreenBufferInfo), - ] - kernel32.GetConsoleScreenBufferInfo.restype = ctypes.c_long - - # Get handles. - if hasattr(sys.stderr, '_original_stream'): - stderr = INVALID_HANDLE_VALUE - else: - stderr = kernel32.GetStdHandle(STD_ERROR_HANDLE) - if hasattr(sys.stdout, '_original_stream'): - stdout = INVALID_HANDLE_VALUE - else: - stdout = kernel32.GetStdHandle(STD_OUTPUT_HANDLE) - - return kernel32, stderr, stdout - - -def get_console_info(kernel32, handle): - """Get information about this current console window. - - http://msdn.microsoft.com/en-us/library/windows/desktop/ms683231 - https://code.google.com/p/colorama/issues/detail?id=47 - https://bitbucket.org/pytest-dev/py/src/4617fe46/py/_io/terminalwriter.py - - Windows 10 Insider since around February 2016 finally introduced support for ANSI colors. No need to replace stdout - and stderr streams to intercept colors and issue multiple SetConsoleTextAttribute() calls for these consoles. - - :raise OSError: When GetConsoleScreenBufferInfo or GetConsoleMode API calls fail. - - :param ctypes.windll.kernel32 kernel32: Loaded kernel32 instance. - :param int handle: stderr or stdout handle. - - :return: Foreground and background colors (integers) as well as native ANSI support (bool). - :rtype: tuple - """ - # Query Win32 API. - csbi = ConsoleScreenBufferInfo() # Populated by GetConsoleScreenBufferInfo. - lpcsbi = ctypes.byref(csbi) - dword = ctypes.c_ulong() # Populated by GetConsoleMode. 
- lpdword = ctypes.byref(dword) - if not kernel32.GetConsoleScreenBufferInfo(handle, lpcsbi) or not kernel32.GetConsoleMode(handle, lpdword): - raise ctypes.WinError() - - # Parse data. - # buffer_width = int(csbi.dwSize.X - 1) - # buffer_height = int(csbi.dwSize.Y) - # terminal_width = int(csbi.srWindow.Right - csbi.srWindow.Left) - # terminal_height = int(csbi.srWindow.Bottom - csbi.srWindow.Top) - fg_color = csbi.wAttributes % 16 - bg_color = csbi.wAttributes & 240 - native_ansi = bool(dword.value & ENABLE_VIRTUAL_TERMINAL_PROCESSING) - - return fg_color, bg_color, native_ansi - - -def bg_color_native_ansi(kernel32, stderr, stdout): - """Get background color and if console supports ANSI colors natively for both streams. - - :param ctypes.windll.kernel32 kernel32: Loaded kernel32 instance. - :param int stderr: stderr handle. - :param int stdout: stdout handle. - - :return: Background color (int) and native ANSI support (bool). - :rtype: tuple - """ - try: - if stderr == INVALID_HANDLE_VALUE: - raise OSError - bg_color, native_ansi = get_console_info(kernel32, stderr)[1:] - except OSError: - try: - if stdout == INVALID_HANDLE_VALUE: - raise OSError - bg_color, native_ansi = get_console_info(kernel32, stdout)[1:] - except OSError: - bg_color, native_ansi = WINDOWS_CODES['black'], False - return bg_color, native_ansi - - -class WindowsStream(object): - """Replacement stream which overrides sys.stdout or sys.stderr. When writing or printing, ANSI codes are converted. - - ANSI (Linux/Unix) color codes are converted into win32 system calls, changing the next character's color before - printing it. 
Resources referenced: - https://github.com/tartley/colorama - http://www.cplusplus.com/articles/2ywTURfi/ - http://thomasfischer.biz/python-and-windows-terminal-colors/ - http://stackoverflow.com/questions/17125440/c-win32-console-color - http://www.tysos.org/svn/trunk/mono/corlib/System/WindowsConsoleDriver.cs - http://stackoverflow.com/questions/287871/print-in-terminal-with-colors-using-python - http://msdn.microsoft.com/en-us/library/windows/desktop/ms682088#_win32_character_attributes - - :cvar list ALL_BG_CODES: List of bg Windows codes. Used to determine if requested color is foreground or background. - :cvar dict COMPILED_CODES: Translation dict. Keys are ANSI codes (values of BASE_CODES), values are Windows codes. - :ivar int default_fg: Foreground Windows color code at the time of instantiation. - :ivar int default_bg: Background Windows color code at the time of instantiation. - """ - - ALL_BG_CODES = [v for k, v in WINDOWS_CODES.items() if k.startswith('bg') or k.startswith('hibg')] - COMPILED_CODES = dict((v, WINDOWS_CODES[k]) for k, v in BASE_CODES.items() if k in WINDOWS_CODES) - - def __init__(self, kernel32, stream_handle, original_stream): - """Constructor. - - :param ctypes.windll.kernel32 kernel32: Loaded kernel32 instance. - :param int stream_handle: stderr or stdout handle. - :param original_stream: sys.stderr or sys.stdout before being overridden by this class' instance. - """ - self._kernel32 = kernel32 - self._stream_handle = stream_handle - self._original_stream = original_stream - self.default_fg, self.default_bg = self.colors - - def __getattr__(self, item): - """If an attribute/function/etc is not defined in this function, retrieve the one from the original stream. - - Fixes ipython arrow key presses. 
- """ - return getattr(self._original_stream, item) - - @property - def colors(self): - """Return the current foreground and background colors.""" - try: - return get_console_info(self._kernel32, self._stream_handle)[:2] - except OSError: - return WINDOWS_CODES['white'], WINDOWS_CODES['black'] - - @colors.setter - def colors(self, color_code): - """Change the foreground and background colors for subsequently printed characters. - - None resets colors to their original values (when class was instantiated). - - Since setting a color requires including both foreground and background codes (merged), setting just the - foreground color resets the background color to black, and vice versa. - - This function first gets the current background and foreground colors, merges in the requested color code, and - sets the result. - - However if we need to remove just the foreground color but leave the background color the same (or vice versa) - such as when {/red} is used, we must merge the default foreground color with the current background color. This - is the reason for those negative values. - - :param int color_code: Color code from WINDOWS_CODES. - """ - if color_code is None: - color_code = WINDOWS_CODES['/all'] - - # Get current color code. - current_fg, current_bg = self.colors - - # Handle special negative codes. Also determine the final color code. - if color_code == WINDOWS_CODES['/fg']: - final_color_code = self.default_fg | current_bg # Reset the foreground only. - elif color_code == WINDOWS_CODES['/bg']: - final_color_code = current_fg | self.default_bg # Reset the background only. - elif color_code == WINDOWS_CODES['/all']: - final_color_code = self.default_fg | self.default_bg # Reset both. - elif color_code == WINDOWS_CODES['bgblack']: - final_color_code = current_fg # Black background. - else: - new_is_bg = color_code in self.ALL_BG_CODES - final_color_code = color_code | (current_fg if new_is_bg else current_bg) - - # Set new code. 
- self._kernel32.SetConsoleTextAttribute(self._stream_handle, final_color_code) - - def write(self, p_str): - """Write to stream. - - :param str p_str: string to print. - """ - for segment in RE_SPLIT.split(p_str): - if not segment: - # Empty string. p_str probably starts with colors so the first item is always ''. - continue - if not RE_SPLIT.match(segment): - # No color codes, print regular text. - print(segment, file=self._original_stream, end='') - self._original_stream.flush() - continue - for color_code in (int(c) for c in RE_NUMBER_SEARCH.findall(segment)[0].split(';')): - if color_code in self.COMPILED_CODES: - self.colors = self.COMPILED_CODES[color_code] - - -class Windows(object): - """Enable and disable Windows support for ANSI color character codes. - - Call static method Windows.enable() to enable color support for the remainder of the process' lifetime. - - This class is also a context manager. You can do this: - with Windows(): - print(Color('{autored}Test{/autored}')) - - Or this: - with Windows(auto_colors=True): - print(Color('{autored}Test{/autored}')) - """ - - @classmethod - def disable(cls): - """Restore sys.stderr and sys.stdout to their original objects. Resets colors to their original values. - - :return: If streams restored successfully. - :rtype: bool - """ - # Skip if not on Windows. - if not IS_WINDOWS: - return False - - # Restore default colors. - if hasattr(sys.stderr, '_original_stream'): - getattr(sys, 'stderr').color = None - if hasattr(sys.stdout, '_original_stream'): - getattr(sys, 'stdout').color = None - - # Restore original streams. 
- changed = False - if hasattr(sys.stderr, '_original_stream'): - changed = True - sys.stderr = getattr(sys.stderr, '_original_stream') - if hasattr(sys.stdout, '_original_stream'): - changed = True - sys.stdout = getattr(sys.stdout, '_original_stream') - - return changed - - @staticmethod - def is_enabled(): - """Return True if either stderr or stdout has colors enabled.""" - return hasattr(sys.stderr, '_original_stream') or hasattr(sys.stdout, '_original_stream') - - @classmethod - def enable(cls, auto_colors=False, reset_atexit=False): - """Enable color text with print() or sys.stdout.write() (stderr too). - - :param bool auto_colors: Automatically selects dark or light colors based on current terminal's background - color. Only works with {autored} and related tags. - :param bool reset_atexit: Resets original colors upon Python exit (in case you forget to reset it yourself with - a closing tag). Does nothing on native ANSI consoles. - - :return: If streams replaced successfully. - :rtype: bool - """ - if not IS_WINDOWS: - return False # Windows only. - - # Get values from init_kernel32(). - kernel32, stderr, stdout = init_kernel32() - if stderr == INVALID_HANDLE_VALUE and stdout == INVALID_HANDLE_VALUE: - return False # No valid handles, nothing to do. - - # Get console info. - bg_color, native_ansi = bg_color_native_ansi(kernel32, stderr, stdout) - - # Set auto colors: - if auto_colors: - if bg_color in (112, 96, 240, 176, 224, 208, 160): - ANSICodeMapping.set_light_background() - else: - ANSICodeMapping.set_dark_background() - - # Don't replace streams if ANSI codes are natively supported. - if native_ansi: - return False - - # Reset on exit if requested. - if reset_atexit: - atexit.register(cls.disable) - - # Overwrite stream references. 
- if stderr != INVALID_HANDLE_VALUE: - sys.stderr.flush() - sys.stderr = WindowsStream(kernel32, stderr, sys.stderr) - if stdout != INVALID_HANDLE_VALUE: - sys.stdout.flush() - sys.stdout = WindowsStream(kernel32, stdout, sys.stdout) - - return True - - def __init__(self, auto_colors=False): - """Constructor.""" - self.auto_colors = auto_colors - - def __enter__(self): - """Context manager, enables colors on Windows.""" - self.enable(auto_colors=self.auto_colors) - - def __exit__(self, *_): - """Context manager, disabled colors on Windows.""" - self.disable() diff -Nru remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py --- remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -# Written by @JamesHabben -# https://github.com/JamesHabben/MalwareStuff - -# 2015-01-27 Slight modifications from Philippe Lagadec (PL) to use it from olevba - -import sys - -def DridexUrlDecode (inputText) : - work = inputText[4:-4] - strKeyEnc = StripCharsWithZero(work[(len(work) / 2) - 2: (len(work) / 2)]) - strKeySize = StripCharsWithZero(work[(len(work) / 2): (len(work) / 2) + 2]) - nCharSize = strKeySize - strKeyEnc - work = work[:(len(work) / 2) - 2] + work[(len(work) / 2) + 2:] - strKeyEnc2 = StripChars(work[(len(work) / 2) - (nCharSize/2): (len(work) / 2) + (nCharSize/2)]) - work = work[:(len(work) / 2) - (nCharSize/2)] + work[(len(work) / 2) + (nCharSize/2):] - work_split = [work[i:i+nCharSize] for i in range(0, len(work), nCharSize)] - decoded = '' - for group in work_split: - # sys.stdout.write(chr(StripChars(group)/strKeyEnc2)) - decoded += chr(StripChars(group)/strKeyEnc2) - return decoded - -def StripChars (input) : - result = '' - for c in input : - if c.isdigit() : - result += 
c - return int(result) - -def StripCharsWithZero (input) : - result = '' - for c in input : - if c.isdigit() : - result += c - else: - result += '0' - return int(result) - - -# DridexUrlDecode("C3iY1epSRGe6q8g15xStVesdG717MAlg2H4hmV1vkL6Glnf0cknj") -# DridexUrlDecode("HLIY3Nf3z2k8jD37h1n2OM3N712DGQ3c5M841RZ8C5e6P1C50C4ym1oF504WyV182p4mJ16cK9Z61l47h2dU1rVB5V681sFY728i16H3E2Qm1fn47y2cgAo156j8T1s600hukKO1568X1xE4Z7d2q17jvcwgk816Yz32o9Q216Mpr0B01vcwg856a17b9j2zAmWf1536B1t7d92rI1FZ5E36Pu1jl504Z34tm2R43i55Lg2F3eLE3T28lLX1D504348Goe8Gbdp37w443ADy36X0h14g7Wb2G3u584kEG332Ut8ws3wO584pzSTf") -# DridexUrlDecode("YNPH1W47E211z3P6142cM4115K2J1696CURf1712N1OCJwc0w6Z16840Z1r600W16Z3273k6SR16Bf161Q92a016Vr16V1pc") diff -Nru remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/LICENSE.txt remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/LICENSE.txt --- remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/DridexUrlDecoder/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -DridexUrlDecoder.py is published by James Habben (@JamesHabben) -on https://github.com/JamesHabben/MalwareStuff -without explicit license. \ No newline at end of file diff -Nru remnux-oletools-0.51a/oletools/thirdparty/easygui/easygui.py remnux-oletools-0.51a/oletools/thirdparty/easygui/easygui.py --- remnux-oletools-0.51a/oletools/thirdparty/easygui/easygui.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/easygui/easygui.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,2492 +0,0 @@ -""" -@version: 0.96(2010-08-29) - -@note: -ABOUT EASYGUI - -EasyGui provides an easy-to-use interface for simple GUI interaction -with a user. It does not require the programmer to know anything about -tkinter, frames, widgets, callbacks or lambda. All GUI interactions are -invoked by simple function calls that return results. 
- -@note: -WARNING about using EasyGui with IDLE - -You may encounter problems using IDLE to run programs that use EasyGui. Try it -and find out. EasyGui is a collection of Tkinter routines that run their own -event loops. IDLE is also a Tkinter application, with its own event loop. The -two may conflict, with unpredictable results. If you find that you have -problems, try running your EasyGui program outside of IDLE. - -Note that EasyGui requires Tk release 8.0 or greater. - -@note: -LICENSE INFORMATION - -EasyGui version 0.96 - -Copyright (c) 2010, Stephen Raymond Ferg - -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - - 3. The name of the author may not be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -@note: -ABOUT THE EASYGUI LICENSE - -This license is what is generally known as the "modified BSD license", -aka "revised BSD", "new BSD", "3-clause BSD". -See http://www.opensource.org/licenses/bsd-license.php - -This license is GPL-compatible. -See http://en.wikipedia.org/wiki/License_compatibility -See http://www.gnu.org/licenses/license-list.html#GPLCompatibleLicenses - -The BSD License is less restrictive than GPL. -It allows software released under the license to be incorporated into proprietary products. -Works based on the software may be released under a proprietary license or as closed source software. -http://en.wikipedia.org/wiki/BSD_licenses#3-clause_license_.28.22New_BSD_License.22.29 - -""" -egversion = __doc__.split()[1] - -__all__ = ['ynbox' - , 'ccbox' - , 'boolbox' - , 'indexbox' - , 'msgbox' - , 'buttonbox' - , 'integerbox' - , 'multenterbox' - , 'enterbox' - , 'exceptionbox' - , 'choicebox' - , 'codebox' - , 'textbox' - , 'diropenbox' - , 'fileopenbox' - , 'filesavebox' - , 'passwordbox' - , 'multpasswordbox' - , 'multchoicebox' - , 'abouteasygui' - , 'egversion' - , 'egdemo' - , 'EgStore' - ] - -import sys, os -import string -import pickle -import traceback - - -#-------------------------------------------------- -# check python version and take appropriate action -#-------------------------------------------------- -""" -From the python documentation: - -sys.hexversion contains the version number encoded as a single integer. This is -guaranteed to increase with each version, including proper support for non- -production releases. For example, to test that the Python interpreter is at -least version 1.5.2, use: - -if sys.hexversion >= 0x010502F0: - # use some advanced feature - ... -else: - # use an alternative implementation or warn the user - ... 
-""" - - -if sys.hexversion >= 0x020600F0: - runningPython26 = True -else: - runningPython26 = False - -if sys.hexversion >= 0x030000F0: - runningPython3 = True -else: - runningPython3 = False - -try: - from PIL import Image as PILImage - from PIL import ImageTk as PILImageTk - PILisLoaded = True -except: - PILisLoaded = False - - -if runningPython3: - from tkinter import * - import tkinter.filedialog as tk_FileDialog - from io import StringIO -else: - from Tkinter import * - import tkFileDialog as tk_FileDialog - from StringIO import StringIO - -def write(*args): - args = [str(arg) for arg in args] - args = " ".join(args) - sys.stdout.write(args) - -def writeln(*args): - write(*args) - sys.stdout.write("\n") - -say = writeln - - -if TkVersion < 8.0 : - stars = "*"*75 - writeln("""\n\n\n""" + stars + """ -You are running Tk version: """ + str(TkVersion) + """ -You must be using Tk version 8.0 or greater to use EasyGui. -Terminating. -""" + stars + """\n\n\n""") - sys.exit(0) - -def dq(s): - return '"%s"' % s - -rootWindowPosition = "+300+200" - -PROPORTIONAL_FONT_FAMILY = ("MS", "Sans", "Serif") -MONOSPACE_FONT_FAMILY = ("Courier") - -PROPORTIONAL_FONT_SIZE = 10 -MONOSPACE_FONT_SIZE = 9 #a little smaller, because it it more legible at a smaller size -TEXT_ENTRY_FONT_SIZE = 12 # a little larger makes it easier to see - -#STANDARD_SELECTION_EVENTS = ["Return", "Button-1"] -STANDARD_SELECTION_EVENTS = ["Return", "Button-1", "space"] - -# Initialize some global variables that will be reset later -__choiceboxMultipleSelect = None -__widgetTexts = None -__replyButtonText = None -__choiceboxResults = None -__firstWidget = None -__enterboxText = None -__enterboxDefaultText="" -__multenterboxText = "" -choiceboxChoices = None -choiceboxWidget = None -entryWidget = None -boxRoot = None -ImageErrorMsg = ( - "\n\n---------------------------------------------\n" - "Error: %s\n%s") -#------------------------------------------------------------------- -# various boxes built on 
top of the basic buttonbox -#----------------------------------------------------------------------- - -#----------------------------------------------------------------------- -# ynbox -#----------------------------------------------------------------------- -def ynbox(msg="Shall I continue?" - , title=" " - , choices=("Yes", "No") - , image=None - ): - """ - Display a msgbox with choices of Yes and No. - - The default is "Yes". - - The returned value is calculated this way:: - if the first choice ("Yes") is chosen, or if the dialog is cancelled: - return 1 - else: - return 0 - - If invoked without a msg argument, displays a generic request for a confirmation - that the user wishes to continue. So it can be used this way:: - if ynbox(): pass # continue - else: sys.exit(0) # exit the program - - @arg msg: the msg to be displayed. - @arg title: the window title - @arg choices: a list or tuple of the choices to be displayed - """ - return boolbox(msg, title, choices, image=image) - - -#----------------------------------------------------------------------- -# ccbox -#----------------------------------------------------------------------- -def ccbox(msg="Shall I continue?" - , title=" " - , choices=("Continue", "Cancel") - , image=None - ): - """ - Display a msgbox with choices of Continue and Cancel. - - The default is "Continue". - - The returned value is calculated this way:: - if the first choice ("Continue") is chosen, or if the dialog is cancelled: - return 1 - else: - return 0 - - If invoked without a msg argument, displays a generic request for a confirmation - that the user wishes to continue. So it can be used this way:: - - if ccbox(): - pass # continue - else: - sys.exit(0) # exit the program - - @arg msg: the msg to be displayed. 
- @arg title: the window title - @arg choices: a list or tuple of the choices to be displayed - """ - return boolbox(msg, title, choices, image=image) - - -#----------------------------------------------------------------------- -# boolbox -#----------------------------------------------------------------------- -def boolbox(msg="Shall I continue?" - , title=" " - , choices=("Yes","No") - , image=None - ): - """ - Display a boolean msgbox. - - The default is the first choice. - - The returned value is calculated this way:: - if the first choice is chosen, or if the dialog is cancelled: - returns 1 - else: - returns 0 - """ - reply = buttonbox(msg=msg, choices=choices, title=title, image=image) - if reply == choices[0]: return 1 - else: return 0 - - -#----------------------------------------------------------------------- -# indexbox -#----------------------------------------------------------------------- -def indexbox(msg="Shall I continue?" - , title=" " - , choices=("Yes","No") - , image=None - ): - """ - Display a buttonbox with the specified choices. - Return the index of the choice selected. 
- """ - reply = buttonbox(msg=msg, choices=choices, title=title, image=image) - index = -1 - for choice in choices: - index = index + 1 - if reply == choice: return index - raise AssertionError( - "There is a program logic error in the EasyGui code for indexbox.") - - -#----------------------------------------------------------------------- -# msgbox -#----------------------------------------------------------------------- -def msgbox(msg="(Your message goes here)", title=" ", ok_button="OK",image=None,root=None): - """ - Display a messagebox - """ - if type(ok_button) != type("OK"): - raise AssertionError("The 'ok_button' argument to msgbox must be a string.") - - return buttonbox(msg=msg, title=title, choices=[ok_button], image=image,root=root) - - -#------------------------------------------------------------------- -# buttonbox -#------------------------------------------------------------------- -def buttonbox(msg="",title=" " - ,choices=("Button1", "Button2", "Button3") - , image=None - , root=None - ): - """ - Display a msg, a title, and a set of buttons. - The buttons are defined by the members of the choices list. - Return the text of the button that the user selected. - - @arg msg: the msg to be displayed. - @arg title: the window title - @arg choices: a list or tuple of the choices to be displayed - """ - global boxRoot, __replyButtonText, __widgetTexts, buttonsFrame - - - # Initialize __replyButtonText to the first choice. - # This is what will be used if the window is closed by the close button. 
- __replyButtonText = choices[0] - - if root: - root.withdraw() - boxRoot = Toplevel(master=root) - boxRoot.withdraw() - else: - boxRoot = Tk() - boxRoot.withdraw() - - boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose ) - boxRoot.title(title) - boxRoot.iconname('Dialog') - boxRoot.geometry(rootWindowPosition) - boxRoot.minsize(400, 100) - - # ------------- define the messageFrame --------------------------------- - messageFrame = Frame(master=boxRoot) - messageFrame.pack(side=TOP, fill=BOTH) - - # ------------- define the imageFrame --------------------------------- - tk_Image = None - if image: - imageFilename = os.path.normpath(image) - junk,ext = os.path.splitext(imageFilename) - - if os.path.exists(imageFilename): - if ext.lower() in [".gif", ".pgm", ".ppm"]: - tk_Image = PhotoImage(master=boxRoot, file=imageFilename) - else: - if PILisLoaded: - try: - pil_Image = PILImage.open(imageFilename) - tk_Image = PILImageTk.PhotoImage(pil_Image, master=boxRoot) - except: - msg += ImageErrorMsg % (imageFilename, - "\nThe Python Imaging Library (PIL) could not convert this file to a displayable image." - "\n\nPIL reports:\n" + exception_format()) - - else: # PIL is not loaded - msg += ImageErrorMsg % (imageFilename, - "\nI could not import the Python Imaging Library (PIL) to display the image.\n\n" - "You may need to install PIL\n" - "(http://www.pythonware.com/products/pil/)\n" - "to display " + ext + " image files.") - - else: - msg += ImageErrorMsg % (imageFilename, "\nImage file not found.") - - if tk_Image: - imageFrame = Frame(master=boxRoot) - imageFrame.pack(side=TOP, fill=BOTH) - label = Label(imageFrame,image=tk_Image) - label.image = tk_Image # keep a reference! 
- label.pack(side=TOP, expand=YES, fill=X, padx='1m', pady='1m') - - # ------------- define the buttonsFrame --------------------------------- - buttonsFrame = Frame(master=boxRoot) - buttonsFrame.pack(side=TOP, fill=BOTH) - - # -------------------- place the widgets in the frames ----------------------- - messageWidget = Message(messageFrame, text=msg, width=400) - messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) - messageWidget.pack(side=TOP, expand=YES, fill=X, padx='3m', pady='3m') - - __put_buttons_in_buttonframe(choices) - - # -------------- the action begins ----------- - # put the focus on the first button - __firstWidget.focus_force() - - boxRoot.deiconify() - boxRoot.mainloop() - boxRoot.destroy() - if root: root.deiconify() - return __replyButtonText - - -#------------------------------------------------------------------- -# integerbox -#------------------------------------------------------------------- -def integerbox(msg="" - , title=" " - , default="" - , lowerbound=0 - , upperbound=99 - , image = None - , root = None - , **invalidKeywordArguments - ): - """ - Show a box in which a user can enter an integer. - - In addition to arguments for msg and title, this function accepts - integer arguments for "default", "lowerbound", and "upperbound". - - The default argument may be None. - - When the user enters some text, the text is checked to verify that it - can be converted to an integer between the lowerbound and upperbound. - - If it can be, the integer (not the text) is returned. - - If it cannot, then an error msg is displayed, and the integerbox is - redisplayed. - - If the user cancels the operation, None is returned. - - NOTE that the "argLowerBound" and "argUpperBound" arguments are no longer - supported. They have been replaced by "upperbound" and "lowerbound". 
- """ - if "argLowerBound" in invalidKeywordArguments: - raise AssertionError( - "\nintegerbox no longer supports the 'argLowerBound' argument.\n" - + "Use 'lowerbound' instead.\n\n") - if "argUpperBound" in invalidKeywordArguments: - raise AssertionError( - "\nintegerbox no longer supports the 'argUpperBound' argument.\n" - + "Use 'upperbound' instead.\n\n") - - if default != "": - if type(default) != type(1): - raise AssertionError( - "integerbox received a non-integer value for " - + "default of " + dq(str(default)) , "Error") - - if type(lowerbound) != type(1): - raise AssertionError( - "integerbox received a non-integer value for " - + "lowerbound of " + dq(str(lowerbound)) , "Error") - - if type(upperbound) != type(1): - raise AssertionError( - "integerbox received a non-integer value for " - + "upperbound of " + dq(str(upperbound)) , "Error") - - if msg == "": - msg = ("Enter an integer between " + str(lowerbound) - + " and " - + str(upperbound) - ) - - while 1: - reply = enterbox(msg, title, str(default), image=image, root=root) - if reply == None: return None - - try: - reply = int(reply) - except: - msgbox ("The value that you entered:\n\t%s\nis not an integer." % dq(str(reply)) - , "Error") - continue - - if reply < lowerbound: - msgbox ("The value that you entered is less than the lower bound of " - + str(lowerbound) + ".", "Error") - continue - - if reply > upperbound: - msgbox ("The value that you entered is greater than the upper bound of " - + str(upperbound) + ".", "Error") - continue - - # reply has passed all validation checks. - # It is an integer between the specified bounds. - return reply - -#------------------------------------------------------------------- -# multenterbox -#------------------------------------------------------------------- -def multenterbox(msg="Fill in values for the fields." - , title=" " - , fields=() - , values=() - ): - r""" - Show screen with multiple data entry fields. 
- - If there are fewer values than names, the list of values is padded with - empty strings until the number of values is the same as the number of names. - - If there are more values than names, the list of values - is truncated so that there are as many values as names. - - Returns a list of the values of the fields, - or None if the user cancels the operation. - - Here is some example code, that shows how values returned from - multenterbox can be checked for validity before they are accepted:: - ---------------------------------------------------------------------- - msg = "Enter your personal information" - title = "Credit Card Application" - fieldNames = ["Name","Street Address","City","State","ZipCode"] - fieldValues = [] # we start with blanks for the values - fieldValues = multenterbox(msg,title, fieldNames) - - # make sure that none of the fields was left blank - while 1: - if fieldValues == None: break - errmsg = "" - for i in range(len(fieldNames)): - if fieldValues[i].strip() == "": - errmsg += ('"%s" is a required field.\n\n' % fieldNames[i]) - if errmsg == "": - break # no problems found - fieldValues = multenterbox(errmsg, title, fieldNames, fieldValues) - - writeln("Reply was: %s" % str(fieldValues)) - ---------------------------------------------------------------------- - - @arg msg: the msg to be displayed. - @arg title: the window title - @arg fields: a list of fieldnames. - @arg values: a list of field values - """ - return __multfillablebox(msg,title,fields,values,None) - - -#----------------------------------------------------------------------- -# multpasswordbox -#----------------------------------------------------------------------- -def multpasswordbox(msg="Fill in values for the fields." - , title=" " - , fields=tuple() - ,values=tuple() - ): - r""" - Same interface as multenterbox. But in multpassword box, - the last of the fields is assumed to be a password, and - is masked with asterisks. 
-
-    Example
-    =======
-
-    Here is some example code, that shows how values returned from
-    multpasswordbox can be checked for validity before they are accepted::
-        msg = "Enter logon information"
-        title = "Demo of multpasswordbox"
-        fieldNames = ["Server ID", "User ID", "Password"]
-        fieldValues = []  # we start with blanks for the values
-        fieldValues = multpasswordbox(msg,title, fieldNames)
-
-        # make sure that none of the fields was left blank
-        while 1:
-            if fieldValues == None: break
-            errmsg = ""
-            for i in range(len(fieldNames)):
-                if fieldValues[i].strip() == "":
-                    errmsg = errmsg + ('"%s" is a required field.\n\n' % fieldNames[i])
-            if errmsg == "": break # no problems found
-            fieldValues = multpasswordbox(errmsg, title, fieldNames, fieldValues)
-
-        writeln("Reply was: %s" % str(fieldValues))
-    """
-    return __multfillablebox(msg,title,fields,values,"*")
-
-def bindArrows(widget):
-    widget.bind("<Down>", tabRight)
-    widget.bind("<Up>" , tabLeft)
-
-    widget.bind("<Right>",tabRight)
-    widget.bind("<Left>" , tabLeft)
-
-def tabRight(event):
-    boxRoot.event_generate("<Tab>")
-
-def tabLeft(event):
-    boxRoot.event_generate("<Shift-Tab>")
-
-#-----------------------------------------------------------------------
-# __multfillablebox
-#-----------------------------------------------------------------------
-def __multfillablebox(msg="Fill in values for the fields."
-    , title=" "
-    , fields=()
-    , values=()
-    , mask = None
-    ):
-    global boxRoot, __multenterboxText, __multenterboxDefaultText, cancelButton, entryWidget, okButton
-
-    choices = ["OK", "Cancel"]
-    if len(fields) == 0: return None
-
-    fields = list(fields[:])  # convert possible tuples to a list
-    values = list(values[:])  # convert possible tuples to a list
-
-    if len(values) == len(fields): pass
-    elif len(values) > len(fields):
-        fields = fields[0:len(values)]
-    else:
-        while len(values) < len(fields):
-            values.append("")
-
-    boxRoot = Tk()
-
-    boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose )
-    boxRoot.title(title)
-    boxRoot.iconname('Dialog')
-    boxRoot.geometry(rootWindowPosition)
-    boxRoot.bind("<Escape>", __multenterboxCancel)
-
-    # -------------------- put subframes in the boxRoot --------------------
-    messageFrame = Frame(master=boxRoot)
-    messageFrame.pack(side=TOP, fill=BOTH)
-
-    #-------------------- the msg widget ----------------------------
-    messageWidget = Message(messageFrame, width="4.5i", text=msg)
-    messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE))
-    messageWidget.pack(side=RIGHT, expand=1, fill=BOTH, padx='3m', pady='3m')
-
-    global entryWidgets
-    entryWidgets = []
-
-    lastWidgetIndex = len(fields) - 1
-
-    for widgetIndex in range(len(fields)):
-        argFieldName = fields[widgetIndex]
-        argFieldValue = values[widgetIndex]
-        entryFrame = Frame(master=boxRoot)
-        entryFrame.pack(side=TOP, fill=BOTH)
-
-        # --------- entryWidget ----------------------------------------------
-        labelWidget = Label(entryFrame, text=argFieldName)
-        labelWidget.pack(side=LEFT)
-
-        entryWidget = Entry(entryFrame, width=40,highlightthickness=2)
-        entryWidgets.append(entryWidget)
-        entryWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,TEXT_ENTRY_FONT_SIZE))
-        entryWidget.pack(side=RIGHT, padx="3m")
-
-        bindArrows(entryWidget)
-
-        entryWidget.bind("<Return>", __multenterboxGetText)
-        entryWidget.bind("<Escape>", __multenterboxCancel)
-
-        # for the last entryWidget, if this is a multpasswordbox,
-        # show the contents as just asterisks
-        if widgetIndex == lastWidgetIndex:
-            if mask:
-                entryWidgets[widgetIndex].configure(show=mask)
-
-        # put text into the entryWidget
-        entryWidgets[widgetIndex].insert(0,argFieldValue)
-        widgetIndex += 1
-
-    # ------------------ ok button -------------------------------
-    buttonsFrame = Frame(master=boxRoot)
-    buttonsFrame.pack(side=BOTTOM, fill=BOTH)
-
-    okButton = Button(buttonsFrame, takefocus=1, text="OK")
-    bindArrows(okButton)
-    okButton.pack(expand=1, side=LEFT, padx='3m', pady='3m', ipadx='2m', ipady='1m')
-
-    # for the commandButton, bind activation events to the activation event handler
-    commandButton = okButton
-    handler = __multenterboxGetText
-    for selectionEvent in STANDARD_SELECTION_EVENTS:
-        commandButton.bind("<%s>" % selectionEvent, handler)
-
-
-    # ------------------ cancel button -------------------------------
-    cancelButton = Button(buttonsFrame, takefocus=1, text="Cancel")
-    bindArrows(cancelButton)
-    cancelButton.pack(expand=1, side=RIGHT, padx='3m', pady='3m', ipadx='2m', ipady='1m')
-
-    # for the commandButton, bind activation events to the activation event handler
-    commandButton = cancelButton
-    handler = __multenterboxCancel
-    for selectionEvent in STANDARD_SELECTION_EVENTS:
-        commandButton.bind("<%s>" % selectionEvent, handler)
-
-
-    # ------------------- time for action! -----------------
-    entryWidgets[0].focus_force()  # put the focus on the entryWidget
-    boxRoot.mainloop()  # run it!
- - # -------- after the run has completed ---------------------------------- - boxRoot.destroy() # button_click didn't destroy boxRoot, so we do it now - return __multenterboxText - - -#----------------------------------------------------------------------- -# __multenterboxGetText -#----------------------------------------------------------------------- -def __multenterboxGetText(event): - global __multenterboxText - - __multenterboxText = [] - for entryWidget in entryWidgets: - __multenterboxText.append(entryWidget.get()) - boxRoot.quit() - - -def __multenterboxCancel(event): - global __multenterboxText - __multenterboxText = None - boxRoot.quit() - - -#------------------------------------------------------------------- -# enterbox -#------------------------------------------------------------------- -def enterbox(msg="Enter something." - , title=" " - , default="" - , strip=True - , image=None - , root=None - ): - """ - Show a box in which a user can enter some text. - - You may optionally specify some default text, which will appear in the - enterbox when it is displayed. - - Returns the text that the user entered, or None if he cancels the operation. - - By default, enterbox strips its result (i.e. removes leading and trailing - whitespace). (If you want it not to strip, use keyword argument: strip=False.) - This makes it easier to test the results of the call:: - - reply = enterbox(....) - if reply: - ... - else: - ... - """ - result = __fillablebox(msg, title, default=default, mask=None,image=image,root=root) - if result and strip: - result = result.strip() - return result - - -def passwordbox(msg="Enter your password." - , title=" " - , default="" - , image=None - , root=None - ): - """ - Show a box in which a user can enter a password. - The text is masked with asterisks, so the password is not displayed. - Returns the text that the user entered, or None if he cancels the operation. 
-    """
-    return __fillablebox(msg, title, default, mask="*",image=image,root=root)
-
-
-def __fillablebox(msg
-    , title=""
-    , default=""
-    , mask=None
-    , image=None
-    , root=None
-    ):
-    """
-    Show a box in which a user can enter some text.
-    You may optionally specify some default text, which will appear in the
-    enterbox when it is displayed.
-    Returns the text that the user entered, or None if he cancels the operation.
-    """
-
-    global boxRoot, __enterboxText, __enterboxDefaultText
-    global cancelButton, entryWidget, okButton
-
-    if title == None: title == ""
-    if default == None: default = ""
-    __enterboxDefaultText = default
-    __enterboxText = __enterboxDefaultText
-
-    if root:
-        root.withdraw()
-        boxRoot = Toplevel(master=root)
-        boxRoot.withdraw()
-    else:
-        boxRoot = Tk()
-        boxRoot.withdraw()
-
-    boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose )
-    boxRoot.title(title)
-    boxRoot.iconname('Dialog')
-    boxRoot.geometry(rootWindowPosition)
-    boxRoot.bind("<Escape>", __enterboxCancel)
-
-    # ------------- define the messageFrame ---------------------------------
-    messageFrame = Frame(master=boxRoot)
-    messageFrame.pack(side=TOP, fill=BOTH)
-
-    # ------------- define the imageFrame ---------------------------------
-    tk_Image = None
-    if image:
-        imageFilename = os.path.normpath(image)
-        junk,ext = os.path.splitext(imageFilename)
-
-        if os.path.exists(imageFilename):
-            if ext.lower() in [".gif", ".pgm", ".ppm"]:
-                tk_Image = PhotoImage(master=boxRoot, file=imageFilename)
-            else:
-                if PILisLoaded:
-                    try:
-                        pil_Image = PILImage.open(imageFilename)
-                        tk_Image = PILImageTk.PhotoImage(pil_Image, master=boxRoot)
-                    except:
-                        msg += ImageErrorMsg % (imageFilename,
-                            "\nThe Python Imaging Library (PIL) could not convert this file to a displayable image."
-                            "\n\nPIL reports:\n" + exception_format())
-
-                else: # PIL is not loaded
-                    msg += ImageErrorMsg % (imageFilename,
-                        "\nI could not import the Python Imaging Library (PIL) to display the image.\n\n"
-                        "You may need to install PIL\n"
-                        "(http://www.pythonware.com/products/pil/)\n"
-                        "to display " + ext + " image files.")
-
-        else:
-            msg += ImageErrorMsg % (imageFilename, "\nImage file not found.")
-
-    if tk_Image:
-        imageFrame = Frame(master=boxRoot)
-        imageFrame.pack(side=TOP, fill=BOTH)
-        label = Label(imageFrame,image=tk_Image)
-        label.image = tk_Image # keep a reference!
-        label.pack(side=TOP, expand=YES, fill=X, padx='1m', pady='1m')
-
-    # ------------- define the buttonsFrame ---------------------------------
-    buttonsFrame = Frame(master=boxRoot)
-    buttonsFrame.pack(side=TOP, fill=BOTH)
-
-
-    # ------------- define the entryFrame ---------------------------------
-    entryFrame = Frame(master=boxRoot)
-    entryFrame.pack(side=TOP, fill=BOTH)
-
-    # ------------- define the buttonsFrame ---------------------------------
-    buttonsFrame = Frame(master=boxRoot)
-    buttonsFrame.pack(side=TOP, fill=BOTH)
-
-    #-------------------- the msg widget ----------------------------
-    messageWidget = Message(messageFrame, width="4.5i", text=msg)
-    messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE))
-    messageWidget.pack(side=RIGHT, expand=1, fill=BOTH, padx='3m', pady='3m')
-
-    # --------- entryWidget ----------------------------------------------
-    entryWidget = Entry(entryFrame, width=40)
-    bindArrows(entryWidget)
-    entryWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,TEXT_ENTRY_FONT_SIZE))
-    if mask:
-        entryWidget.configure(show=mask)
-    entryWidget.pack(side=LEFT, padx="3m")
-    entryWidget.bind("<Return>", __enterboxGetText)
-    entryWidget.bind("<Escape>", __enterboxCancel)
-    # put text into the entryWidget
-    entryWidget.insert(0,__enterboxDefaultText)
-
-    # ------------------ ok button -------------------------------
-    okButton = Button(buttonsFrame, takefocus=1, text="OK")
-    bindArrows(okButton)
-    okButton.pack(expand=1, side=LEFT, padx='3m', pady='3m', ipadx='2m', ipady='1m')
-
-    # for the commandButton, bind activation events to the activation event handler
-    commandButton = okButton
-    handler = __enterboxGetText
-    for selectionEvent in STANDARD_SELECTION_EVENTS:
-        commandButton.bind("<%s>" % selectionEvent, handler)
-
-
-    # ------------------ cancel button -------------------------------
-    cancelButton = Button(buttonsFrame, takefocus=1, text="Cancel")
-    bindArrows(cancelButton)
-    cancelButton.pack(expand=1, side=RIGHT, padx='3m', pady='3m', ipadx='2m', ipady='1m')
-
-    # for the commandButton, bind activation events to the activation event handler
-    commandButton = cancelButton
-    handler = __enterboxCancel
-    for selectionEvent in STANDARD_SELECTION_EVENTS:
-        commandButton.bind("<%s>" % selectionEvent, handler)
-
-    # ------------------- time for action! -----------------
-    entryWidget.focus_force()  # put the focus on the entryWidget
-    boxRoot.deiconify()
-    boxRoot.mainloop()  # run it!
-
-    # -------- after the run has completed ----------------------------------
-    if root: root.deiconify()
-    boxRoot.destroy()  # button_click didn't destroy boxRoot, so we do it now
-    return __enterboxText
-
-
-def __enterboxGetText(event):
-    global __enterboxText
-
-    __enterboxText = entryWidget.get()
-    boxRoot.quit()
-
-
-def __enterboxRestore(event):
-    global entryWidget
-
-    entryWidget.delete(0,len(entryWidget.get()))
-    entryWidget.insert(0, __enterboxDefaultText)
-
-
-def __enterboxCancel(event):
-    global __enterboxText
-
-    __enterboxText = None
-    boxRoot.quit()
-
-def denyWindowManagerClose():
-    """ don't allow WindowManager close
-    """
-    x = Tk()
-    x.withdraw()
-    x.bell()
-    x.destroy()
-
-
-
-#-------------------------------------------------------------------
-# multchoicebox
-#-------------------------------------------------------------------
-def multchoicebox(msg="Pick as many items as you like."
- , title=" " - , choices=() - , **kwargs - ): - """ - Present the user with a list of choices. - allow him to select multiple items and return them in a list. - if the user doesn't choose anything from the list, return the empty list. - return None if he cancelled selection. - - @arg msg: the msg to be displayed. - @arg title: the window title - @arg choices: a list or tuple of the choices to be displayed - """ - if len(choices) == 0: choices = ["Program logic error - no choices were specified."] - - global __choiceboxMultipleSelect - __choiceboxMultipleSelect = 1 - return __choicebox(msg, title, choices) - - -#----------------------------------------------------------------------- -# choicebox -#----------------------------------------------------------------------- -def choicebox(msg="Pick something." - , title=" " - , choices=() - ): - """ - Present the user with a list of choices. - return the choice that he selects. - return None if he cancels the selection selection. - - @arg msg: the msg to be displayed. - @arg title: the window title - @arg choices: a list or tuple of the choices to be displayed - """ - if len(choices) == 0: choices = ["Program logic error - no choices were specified."] - - global __choiceboxMultipleSelect - __choiceboxMultipleSelect = 0 - return __choicebox(msg,title,choices) - - -#----------------------------------------------------------------------- -# __choicebox -#----------------------------------------------------------------------- -def __choicebox(msg - , title - , choices - ): - """ - internal routine to support choicebox() and multchoicebox() - """ - global boxRoot, __choiceboxResults, choiceboxWidget, defaultText - global choiceboxWidget, choiceboxChoices - #------------------------------------------------------------------- - # If choices is a tuple, we make it a list so we can sort it. 
- # If choices is already a list, we make a new list, so that when - # we sort the choices, we don't affect the list object that we - # were given. - #------------------------------------------------------------------- - choices = list(choices[:]) - if len(choices) == 0: - choices = ["Program logic error - no choices were specified."] - defaultButtons = ["OK", "Cancel"] - - # make sure all choices are strings - for index in range(len(choices)): - choices[index] = str(choices[index]) - - lines_to_show = min(len(choices), 20) - lines_to_show = 20 - - if title == None: title = "" - - # Initialize __choiceboxResults - # This is the value that will be returned if the user clicks the close icon - __choiceboxResults = None - - boxRoot = Tk() - boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose ) - screen_width = boxRoot.winfo_screenwidth() - screen_height = boxRoot.winfo_screenheight() - root_width = int((screen_width * 0.8)) - root_height = int((screen_height * 0.5)) - root_xpos = int((screen_width * 0.1)) - root_ypos = int((screen_height * 0.05)) - - boxRoot.title(title) - boxRoot.iconname('Dialog') - rootWindowPosition = "+0+0" - boxRoot.geometry(rootWindowPosition) - boxRoot.expand=NO - boxRoot.minsize(root_width, root_height) - rootWindowPosition = "+" + str(root_xpos) + "+" + str(root_ypos) - boxRoot.geometry(rootWindowPosition) - - # ---------------- put the frames in the window ----------------------------------------- - message_and_buttonsFrame = Frame(master=boxRoot) - message_and_buttonsFrame.pack(side=TOP, fill=X, expand=NO) - - messageFrame = Frame(message_and_buttonsFrame) - messageFrame.pack(side=LEFT, fill=X, expand=YES) - #messageFrame.pack(side=TOP, fill=X, expand=YES) - - buttonsFrame = Frame(message_and_buttonsFrame) - buttonsFrame.pack(side=RIGHT, expand=NO, pady=0) - #buttonsFrame.pack(side=TOP, expand=YES, pady=0) - - choiceboxFrame = Frame(master=boxRoot) - choiceboxFrame.pack(side=BOTTOM, fill=BOTH, expand=YES) - - # 
-------------------------- put the widgets in the frames ------------------------------ - - # ---------- put a msg widget in the msg frame------------------- - messageWidget = Message(messageFrame, anchor=NW, text=msg, width=int(root_width * 0.9)) - messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) - messageWidget.pack(side=LEFT, expand=YES, fill=BOTH, padx='1m', pady='1m') - - # -------- put the choiceboxWidget in the choiceboxFrame --------------------------- - choiceboxWidget = Listbox(choiceboxFrame - , height=lines_to_show - , borderwidth="1m" - , relief="flat" - , bg="white" - ) - - if __choiceboxMultipleSelect: - choiceboxWidget.configure(selectmode=MULTIPLE) - - choiceboxWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) - - # add a vertical scrollbar to the frame - rightScrollbar = Scrollbar(choiceboxFrame, orient=VERTICAL, command=choiceboxWidget.yview) - choiceboxWidget.configure(yscrollcommand = rightScrollbar.set) - - # add a horizontal scrollbar to the frame - bottomScrollbar = Scrollbar(choiceboxFrame, orient=HORIZONTAL, command=choiceboxWidget.xview) - choiceboxWidget.configure(xscrollcommand = bottomScrollbar.set) - - # pack the Listbox and the scrollbars. Note that although we must define - # the textArea first, we must pack it last, so that the bottomScrollbar will - # be located properly. 
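The comment above states the standard Tk recipe this module relies on throughout: create the Listbox (or Text) widget first, but pack the scrollbars before it, so the bottom scrollbar is not squeezed out of the frame. A minimal standalone sketch of that recipe; `build_scrolled_listbox` is a hypothetical helper name, not part of easygui:

```python
# Sketch of the pack order described in the comment above: the Listbox is
# created before its scrollbars, but the scrollbars are packed first so they
# claim the bottom and right edges of the frame before the Listbox fills the
# remainder. Hypothetical helper, not part of easygui.
try:
    from tkinter import (Listbox, Scrollbar, BOTTOM, RIGHT, LEFT,
                         X, Y, BOTH, YES, VERTICAL, HORIZONTAL)
except ImportError:  # Python 2
    from Tkinter import (Listbox, Scrollbar, BOTTOM, RIGHT, LEFT,
                         X, Y, BOTH, YES, VERTICAL, HORIZONTAL)

def build_scrolled_listbox(frame):
    # create the Listbox first...
    listbox = Listbox(frame)
    vbar = Scrollbar(frame, orient=VERTICAL, command=listbox.yview)
    hbar = Scrollbar(frame, orient=HORIZONTAL, command=listbox.xview)
    listbox.configure(yscrollcommand=vbar.set, xscrollcommand=hbar.set)

    # ...but pack the scrollbars before it, so they keep their edges;
    # packing the Listbox first would leave no room for the bottom bar
    hbar.pack(side=BOTTOM, fill=X)
    vbar.pack(side=RIGHT, fill=Y)
    listbox.pack(side=LEFT, fill=BOTH, expand=YES)
    return listbox
```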
- - bottomScrollbar.pack(side=BOTTOM, fill = X) - rightScrollbar.pack(side=RIGHT, fill = Y) - - choiceboxWidget.pack(side=LEFT, padx="1m", pady="1m", expand=YES, fill=BOTH) - - #--------------------------------------------------- - # sort the choices - # eliminate duplicates - # put the choices into the choiceboxWidget - #--------------------------------------------------- - for index in range(len(choices)): - choices[index] = str(choices[index]) - - if runningPython3: - choices.sort(key=str.lower) - else: - choices.sort( lambda x,y: cmp(x.lower(), y.lower())) # case-insensitive sort - - lastInserted = None - choiceboxChoices = [] - for choice in choices: - if choice == lastInserted: pass - else: - choiceboxWidget.insert(END, choice) - choiceboxChoices.append(choice) - lastInserted = choice - - boxRoot.bind('<Any-Key>', KeyboardListener) - - # put the buttons in the buttonsFrame - if len(choices) > 0: - okButton = Button(buttonsFrame, takefocus=YES, text="OK", height=1, width=6) - bindArrows(okButton) - okButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m") - - # for the commandButton, bind activation events to the activation event handler - commandButton = okButton - handler = __choiceboxGetChoice - for selectionEvent in STANDARD_SELECTION_EVENTS: - commandButton.bind("<%s>" % selectionEvent, handler) - - # now bind the keyboard events - choiceboxWidget.bind("<Return>", __choiceboxGetChoice) - choiceboxWidget.bind("<Double-Button-1>", __choiceboxGetChoice) - else: - # now bind the keyboard events - choiceboxWidget.bind("<Return>", __choiceboxCancel) - choiceboxWidget.bind("<Double-Button-1>", __choiceboxCancel) - - cancelButton = Button(buttonsFrame, takefocus=YES, text="Cancel", height=1, width=6) - bindArrows(cancelButton) - cancelButton.pack(expand=NO, side=BOTTOM, padx='2m', pady='1m', ipady="1m", ipadx="2m") - - # for the commandButton, bind activation events to the activation event handler - commandButton = cancelButton - handler = __choiceboxCancel - for selectionEvent in
STANDARD_SELECTION_EVENTS: - commandButton.bind("<%s>" % selectionEvent, handler) - - - # add special buttons for multiple select features - if len(choices) > 0 and __choiceboxMultipleSelect: - selectionButtonsFrame = Frame(messageFrame) - selectionButtonsFrame.pack(side=RIGHT, fill=Y, expand=NO) - - selectAllButton = Button(selectionButtonsFrame, text="Select All", height=1, width=6) - bindArrows(selectAllButton) - - selectAllButton.bind("<Button-1>",__choiceboxSelectAll) - selectAllButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m") - - clearAllButton = Button(selectionButtonsFrame, text="Clear All", height=1, width=6) - bindArrows(clearAllButton) - clearAllButton.bind("<Button-1>",__choiceboxClearAll) - clearAllButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m") - - - # -------------------- bind some keyboard events ---------------------------- - boxRoot.bind("<Escape>", __choiceboxCancel) - - # --------------------- the action begins ----------------------------------- - # put the focus on the choiceboxWidget, and the select highlight on the first item - choiceboxWidget.select_set(0) - choiceboxWidget.focus_force() - - # --- run it!
----- - boxRoot.mainloop() - - boxRoot.destroy() - return __choiceboxResults - - -def __choiceboxGetChoice(event): - global boxRoot, __choiceboxResults, choiceboxWidget - - if __choiceboxMultipleSelect: - __choiceboxResults = [choiceboxWidget.get(index) for index in choiceboxWidget.curselection()] - - else: - choice_index = choiceboxWidget.curselection() - __choiceboxResults = choiceboxWidget.get(choice_index) - - # writeln("Debugging> mouse-event=", event, " event.type=", event.type) - # writeln("Debugging> choice=", choice_index, __choiceboxResults) - boxRoot.quit() - - -def __choiceboxSelectAll(event): - global choiceboxWidget, choiceboxChoices - - choiceboxWidget.selection_set(0, len(choiceboxChoices)-1) - -def __choiceboxClearAll(event): - global choiceboxWidget, choiceboxChoices - - choiceboxWidget.selection_clear(0, len(choiceboxChoices)-1) - - - -def __choiceboxCancel(event): - global boxRoot, __choiceboxResults - - __choiceboxResults = None - boxRoot.quit() - - -def KeyboardListener(event): - global choiceboxChoices, choiceboxWidget - key = event.keysym - if len(key) <= 1: - if key in string.printable: - # Find the key in the list. - # before we clear the list, remember the selected member - try: - start_n = int(choiceboxWidget.curselection()[0]) - except IndexError: - start_n = -1 - - ## clear the selection. 
- choiceboxWidget.selection_clear(0, 'end') - - ## start from previous selection +1 - for n in range(start_n+1, len(choiceboxChoices)): - item = choiceboxChoices[n] - if item[0].lower() == key.lower(): - choiceboxWidget.selection_set(first=n) - choiceboxWidget.see(n) - return - else: - # has not found it so loop from top - for n in range(len(choiceboxChoices)): - item = choiceboxChoices[n] - if item[0].lower() == key.lower(): - choiceboxWidget.selection_set(first = n) - choiceboxWidget.see(n) - return - - # nothing matched -- we'll look for the next logical choice - for n in range(len(choiceboxChoices)): - item = choiceboxChoices[n] - if item[0].lower() > key.lower(): - if n > 0: - choiceboxWidget.selection_set(first = (n-1)) - else: - choiceboxWidget.selection_set(first = 0) - choiceboxWidget.see(n) - return - - # still no match (nothing was greater than the key) - # we set the selection to the first item in the list - lastIndex = len(choiceboxChoices)-1 - choiceboxWidget.selection_set(first = lastIndex) - choiceboxWidget.see(lastIndex) - return - -#----------------------------------------------------------------------- -# exception_format -#----------------------------------------------------------------------- -def exception_format(): - """ - Convert exception info into a string suitable for display. - """ - return "".join(traceback.format_exception( - sys.exc_info()[0] - , sys.exc_info()[1] - , sys.exc_info()[2] - )) - -#----------------------------------------------------------------------- -# exceptionbox -#----------------------------------------------------------------------- -def exceptionbox(msg=None, title=None): - """ - Display a box that gives information about - an exception that has just been raised. - - The caller may optionally pass in a title for the window, or a - msg to accompany the error information. - - Note that you do not need to (and cannot) pass an exception object - as an argument. The latest exception will automatically be used. 
- """ - if title == None: title = "Error Report" - if msg == None: - msg = "An error (exception) has occurred in the program." - - codebox(msg, title, exception_format()) - -#------------------------------------------------------------------- -# codebox -#------------------------------------------------------------------- - -def codebox(msg="" - , title=" " - , text="" - ): - """ - Display some text in a monospaced font, with no line wrapping. - This function is suitable for displaying code and text that is - formatted using spaces. - - The text parameter should be a string, or a list or tuple of lines to be - displayed in the textbox. - """ - return textbox(msg, title, text, codebox=1 ) - -#------------------------------------------------------------------- -# textbox -#------------------------------------------------------------------- -def textbox(msg="" - , title=" " - , text="" - , codebox=0 - ): - """ - Display some text in a proportional font with line wrapping at word breaks. - This function is suitable for displaying general written text. - - The text parameter should be a string, or a list or tuple of lines to be - displayed in the textbox. 
- """ - - if msg == None: msg = "" - if title == None: title = "" - - global boxRoot, __replyButtonText, __widgetTexts, buttonsFrame - global rootWindowPosition - choices = ["OK"] - __replyButtonText = choices[0] - - - boxRoot = Tk() - - boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose ) - - screen_width = boxRoot.winfo_screenwidth() - screen_height = boxRoot.winfo_screenheight() - root_width = int((screen_width * 0.8)) - root_height = int((screen_height * 0.5)) - root_xpos = int((screen_width * 0.1)) - root_ypos = int((screen_height * 0.05)) - - boxRoot.title(title) - boxRoot.iconname('Dialog') - rootWindowPosition = "+0+0" - boxRoot.geometry(rootWindowPosition) - boxRoot.expand=NO - boxRoot.minsize(root_width, root_height) - rootWindowPosition = "+" + str(root_xpos) + "+" + str(root_ypos) - boxRoot.geometry(rootWindowPosition) - - mainframe = Frame(master=boxRoot) - mainframe.pack(side=TOP, fill=BOTH, expand=YES) - - # ---- put frames in the window ----------------------------------- - # we pack the textboxFrame first, so it will expand first - textboxFrame = Frame(mainframe, borderwidth=3) - textboxFrame.pack(side=BOTTOM , fill=BOTH, expand=YES) - - message_and_buttonsFrame = Frame(mainframe) - message_and_buttonsFrame.pack(side=TOP, fill=X, expand=NO) - - messageFrame = Frame(message_and_buttonsFrame) - messageFrame.pack(side=LEFT, fill=X, expand=YES) - - buttonsFrame = Frame(message_and_buttonsFrame) - buttonsFrame.pack(side=RIGHT, expand=NO) - - # -------------------- put widgets in the frames -------------------- - - # put a textArea in the top frame - if codebox: - character_width = int((root_width * 0.6) / MONOSPACE_FONT_SIZE) - textArea = Text(textboxFrame,height=25,width=character_width, padx="2m", pady="1m") - textArea.configure(wrap=NONE) - textArea.configure(font=(MONOSPACE_FONT_FAMILY, MONOSPACE_FONT_SIZE)) - - else: - character_width = int((root_width * 0.6) / MONOSPACE_FONT_SIZE) - textArea = Text( - textboxFrame - , height=25 - , 
width=character_width - , padx="2m" - , pady="1m" - ) - textArea.configure(wrap=WORD) - textArea.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) - - - # some simple keybindings for scrolling - mainframe.bind("<Next>" , textArea.yview_scroll( 1,PAGES)) - mainframe.bind("<Prior>", textArea.yview_scroll(-1,PAGES)) - - mainframe.bind("<Right>", textArea.xview_scroll( 1,PAGES)) - mainframe.bind("<Left>" , textArea.xview_scroll(-1,PAGES)) - - mainframe.bind("<Down>", textArea.yview_scroll( 1,UNITS)) - mainframe.bind("<Up>" , textArea.yview_scroll(-1,UNITS)) - - - # add a vertical scrollbar to the frame - rightScrollbar = Scrollbar(textboxFrame, orient=VERTICAL, command=textArea.yview) - textArea.configure(yscrollcommand = rightScrollbar.set) - - # add a horizontal scrollbar to the frame - bottomScrollbar = Scrollbar(textboxFrame, orient=HORIZONTAL, command=textArea.xview) - textArea.configure(xscrollcommand = bottomScrollbar.set) - - # pack the textArea and the scrollbars. Note that although we must define - # the textArea first, we must pack it last, so that the bottomScrollbar will - # be located properly. - - # Note that we need a bottom scrollbar only for code. - # Text will be displayed with wordwrap, so we don't need to have a horizontal - # scroll for it.
- if codebox: - bottomScrollbar.pack(side=BOTTOM, fill=X) - rightScrollbar.pack(side=RIGHT, fill=Y) - - textArea.pack(side=LEFT, fill=BOTH, expand=YES) - - - # ---------- put a msg widget in the msg frame------------------- - messageWidget = Message(messageFrame, anchor=NW, text=msg, width=int(root_width * 0.9)) - messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) - messageWidget.pack(side=LEFT, expand=YES, fill=BOTH, padx='1m', pady='1m') - - # put the buttons in the buttonsFrame - okButton = Button(buttonsFrame, takefocus=YES, text="OK", height=1, width=6) - okButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m") - - # for the commandButton, bind activation events to the activation event handler - commandButton = okButton - handler = __textboxOK - for selectionEvent in ["Return","Button-1","Escape"]: - commandButton.bind("<%s>" % selectionEvent, handler) - - - # ----------------- the action begins ---------------------------------------- - try: - # load the text into the textArea - if type(text) == type("abc"): pass - else: - try: - text = "".join(text) # convert a list or a tuple to a string - except: - msgbox("Exception when trying to convert "+ str(type(text)) + " to text in textArea") - sys.exit(16) - textArea.insert(END,text, "normal") - - except: - msgbox("Exception when trying to load the textArea.") - sys.exit(16) - - try: - okButton.focus_force() - except: - msgbox("Exception when trying to put focus on okButton.") - sys.exit(16) - - boxRoot.mainloop() - - # this line MUST go before the line that destroys boxRoot - areaText = textArea.get(0.0,END) - boxRoot.destroy() - return areaText # return __replyButtonText - -#------------------------------------------------------------------- -# __textboxOK -#------------------------------------------------------------------- -def __textboxOK(event): - global boxRoot - boxRoot.quit() - - - -#------------------------------------------------------------------- -# 
diropenbox -#------------------------------------------------------------------- -def diropenbox(msg=None - , title=None - , default=None - ): - """ - A dialog to get a directory name. - Note that the msg argument, if specified, is ignored. - - Returns the name of a directory, or None if user chose to cancel. - - If the "default" argument specifies a directory name, and that - directory exists, then the dialog box will start with that directory. - """ - title=getFileDialogTitle(msg,title) - localRoot = Tk() - localRoot.withdraw() - if not default: default = None - f = tk_FileDialog.askdirectory( - parent=localRoot - , title=title - , initialdir=default - , initialfile=None - ) - localRoot.destroy() - if not f: return None - return os.path.normpath(f) - - - -#------------------------------------------------------------------- -# getFileDialogTitle -#------------------------------------------------------------------- -def getFileDialogTitle(msg - , title - ): - if msg and title: return "%s - %s" % (title,msg) - if msg and not title: return str(msg) - if title and not msg: return str(title) - return None # no message and no title - -#------------------------------------------------------------------- -# class FileTypeObject for use with fileopenbox -#------------------------------------------------------------------- -class FileTypeObject: - def __init__(self,filemask): - if len(filemask) == 0: - raise AssertionError('Filetype argument is empty.') - - self.masks = [] - - if type(filemask) == type("abc"): # a string - self.initializeFromString(filemask) - - elif type(filemask) == type([]): # a list - if len(filemask) < 2: - raise AssertionError('Invalid filemask.\n' - +'List contains less than 2 members: "%s"' % filemask) - else: - self.name = filemask[-1] - self.masks = list(filemask[:-1] ) - else: - raise AssertionError('Invalid filemask: "%s"' % filemask) - - def __eq__(self,other): - if self.name == other.name: return True - return False - - def add(self,other): - 
for mask in other.masks: - if mask in self.masks: pass - else: self.masks.append(mask) - - def toTuple(self): - return (self.name,tuple(self.masks)) - - def isAll(self): - if self.name == "All files": return True - return False - - def initializeFromString(self, filemask): - # remove everything except the extension from the filemask - self.ext = os.path.splitext(filemask)[1] - if self.ext == "" : self.ext = ".*" - if self.ext == ".": self.ext = ".*" - self.name = self.getName() - self.masks = ["*" + self.ext] - - def getName(self): - e = self.ext - if e == ".*" : return "All files" - if e == ".txt": return "Text files" - if e == ".py" : return "Python files" - if e == ".pyc" : return "Python files" - if e == ".xls": return "Excel files" - if e.startswith("."): - return e[1:].upper() + " files" - return e.upper() + " files" - - -#------------------------------------------------------------------- -# fileopenbox -#------------------------------------------------------------------- -def fileopenbox(msg=None - , title=None - , default="*" - , filetypes=None - ): - """ - A dialog to get a file name. - - About the "default" argument - ============================ - The "default" argument specifies a filepath that (normally) - contains one or more wildcards. - fileopenbox will display only files that match the default filepath. - If omitted, defaults to "*" (all files in the current directory). - - WINDOWS EXAMPLE:: - ...default="c:/myjunk/*.py" - will open in directory c:\myjunk\ and show all Python files. - - WINDOWS EXAMPLE:: - ...default="c:/myjunk/test*.py" - will open in directory c:\myjunk\ and show all Python files - whose names begin with "test". - - - Note that on Windows, fileopenbox automatically changes the path - separator to the Windows path separator (backslash). - - About the "filetypes" argument - ============================== - If specified, it should contain a list of items, - where each item is either:: - - a string containing a filemask # e.g. 
"*.txt" - - a list of strings, where all of the strings except the last one - are filemasks (each beginning with "*.", - such as "*.txt" for text files, "*.py" for Python files, etc.). - and the last string contains a filetype description - - EXAMPLE:: - filetypes = ["*.css", ["*.htm", "*.html", "HTML files"] ] - - NOTE THAT - ========= - - If the filetypes list does not contain ("All files","*"), - it will be added. - - If the filetypes list does not contain a filemask that includes - the extension of the "default" argument, it will be added. - For example, if default="*abc.py" - and no filetypes argument was specified, then - "*.py" will automatically be added to the filetypes argument. - - @rtype: string or None - @return: the name of a file, or None if user chose to cancel - - @arg msg: the msg to be displayed. - @arg title: the window title - @arg default: filepath with wildcards - @arg filetypes: filemasks that a user can choose, e.g. "*.txt" - """ - localRoot = Tk() - localRoot.withdraw() - - initialbase, initialfile, initialdir, filetypes = fileboxSetup(default,filetypes) - - #------------------------------------------------------------ - # if initialfile contains no wildcards; we don't want an - # initial file. It won't be used anyway. - # Also: if initialbase is simply "*", we don't want an - # initialfile; it is not doing any useful work. 
- #------------------------------------------------------------ - if (initialfile.find("*") < 0) and (initialfile.find("?") < 0): - initialfile = None - elif initialbase == "*": - initialfile = None - - f = tk_FileDialog.askopenfilename(parent=localRoot - , title=getFileDialogTitle(msg,title) - , initialdir=initialdir - , initialfile=initialfile - , filetypes=filetypes - ) - - localRoot.destroy() - - if not f: return None - return os.path.normpath(f) - - -#------------------------------------------------------------------- -# filesavebox -#------------------------------------------------------------------- -def filesavebox(msg=None - , title=None - , default="" - , filetypes=None - ): - """ - A dialog to get the name of a file to save. - Returns the name of a file, or None if user chose to cancel. - - The "default" argument should contain a filename (i.e. the - current name of the file to be saved). It may also be empty, - or contain a filemask that includes wildcards. - - The "filetypes" argument works like the "filetypes" argument to - fileopenbox. - """ - - localRoot = Tk() - localRoot.withdraw() - - initialbase, initialfile, initialdir, filetypes = fileboxSetup(default,filetypes) - - f = tk_FileDialog.asksaveasfilename(parent=localRoot - , title=getFileDialogTitle(msg,title) - , initialfile=initialfile - , initialdir=initialdir - , filetypes=filetypes - ) - localRoot.destroy() - if not f: return None - return os.path.normpath(f) - - -#------------------------------------------------------------------- -# -# fileboxSetup -# -#------------------------------------------------------------------- -def fileboxSetup(default,filetypes): - if not default: default = os.path.join(".","*") - initialdir, initialfile = os.path.split(default) - if not initialdir : initialdir = "."
- if not initialfile: initialfile = "*" - initialbase, initialext = os.path.splitext(initialfile) - initialFileTypeObject = FileTypeObject(initialfile) - - allFileTypeObject = FileTypeObject("*") - ALL_filetypes_was_specified = False - - if not filetypes: filetypes= [] - filetypeObjects = [] - - for filemask in filetypes: - fto = FileTypeObject(filemask) - - if fto.isAll(): - ALL_filetypes_was_specified = True # remember this - - if fto == initialFileTypeObject: - initialFileTypeObject.add(fto) # add fto to initialFileTypeObject - else: - filetypeObjects.append(fto) - - #------------------------------------------------------------------ - # make sure that the list of filetypes includes the ALL FILES type. - #------------------------------------------------------------------ - if ALL_filetypes_was_specified: - pass - elif allFileTypeObject == initialFileTypeObject: - pass - else: - filetypeObjects.insert(0,allFileTypeObject) - #------------------------------------------------------------------ - # Make sure that the list includes the initialFileTypeObject - # in the position in the list that will make it the default. - # This changed between Python version 2.5 and 2.6 - #------------------------------------------------------------------ - if len(filetypeObjects) == 0: - filetypeObjects.append(initialFileTypeObject) - - if initialFileTypeObject in (filetypeObjects[0], filetypeObjects[-1]): - pass - else: - if runningPython26: - filetypeObjects.append(initialFileTypeObject) - else: - filetypeObjects.insert(0,initialFileTypeObject) - - filetypes = [fto.toTuple() for fto in filetypeObjects] - - return initialbase, initialfile, initialdir, filetypes - -#------------------------------------------------------------------- -# utility routines -#------------------------------------------------------------------- -# These routines are used by several other functions in the EasyGui module. 
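The fileboxSetup routine above decomposes the "default" argument of fileopenbox/filesavebox into a starting directory, a wildcard filename, and its base and extension. The decomposition alone can be reproduced with plain os.path calls; `split_default` is a hypothetical name for this sketch, and the real routine additionally builds FileTypeObject instances for the filetypes list:

```python
import os

def split_default(default):
    """Sketch of how fileboxSetup decomposes its 'default' argument.

    Returns (initialdir, initialfile, initialbase, initialext), mirroring
    the splitting logic above. Illustrative only; not part of easygui.
    """
    if not default:
        default = os.path.join(".", "*")          # fall back to all files here
    initialdir, initialfile = os.path.split(default)
    if not initialdir:
        initialdir = "."                           # bare filename: current dir
    if not initialfile:
        initialfile = "*"                          # bare directory: all files
    initialbase, initialext = os.path.splitext(initialfile)
    return initialdir, initialfile, initialbase, initialext
```

For example, a default of `"c:/myjunk/test*.py"` yields the directory `"c:/myjunk"`, the wildcard filename `"test*.py"`, and the extension `".py"` that fileboxSetup then folds into the filetypes list.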
- -def __buttonEvent(event): - """ - Handle an event that is generated by a person clicking a button. - """ - global boxRoot, __widgetTexts, __replyButtonText - __replyButtonText = __widgetTexts[event.widget] - boxRoot.quit() # quit the main loop - - -def __put_buttons_in_buttonframe(choices): - """Put the buttons in the buttons frame - """ - global __widgetTexts, __firstWidget, buttonsFrame - - __firstWidget = None - __widgetTexts = {} - - i = 0 - - for buttonText in choices: - tempButton = Button(buttonsFrame, takefocus=1, text=buttonText) - bindArrows(tempButton) - tempButton.pack(expand=YES, side=LEFT, padx='1m', pady='1m', ipadx='2m', ipady='1m') - - # remember the text associated with this widget - __widgetTexts[tempButton] = buttonText - - # remember the first widget, so we can put the focus there - if i == 0: - __firstWidget = tempButton - i = 1 - - # for the commandButton, bind activation events to the activation event handler - commandButton = tempButton - handler = __buttonEvent - for selectionEvent in STANDARD_SELECTION_EVENTS: - commandButton.bind("<%s>" % selectionEvent, handler) - -#----------------------------------------------------------------------- -# -# class EgStore -# -#----------------------------------------------------------------------- -class EgStore: - r""" -A class to support persistent storage. - -You can use EgStore to support the storage and retrieval -of user settings for an EasyGui application. - - -# Example A -#----------------------------------------------------------------------- -# define a class named Settings as a subclass of EgStore -#----------------------------------------------------------------------- -class Settings(EgStore): -:: - def __init__(self, filename): # filename is required - #------------------------------------------------- - # Specify default/initial values for variables that - # this particular application wants to remember. 
- #------------------------------------------------- - self.userId = "" - self.targetServer = "" - - #------------------------------------------------- - # For subclasses of EgStore, these must be - # the last two statements in __init__ - #------------------------------------------------- - self.filename = filename # this is required - self.restore() # restore values from the storage file if possible - - - -# Example B -#----------------------------------------------------------------------- -# create settings, a persistent Settings object -#----------------------------------------------------------------------- -settingsFile = "myApp_settings.txt" -settings = Settings(settingsFile) - -user = "obama_barak" -server = "whitehouse1" -settings.userId = user -settings.targetServer = server -settings.store() # persist the settings - -# run code that gets a new value for userId, and persist the settings -user = "biden_joe" -settings.userId = user -settings.store() - - -# Example C -#----------------------------------------------------------------------- -# recover the Settings instance, change an attribute, and store it again. -#----------------------------------------------------------------------- -settings = Settings(settingsFile) -settings.userId = "vanrossum_g" -settings.store() - -""" - def __init__(self, filename): # obtaining filename is required - self.filename = None - raise NotImplementedError() - - def restore(self): - """ - Set the values of whatever attributes are recoverable - from the pickle file. - - Populate the attributes (the __dict__) of the EgStore object - from the attributes (the __dict__) of the pickled object. - - If the pickled object has attributes that have been initialized - in the EgStore object, then those attributes of the EgStore object - will be replaced by the values of the corresponding attributes - in the pickled object. 
- - If the pickled object is missing some attributes that have - been initialized in the EgStore object, then those attributes - of the EgStore object will retain the values that they were - initialized with. - - If the pickled object has some attributes that were not - initialized in the EgStore object, then those attributes - will be ignored. - - IN SUMMARY: - - After the recover() operation, the EgStore object will have all, - and only, the attributes that it had when it was initialized. - - Where possible, those attributes will have values recovered - from the pickled object. - """ - if not os.path.exists(self.filename): return self - if not os.path.isfile(self.filename): return self - - try: - f = open(self.filename,"rb") - unpickledObject = pickle.load(f) - f.close() - - for key in list(self.__dict__.keys()): - default = self.__dict__[key] - self.__dict__[key] = unpickledObject.__dict__.get(key,default) - except: - pass - - return self - - def store(self): - """ - Save the attributes of the EgStore object to a pickle file. - Note that if the directory for the pickle file does not already exist, - the store operation will fail. - """ - f = open(self.filename, "wb") - pickle.dump(self, f) - f.close() - - - def kill(self): - """ - Delete my persistent file (i.e. pickle file), if it exists. - """ - if os.path.isfile(self.filename): - os.remove(self.filename) - return - - def __str__(self): - """ - return my contents as a string in an easy-to-read format. 
- """ - # find the length of the longest attribute name - longest_key_length = 0 - keys = [] - for key in self.__dict__.keys(): - keys.append(key) - longest_key_length = max(longest_key_length, len(key)) - - keys.sort() # sort the attribute names - lines = [] - for key in keys: - value = self.__dict__[key] - key = key.ljust(longest_key_length) - lines.append("%s : %s\n" % (key,repr(value)) ) - return "".join(lines) # return a string showing the attributes - - - - -#----------------------------------------------------------------------- -# -# test/demo easygui -# -#----------------------------------------------------------------------- -def egdemo(): - """ - Run the EasyGui demo. - """ - # clear the console - writeln("\n" * 100) - - intro_message = ("Pick the kind of box that you wish to demo.\n" - + "\n * Python version " + sys.version - + "\n * EasyGui version " + egversion - + "\n * Tk version " + str(TkVersion) - ) - - #========================================== END DEMONSTRATION DATA - - - while 1: # do forever - choices = [ - "msgbox", - "buttonbox", - "buttonbox(image) -- a buttonbox that displays an image", - "choicebox", - "multchoicebox", - "textbox", - "ynbox", - "ccbox", - "enterbox", - "enterbox(image) -- an enterbox that displays an image", - "exceptionbox", - "codebox", - "integerbox", - "boolbox", - "indexbox", - "filesavebox", - "fileopenbox", - "passwordbox", - "multenterbox", - "multpasswordbox", - "diropenbox", - "About EasyGui", - " Help" - ] - choice = choicebox(msg=intro_message - , title="EasyGui " + egversion - , choices=choices) - - if not choice: return - - reply = choice.split() - - if reply[0] == "msgbox": - reply = msgbox("short msg", "This is a long title") - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "About": - reply = abouteasygui() - - elif reply[0] == "Help": - _demo_help() - - elif reply[0] == "buttonbox": - reply = buttonbox() - writeln("Reply was: %s" % repr(reply)) - - title = "Demo of Buttonbox with many, many 
buttons!" - msg = "This buttonbox shows what happens when you specify too many buttons." - reply = buttonbox(msg=msg, title=title, choices=choices) - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "buttonbox(image)": - _demo_buttonbox_with_image() - - elif reply[0] == "boolbox": - reply = boolbox() - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "enterbox": - image = "python_and_check_logo.gif" - message = "Enter the name of your best friend."\ - "\n(Result will be stripped.)" - reply = enterbox(message, "Love!", " Suzy Smith ") - writeln("Reply was: %s" % repr(reply)) - - message = "Enter the name of your best friend."\ - "\n(Result will NOT be stripped.)" - reply = enterbox(message, "Love!", " Suzy Smith ",strip=False) - writeln("Reply was: %s" % repr(reply)) - - reply = enterbox("Enter the name of your worst enemy:", "Hate!") - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "enterbox(image)": - image = "python_and_check_logo.gif" - message = "What kind of snake is this?" 
- reply = enterbox(message, "Quiz",image=image) - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "exceptionbox": - try: - thisWillCauseADivideByZeroException = 1/0 - except: - exceptionbox() - - elif reply[0] == "integerbox": - reply = integerbox( - "Enter a number between 3 and 333", - "Demo: integerbox WITH a default value", - 222, 3, 333) - writeln("Reply was: %s" % repr(reply)) - - reply = integerbox( - "Enter a number between 0 and 99", - "Demo: integerbox WITHOUT a default value" - ) - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "diropenbox" : _demo_diropenbox() - elif reply[0] == "fileopenbox": _demo_fileopenbox() - elif reply[0] == "filesavebox": _demo_filesavebox() - - elif reply[0] == "indexbox": - title = reply[0] - msg = "Demo of " + reply[0] - choices = ["Choice1", "Choice2", "Choice3", "Choice4"] - reply = indexbox(msg, title, choices) - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "passwordbox": - reply = passwordbox("Demo of password box WITHOUT default" - + "\n\nEnter your secret password", "Member Logon") - writeln("Reply was: %s" % str(reply)) - - reply = passwordbox("Demo of password box WITH default" - + "\n\nEnter your secret password", "Member Logon", "alfie") - writeln("Reply was: %s" % str(reply)) - - elif reply[0] == "multenterbox": - msg = "Enter your personal information" - title = "Credit Card Application" - fieldNames = ["Name","Street Address","City","State","ZipCode"] - fieldValues = [] # we start with blanks for the values - fieldValues = multenterbox(msg,title, fieldNames) - - # make sure that none of the fields was left blank - while 1: - if fieldValues == None: break - errmsg = "" - for i in range(len(fieldNames)): - if fieldValues[i].strip() == "": - errmsg = errmsg + ('"%s" is a required field.\n\n' % fieldNames[i]) - if errmsg == "": break # no problems found - fieldValues = multenterbox(errmsg, title, fieldNames, fieldValues) - - writeln("Reply was: %s" % str(fieldValues)) - - elif 
reply[0] == "multpasswordbox": - msg = "Enter logon information" - title = "Demo of multpasswordbox" - fieldNames = ["Server ID", "User ID", "Password"] - fieldValues = [] # we start with blanks for the values - fieldValues = multpasswordbox(msg,title, fieldNames) - - # make sure that none of the fields was left blank - while 1: - if fieldValues == None: break - errmsg = "" - for i in range(len(fieldNames)): - if fieldValues[i].strip() == "": - errmsg = errmsg + ('"%s" is a required field.\n\n' % fieldNames[i]) - if errmsg == "": break # no problems found - fieldValues = multpasswordbox(errmsg, title, fieldNames, fieldValues) - - writeln("Reply was: %s" % str(fieldValues)) - - elif reply[0] == "ynbox": - title = "Demo of ynbox" - msg = "Were you expecting the Spanish Inquisition?" - reply = ynbox(msg, title) - writeln("Reply was: %s" % repr(reply)) - if reply: - msgbox("NOBODY expects the Spanish Inquisition!", "Wrong!") - - elif reply[0] == "ccbox": - title = "Demo of ccbox" - reply = ccbox(msg,title) - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "choicebox": - title = "Demo of choicebox" - longchoice = "This is an example of a very long option which you may or may not wish to choose."*2 - listChoices = ["nnn", "ddd", "eee", "fff", "aaa", longchoice - , "aaa", "bbb", "ccc", "ggg", "hhh", "iii", "jjj", "kkk", "LLL", "mmm" , "nnn", "ooo", "ppp", "qqq", "rrr", "sss", "ttt", "uuu", "vvv"] - - msg = "Pick something. " + ("A wrapable sentence of text ?! "*30) + "\nA separate line of text."*6 - reply = choicebox(msg=msg, choices=listChoices) - writeln("Reply was: %s" % repr(reply)) - - msg = "Pick something. " - reply = choicebox(msg=msg, title=title, choices=listChoices) - writeln("Reply was: %s" % repr(reply)) - - msg = "Pick something. 
" - reply = choicebox(msg="The list of choices is empty!", choices=[]) - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "multchoicebox": - listChoices = ["aaa", "bbb", "ccc", "ggg", "hhh", "iii", "jjj", "kkk" - , "LLL", "mmm" , "nnn", "ooo", "ppp", "qqq" - , "rrr", "sss", "ttt", "uuu", "vvv"] - - msg = "Pick as many choices as you wish." - reply = multchoicebox(msg,"Demo of multchoicebox", listChoices) - writeln("Reply was: %s" % repr(reply)) - - elif reply[0] == "textbox": _demo_textbox(reply[0]) - elif reply[0] == "codebox": _demo_codebox(reply[0]) - - else: - msgbox("Choice\n\n" + choice + "\n\nis not recognized", "Program Logic Error") - return - - -def _demo_textbox(reply): - text_snippet = ((\ -"""It was the best of times, and it was the worst of times. The rich ate cake, and the poor had cake recommended to them, but wished only for enough cash to buy bread. The time was ripe for revolution! """ \ -*5)+"\n\n")*10 - title = "Demo of textbox" - msg = "Here is some sample text. " * 16 - reply = textbox(msg, title, text_snippet) - writeln("Reply was: %s" % str(reply)) - -def _demo_codebox(reply): - code_snippet = ("dafsdfa dasflkj pp[oadsij asdfp;ij asdfpjkop asdfpok asdfpok asdfpok"*3) +"\n"+\ -"""# here is some dummy Python code -for someItem in myListOfStuff: - do something(someItem) - do something() - do something() - if somethingElse(someItem): - doSomethingEvenMoreInteresting() - -"""*16 - msg = "Here is some sample code. 
" * 16 - reply = codebox(msg, "Code Sample", code_snippet) - writeln("Reply was: %s" % repr(reply)) - - -def _demo_buttonbox_with_image(): - - msg = "Do you like this picture?\nIt is " - choices = ["Yes","No","No opinion"] - - for image in [ - "python_and_check_logo.gif" - ,"python_and_check_logo.jpg" - ,"python_and_check_logo.png" - ,"zzzzz.gif"]: - - reply=buttonbox(msg + image,image=image,choices=choices) - writeln("Reply was: %s" % repr(reply)) - - -def _demo_help(): - savedStdout = sys.stdout # save the sys.stdout file object - sys.stdout = capturedOutput = StringIO() - help("easygui") - sys.stdout = savedStdout # restore the sys.stdout file object - codebox("EasyGui Help",text=capturedOutput.getvalue()) - -def _demo_filesavebox(): - filename = "myNewFile.txt" - title = "File SaveAs" - msg ="Save file as:" - - f = filesavebox(msg,title,default=filename) - writeln("You chose to save file: %s" % f) - -def _demo_diropenbox(): - title = "Demo of diropenbox" - msg = "Pick the directory that you wish to open." 
- d = diropenbox(msg, title) - writeln("You chose directory...: %s" % d) - - d = diropenbox(msg, title,default="./") - writeln("You chose directory...: %s" % d) - - d = diropenbox(msg, title,default="c:/") - writeln("You chose directory...: %s" % d) - - -def _demo_fileopenbox(): - msg = "Python files" - title = "Open files" - default="*.py" - f = fileopenbox(msg,title,default=default) - writeln("You chose to open file: %s" % f) - - default="./*.gif" - filetypes = ["*.jpg",["*.zip","*.tgs","*.gz", "Archive files"],["*.htm", "*.html","HTML files"]] - f = fileopenbox(msg,title,default=default,filetypes=filetypes) - writeln("You chose to open file: %s" % f) - - """#deadcode -- testing ---------------------------------------- - f = fileopenbox(None,None,default=default) - writeln("You chose to open file: %s" % f) - - f = fileopenbox(None,title,default=default) - writeln("You chose to open file: %s" % f) - - f = fileopenbox(msg,None,default=default) - writeln("You chose to open file: %s" % f) - - f = fileopenbox(default=default) - writeln("You chose to open file: %s" % f) - - f = fileopenbox(default=None) - writeln("You chose to open file: %s" % f) - #----------------------------------------------------deadcode """ - - -def _dummy(): - pass - -EASYGUI_ABOUT_INFORMATION = ''' -======================================================================== -0.96(2010-08-29) -======================================================================== -This version fixes some problems with version independence. - -BUG FIXES ------------------------------------------------------- - * A statement with Python 2.x-style exception-handling syntax raised - a syntax error when running under Python 3.x. - Thanks to David Williams for reporting this problem. - - * Under some circumstances, PIL was unable to display non-gif images - that it should have been able to display. - The cause appears to be non-version-independent import syntax. 
- PIL modules are now imported with a version-independent syntax. - Thanks to Horst Jens for reporting this problem. - -LICENSE CHANGE ------------------------------------------------------- -Starting with this version, EasyGui is licensed under what is generally known as -the "modified BSD license" (aka "revised BSD", "new BSD", "3-clause BSD"). -This license is GPL-compatible but less restrictive than GPL. -Earlier versions were licensed under the Creative Commons Attribution License 2.0. - - -======================================================================== -0.95(2010-06-12) -======================================================================== - -ENHANCEMENTS ------------------------------------------------------- - * Previous versions of EasyGui could display only .gif image files using the - msgbox "image" argument. This version can now display all image-file formats - supported by PIL the Python Imaging Library) if PIL is installed. - If msgbox is asked to open a non-gif image file, it attempts to import - PIL and to use PIL to convert the image file to a displayable format. - If PIL cannot be imported (probably because PIL is not installed) - EasyGui displays an error message saying that PIL must be installed in order - to display the image file. - - Note that - http://www.pythonware.com/products/pil/ - says that PIL doesn't yet support Python 3.x. - - -======================================================================== -0.94(2010-06-06) -======================================================================== - -ENHANCEMENTS ------------------------------------------------------- - * The codebox and textbox functions now return the contents of the box, rather - than simply the name of the button ("Yes"). This makes it possible to use - codebox and textbox as data-entry widgets. A big "thank you!" 
to Dominic - Comtois for requesting this feature, patiently explaining his requirement, - and helping to discover the tkinter techniques to implement it. - - NOTE THAT in theory this change breaks backward compatibility. But because - (in previous versions of EasyGui) the value returned by codebox and textbox - was meaningless, no application should have been checking it. So in actual - practice, this change should not break backward compatibility. - - * Added support for SPACEBAR to command buttons. Now, when keyboard - focus is on a command button, a press of the SPACEBAR will act like - a press of the ENTER key; it will activate the command button. - - * Added support for keyboard navigation with the arrow keys (up,down,left,right) - to the fields and buttons in enterbox, multenterbox and multpasswordbox, - and to the buttons in choicebox and all buttonboxes. - - * added highlightthickness=2 to entry fields in multenterbox and - multpasswordbox. Now it is easier to tell which entry field has - keyboard focus. - - -BUG FIXES ------------------------------------------------------- - * In EgStore, the pickle file is now opened with "rb" and "wb" rather than - with "r" and "w". This change is necessary for compatibility with Python 3+. - Thanks to Marshall Mattingly for reporting this problem and providing the fix. - - * In integerbox, the actual argument names did not match the names described - in the docstring. Thanks to Daniel Zingaro of at University of Toronto for - reporting this problem. - - * In integerbox, the "argLowerBound" and "argUpperBound" arguments have been - renamed to "lowerbound" and "upperbound" and the docstring has been corrected. - - NOTE THAT THIS CHANGE TO THE ARGUMENT-NAMES BREAKS BACKWARD COMPATIBILITY. - If argLowerBound or argUpperBound are used, an AssertionError with an - explanatory error message is raised. - - * In choicebox, the signature to choicebox incorrectly showed choicebox as - accepting a "buttons" argument. 
The signature has been fixed. - - -======================================================================== -0.93(2009-07-07) -======================================================================== - -ENHANCEMENTS ------------------------------------------------------- - - * Added exceptionbox to display stack trace of exceptions - - * modified names of some font-related constants to make it - easier to customize them - - -======================================================================== -0.92(2009-06-22) -======================================================================== - -ENHANCEMENTS ------------------------------------------------------- - - * Added EgStore class to to provide basic easy-to-use persistence. - -BUG FIXES ------------------------------------------------------- - - * Fixed a bug that was preventing Linux users from copying text out of - a textbox and a codebox. This was not a problem for Windows users. - -''' - -def abouteasygui(): - """ - shows the easygui revision history - """ - codebox("About EasyGui\n"+egversion,"EasyGui",EASYGUI_ABOUT_INFORMATION) - return None - - - -if __name__ == '__main__': - if True: - egdemo() - else: - # test the new root feature - root = Tk() - msg = """This is a test of a main Tk() window in which we will place an easygui msgbox. 
- It will be an interesting experiment.\n\n""" - messageWidget = Message(root, text=msg, width=1000) - messageWidget.pack(side=TOP, expand=YES, fill=X, padx='3m', pady='3m') - messageWidget = Message(root, text=msg, width=1000) - messageWidget.pack(side=TOP, expand=YES, fill=X, padx='3m', pady='3m') - - - msgbox("this is a test of passing in boxRoot", root=root) - msgbox("this is a second test of passing in boxRoot", root=root) - - reply = enterbox("Enter something", root=root) - writeln("You wrote:", reply) - - reply = enterbox("Enter something else", root=root) - writeln("You wrote:", reply) - root.destroy() diff -Nru remnux-oletools-0.51a/oletools/thirdparty/easygui/LICENSE.txt remnux-oletools-0.51a/oletools/thirdparty/easygui/LICENSE.txt --- remnux-oletools-0.51a/oletools/thirdparty/easygui/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/easygui/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,32 +0,0 @@ -LICENSE INFORMATION - -EasyGui version 0.96 - -Copyright (c) 2010, Stephen Raymond Ferg - -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - - 3. The name of the author may not be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/CONTRIBUTORS.txt remnux-oletools-0.51a/oletools/thirdparty/olefile/CONTRIBUTORS.txt --- remnux-oletools-0.51a/oletools/thirdparty/olefile/CONTRIBUTORS.txt 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/CONTRIBUTORS.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -CONTRIBUTORS for the olefile project -==================================== - -This is a non-exhaustive list of all the people who helped me improve the -olefile project (formerly OleFileIO_PL), in approximative chronological order. -Please contact me if I forgot to mention your name. 
- -A big thank you to all of them: - -- Niko Ehrenfeuchter: added support for Jython -- Niko Ehrenfeuchter, Martijn Berger and Dave Jones: helped fix 4K sector support -- Martin Panter: conversion to Python 3.x/2.6+ -- mete0r_kr: added support for file-like objects -- chuckleberryfinn: fixed bug in getproperties -- Martijn, Ben G.: bug report for 64 bits platforms -- Philippe Lagadec: main author and maintainer since 2005 -- and of course Fredrik Lundh: original author of OleFileIO from 1995 to 2005 diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.html remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,164 +0,0 @@ - - - - - - - - - -

How to use olefile - API

-

This page is part of the documentation for olefile. It explains how to use all its features to parse and write OLE files. For more information about OLE files, see OLE_Overview.

-

olefile can be used as an independent module or with PIL/Pillow. The main functions and methods are explained below.

-

For more information, see also the file olefile.html, sample code at the end of the module itself, and docstrings within the code.

-

Import olefile

-

When the olefile package has been installed, it can be imported in Python applications with this statement:

-
import olefile
-

Before v0.40, olefile was named OleFileIO_PL. To maintain backward compatibility with older applications and samples, a simple script is also installed so that the following statement imports olefile as OleFileIO_PL:

-
import OleFileIO_PL
-

As of version 0.30, the code has been changed to be compatible with Python 3.x. As a consequence, compatibility with Python 2.5 or older is not provided anymore. However, a copy of OleFileIO_PL v0.26 (with some backported enhancements) is available as olefile2.py. When importing the olefile package, it falls back automatically to olefile2 if running on Python 2.5 or older. This is implemented in olefile/__init__.py. (new in v0.40)

-

If you think olefile should stay compatible with Python 2.5 or older, please contact me.

-

Test if a file is an OLE container

-

Use isOleFile to check whether the first bytes of the file contain the magic signature for OLE files, before opening it. isOleFile returns True if it is an OLE file, False otherwise (new in v0.16).

-
assert olefile.isOleFile('myfile.doc')
-

The argument of isOleFile can be (new in v0.41):

-
  • the path of the file to open on disk (a bytes or unicode string smaller than 1536 bytes),
  • or a bytes string containing the file in memory (longer than 1535 bytes),
  • or a file-like object (with read and seek methods).
-
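As an illustration of what isOleFile checks, here is a minimal, self-contained sketch of the magic-byte test. The 8-byte signature below is the standard OLE2/Compound File magic from the [MS-CFB] specification; this is not olefile's actual code, just the idea behind it:

```python
# The OLE2/Compound File magic signature, per [MS-CFB].
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def looks_like_ole(data):
    """Return True if the byte string starts with the OLE magic."""
    return data[:len(OLE_MAGIC)] == OLE_MAGIC

# A real OLE file is at least 1536 bytes (header sector + two sectors).
print(looks_like_ole(OLE_MAGIC + b'\x00' * 1528))  # True
print(looks_like_ole(b'PK\x03\x04'))               # False (ZIP/OOXML, not OLE)
```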

Open an OLE file from disk

-

Create an OleFileIO object with the file path as parameter:

-
ole = olefile.OleFileIO('myfile.doc')
-

Open an OLE file from a bytes string

-

This is useful if the file is already stored in memory as a bytes string.

-
ole = olefile.OleFileIO(s)
-

Note: olefile checks the size of the string provided as argument to determine if it is a file path or the content of an OLE file. An OLE file cannot be smaller than 1536 bytes. If the string is larger than 1535 bytes, then it is expected to contain an OLE file, otherwise it is expected to be a file path.

-

(new in v0.41)

-
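The size heuristic described in the note above can be sketched as a small helper (hypothetical, for illustration only — olefile applies an equivalent test internally):

```python
def classify_olefile_arg(arg):
    """Sketch of the documented heuristic: a string longer than 1535
    bytes is treated as the file content itself, since a valid OLE file
    cannot be smaller than 1536 bytes; anything shorter (or any
    non-string) is treated as a file path or file-like object."""
    if isinstance(arg, (bytes, str)) and len(arg) > 1535:
        return 'content'
    return 'path or file-like'

print(classify_olefile_arg('myfile.doc'))    # 'path or file-like'
print(classify_olefile_arg(b'\x00' * 1536))  # 'content'
```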

Open an OLE file from a file-like object

-

This is useful if the file is not on disk but only available as a file-like object (with read, seek and tell methods).

-
ole = olefile.OleFileIO(f)
-

If the file-like object does not have seek or tell methods, the easiest solution is to read the file entirely in a bytes string before parsing:

-
data = f.read()
-ole = olefile.OleFileIO(data)
-

How to handle malformed OLE files

-

By default, the parser is configured to be as robust and permissive as possible, allowing it to parse most malformed OLE files. Only fatal errors will raise an exception. It is possible to make the parser more strict so that it raises exceptions for files that do not fully conform to the OLE specifications, using the raise_defects option (new in v0.14):

-
ole = olefile.OleFileIO('myfile.doc', raise_defects=olefile.DEFECT_INCORRECT)
-

When the parsing is done, the list of non-fatal issues detected is available as a list in the parsing_issues attribute of the OleFileIO object (new in 0.25):

-
print('Non-fatal issues raised during parsing:')
-if ole.parsing_issues:
-    for exctype, msg in ole.parsing_issues:
-        print('- %s: %s' % (exctype.__name__, msg))
-else:
-    print('None')
-

Open an OLE file in write mode

-

Before using the write features, the OLE file must be opened in read/write mode:

-
ole = olefile.OleFileIO('test.doc', write_mode=True)
-

(new in v0.40)

-

The code for write features is new and it has not been thoroughly tested yet. See issue #6 for the roadmap and the implementation status. If you encounter any issue, please send me your feedback or report issues.

-

Syntax for stream and storage paths

-

Two different syntaxes are allowed for methods that need or return the path of streams and storages:

-
  1. Either a list of strings including all the storages from the root up to the stream/storage name. For example a stream called "WordDocument" at the root will have ['WordDocument'] as full path. A stream called "ThisDocument" located in the storage "Macros/VBA" will be ['Macros', 'VBA', 'ThisDocument']. This is the original syntax from PIL. While hard to read and not very convenient, this syntax works in all cases.

  2. Or a single string with slashes to separate storage and stream names (similar to the Unix path syntax). The previous examples would be 'WordDocument' and 'Macros/VBA/ThisDocument'. This syntax is easier, but may fail if a stream or storage name contains a slash (which is normally not allowed, according to the Microsoft specifications [MS-CFB]). (new in v0.15)
-

Both are case-insensitive.

-

Switching between the two is easy:

-
slash_path = '/'.join(list_path)
-list_path  = slash_path.split('/')
-
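For example, round-tripping the sample path from the syntax description above:

```python
# Round-trip between the two documented path syntaxes.
list_path = ['Macros', 'VBA', 'ThisDocument']
slash_path = '/'.join(list_path)           # single string with slashes
print(slash_path)                          # Macros/VBA/ThisDocument
print(slash_path.split('/') == list_path)  # True: converts back losslessly
```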

Encoding:

-
    -
  • Stream and Storage names are stored in Unicode format in OLE files, which means they may contain special characters (e.g. Greek, Cyrillic, Japanese, etc) that applications must support to avoid exceptions.
  • -
  • On Python 2.x, all stream and storage paths are handled by olefile in bytes strings, using the UTF-8 encoding by default. If you need to use Unicode instead, add the option path_encoding=None when creating the OleFileIO object. This is new in v0.42. Olefile was using the Latin-1 encoding until v0.41, therefore special characters were not supported.
  • -
  • On Python 3.x, all stream and storage paths are handled by olefile in unicode strings, without encoding.
  • -
-

Get the list of streams

-

listdir() returns a list of all the streams contained in the OLE file, including those stored in storages. Each stream is listed itself as a list, as described above.

-
print(ole.listdir())
-

Sample result:

-
[['\x01CompObj'], ['\x05DocumentSummaryInformation'], ['\x05SummaryInformation']
-, ['1Table'], ['Macros', 'PROJECT'], ['Macros', 'PROJECTwm'], ['Macros', 'VBA',
-'Module1'], ['Macros', 'VBA', 'ThisDocument'], ['Macros', 'VBA', '_VBA_PROJECT']
-, ['Macros', 'VBA', 'dir'], ['ObjectPool'], ['WordDocument']]
-
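Since each entry returned by listdir() is a list of storage names ending with the stream name, the result is easy to filter. For example, picking out the streams stored under the Macros/VBA storage from a listing like the sample above (a sketch with hard-coded sample data):

```python
# Sample listdir()-style output (list of paths, each a list of names).
listing = [['\x01CompObj'], ['Macros', 'PROJECT'],
           ['Macros', 'VBA', 'Module1'], ['Macros', 'VBA', 'ThisDocument'],
           ['WordDocument']]

# Keep only the streams located under the Macros/VBA storage.
vba_streams = [entry[-1] for entry in listing if entry[:2] == ['Macros', 'VBA']]
print(vba_streams)  # ['Module1', 'ThisDocument']
```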

Optionally, it is possible to choose whether storages should also be listed, with or without streams (new in v0.26):

-
ole.listdir(streams=False, storages=True)
-

Test if known streams/storages exist

-

exists(path) checks if a given stream or storage exists in the OLE file (new in v0.16). The provided path is case-insensitive.

-
if ole.exists('worddocument'):
-    print("This is a Word document.")
-    if ole.exists('macros/vba'):
-         print("This document seems to contain VBA macros.")
-

Read data from a stream

-

openstream(path) opens a stream as a file-like object. The provided path is case-insensitive.

-

The following example extracts the "Pictures" stream from a PPT file:

-
pics = ole.openstream('Pictures')
-data = pics.read()
-

Get information about a stream/storage

-

Several methods can provide the size, type and timestamps of a given stream/storage:

-

get_size(path) returns the size of a stream in bytes (new in v0.16):

-
s = ole.get_size('WordDocument')
-

get_type(path) returns the type of a stream/storage, as one of the following constants: STGTY_STREAM for a stream, STGTY_STORAGE for a storage, STGTY_ROOT for the root entry, and False for a non-existing path (new in v0.15).

-
t = ole.get_type('WordDocument')
-

get_ctime(path) and get_mtime(path) return the creation and modification timestamps of a stream/storage, as a Python datetime object with UTC timezone. Please note that these timestamps are only present if the application that created the OLE file explicitly stored them, which is rarely the case. When not present, these methods return None (new in v0.26).

-
c = ole.get_ctime('WordDocument')
-m = ole.get_mtime('WordDocument')
-

The root storage is a special case: You can get its creation and modification timestamps using the OleFileIO.root attribute (new in v0.26):

-
c = ole.root.getctime()
-m = ole.root.getmtime()
-

Note: all these methods are case-insensitive.

-

Overwriting a sector

-

The write_sect method can overwrite any sector of the file. If the provided data is smaller than the sector size (normally 512 bytes, sometimes 4KB), data is padded with null characters. (new in v0.40)

-

Here is an example:

-
ole.write_sect(0x17, b'TEST')
-

Note: following the MS-CFB specifications, sector 0 is actually the second sector of the file. You may use -1 as index to write the first sector.

-
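The note above implies a simple mapping from sector index to file offset, which can be sketched as follows (assuming the common 512-byte sector size; some files use 4 KB sectors):

```python
SECTOR_SIZE = 512  # normally 512 bytes, sometimes 4096

def sector_file_offset(sect):
    """Byte offset of a given sector index, per the MS-CFB layout:
    sector 0 starts right after the 512-byte header, so index -1
    addresses the first physical sector (the header itself)."""
    return SECTOR_SIZE * (sect + 1)

print(sector_file_offset(-1))    # 0     -> first physical sector
print(sector_file_offset(0))     # 512   -> second physical sector
print(sector_file_offset(0x17))  # 12288 -> the sector written above
```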

Overwriting a stream

-

The write_stream method can overwrite an existing stream in the file. The new stream data must be the exact same size as the existing one. For now, write_stream can only write streams of 4KB or larger (stored in the main FAT).

-

For example, you may change text in a MS Word document:

-
ole = olefile.OleFileIO('test.doc', write_mode=True)
-data = ole.openstream('WordDocument').read()
-data = data.replace(b'foo', b'bar')
-ole.write_stream('WordDocument', data)
-ole.close()
-

(new in v0.40)

-
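Because write_stream requires the new data to have exactly the same size as the existing stream, it is worth checking this before writing. A minimal sketch of the kind of byte replacement shown above:

```python
data = b'some document bytes with foo inside'
new_data = data.replace(b'foo', b'bar')
# Replacing equal-length byte patterns keeps the overall size unchanged,
# which is what write_stream requires.
assert len(new_data) == len(data)
print(new_data)  # b'some document bytes with bar inside'
```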

Extract metadata

-

get_metadata() will check if standard property streams exist, parse all the properties they contain, and return an OleMetadata object with the found properties as attributes (new in v0.24).

-
meta = ole.get_metadata()
-print('Author:', meta.author)
-print('Title:', meta.title)
-print('Creation date:', meta.create_time)
-# print all metadata:
-meta.dump()
-

Available attributes include:

-
codepage, title, subject, author, keywords, comments, template,
-last_saved_by, revision_number, total_edit_time, last_printed, create_time,
-last_saved_time, num_pages, num_words, num_chars, thumbnail,
-creating_application, security, codepage_doc, category, presentation_target,
-bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
-scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
-chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
-version, dig_sig, content_type, content_status, language, doc_version
-

See the source code of the OleMetadata class for more information.

-

Parse a property stream

-

getproperties(path) can be used to parse any property stream that is not handled by get_metadata. It returns a dictionary indexed by integers. Each integer is the index of the property, pointing to its value. For example in the standard property stream '\x05SummaryInformation', the document title is property #2, and the subject is #3.

-
p = ole.getproperties('specialprops')
-

By default, as in the original PIL version, timestamp properties are converted into a number of seconds since Jan 1, 1601. With the option convert_time, you can obtain more convenient Python datetime objects (UTC timezone). If some time properties should not be converted (such as the total editing time in '\x05SummaryInformation'), the list of their indexes can be passed as no_conversion (new in v0.25):

-
p = ole.getproperties('specialprops', convert_time=True, no_conversion=[10])
-
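Without convert_time, timestamps come back as seconds since Jan 1, 1601 (the Windows FILETIME epoch). Converting such a value by hand can be sketched as below (for illustration only — convert_time=True does this for you):

```python
from datetime import datetime, timedelta

def filetime_seconds_to_datetime(seconds):
    """Convert 'seconds since Jan 1, 1601' into a Python datetime
    (naive UTC here, for illustration)."""
    return datetime(1601, 1, 1) + timedelta(seconds=seconds)

print(filetime_seconds_to_datetime(0))            # 1601-01-01 00:00:00
print(filetime_seconds_to_datetime(11644473600))  # 1970-01-01 00:00:00 (Unix epoch)
```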

Close the OLE file

-

Unless your application is a simple script that terminates after processing an OLE file, do not forget to close each OleFileIO object after parsing to close the file on disk. (new in v0.22)

-
ole.close()
-

Use olefile as a script for testing/debugging

-

olefile can also be used as a script from the command-line to display the structure of an OLE file and its metadata, for example:

-
olefile.py myfile.doc
-

You can use the option -c to check that all streams can be read fully, and -d to generate very verbose debugging information.

-
-

olefile documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.md remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.md --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/API.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,313 +0,0 @@ -How to use olefile - API -======================== - -This page is part of the documentation for [olefile](https://bitbucket.org/decalage/olefileio_pl/wiki). It explains -how to use all its features to parse and write OLE files. For more information about OLE files, see [[OLE_Overview]]. - -olefile can be used as an independent module or with PIL/Pillow. The main functions and methods are explained below. - -For more information, see also the file **olefile.html**, sample code at the end of the module itself, and docstrings within the code. - - - -Import olefile --------------- - -When the olefile package has been installed, it can be imported in Python applications with this statement: - - :::python - import olefile - -Before v0.40, olefile was named OleFileIO_PL. To maintain backward compatibility with older applications and samples, a -simple script is also installed so that the following statement imports olefile as OleFileIO_PL: - - :::python - import OleFileIO_PL - -As of version 0.30, the code has been changed to be compatible with Python 3.x. As a consequence, compatibility with -Python 2.5 or older is not provided anymore. However, a copy of OleFileIO_PL v0.26 (with some backported enhancements) -is available as olefile2.py. When importing the olefile package, it falls back automatically to olefile2 if running on -Python 2.5 or older. This is implemented in olefile/__init__.py. (new in v0.40) - -If you think olefile should stay compatible with Python 2.5 or older, please [contact me](http://decalage.info/contact). 
- - -## Test if a file is an OLE container - -Use **isOleFile** to check if the first bytes of the file contain the Magic for OLE files, before opening it. isOleFile -returns True if it is an OLE file, False otherwise (new in v0.16). - - :::python - assert olefile.isOleFile('myfile.doc') - -The argument of isOleFile can be (new in v0.41): - -- the path of the file to open on disk (bytes or unicode string smaller than 1536 bytes), -- or a bytes string containing the file in memory. (bytes string longer than 1535 bytes), -- or a file-like object (with read and seek methods). - -## Open an OLE file from disk - -Create an **OleFileIO** object with the file path as parameter: - - :::python - ole = olefile.OleFileIO('myfile.doc') - -## Open an OLE file from a bytes string - -This is useful if the file is already stored in memory as a bytes string. - - :::python - ole = olefile.OleFileIO(s) - -Note: olefile checks the size of the string provided as argument to determine if it is a file path or the content of an -OLE file. An OLE file cannot be smaller than 1536 bytes. If the string is larger than 1535 bytes, then it is expected to -contain an OLE file, otherwise it is expected to be a file path. - -(new in v0.41) - - -## Open an OLE file from a file-like object - -This is useful if the file is not on disk but only available as a file-like object (with read, seek and tell methods). - - :::python - ole = olefile.OleFileIO(f) - -If the file-like object does not have seek or tell methods, the easiest solution is to read the file entirely in -a bytes string before parsing: - - :::python - data = f.read() - ole = olefile.OleFileIO(data) - - -## How to handle malformed OLE files - -By default, the parser is configured to be as robust and permissive as possible, allowing to parse most malformed OLE files. Only fatal errors will raise an exception. 
It is possible to tell the parser to be more strict in order to raise exceptions for files that do not fully conform to the OLE specifications, using the raise_defect option (new in v0.14): - - :::python - ole = olefile.OleFileIO('myfile.doc', raise_defects=olefile.DEFECT_INCORRECT) - -When the parsing is done, the list of non-fatal issues detected is available as a list in the parsing_issues attribute of the OleFileIO object (new in 0.25): - - :::python - print('Non-fatal issues raised during parsing:') - if ole.parsing_issues: - for exctype, msg in ole.parsing_issues: - print('- %s: %s' % (exctype.__name__, msg)) - else: - print('None') - - -## Open an OLE file in write mode - -Before using the write features, the OLE file must be opened in read/write mode: - - :::python - ole = olefile.OleFileIO('test.doc', write_mode=True) - -(new in v0.40) - -The code for write features is new and it has not been thoroughly tested yet. See [issue #6](https://bitbucket.org/decalage/olefileio_pl/issue/6/improve-olefileio_pl-to-write-ole-files) for the roadmap and the implementation status. If you encounter any issue, please send me your [feedback](http://www.decalage.info/en/contact) or [report issues](https://bitbucket.org/decalage/olefileio_pl/issues?status=new&status=open). - - -## Syntax for stream and storage paths - -Two different syntaxes are allowed for methods that need or return the path of streams and storages: - -1) Either a **list of strings** including all the storages from the root up to the stream/storage name. For example a -stream called "WordDocument" at the root will have ['WordDocument'] as full path. A stream called "ThisDocument" -located in the storage "Macros/VBA" will be ['Macros', 'VBA', 'ThisDocument']. This is the original syntax from PIL. -While hard to read and not very convenient, this syntax works in all cases. - -2) Or a **single string with slashes** to separate storage and stream names (similar to the Unix path syntax). 
-The previous examples would be 'WordDocument' and 'Macros/VBA/ThisDocument'. This syntax is easier, but may fail if a -stream or storage name contains a slash (which is normally not allowed, according to the Microsoft specifications [MS-CFB]). (new in v0.15) - -Both are case-insensitive. - -Switching between the two is easy: - - :::python - slash_path = '/'.join(list_path) - list_path = slash_path.split('/') - -**Encoding**: - -- Stream and Storage names are stored in Unicode format in OLE files, which means they may contain special characters - (e.g. Greek, Cyrillic, Japanese, etc) that applications must support to avoid exceptions. -- **On Python 2.x**, all stream and storage paths are handled by olefile in bytes strings, using the **UTF-8 encoding** - by default. If you need to use Unicode instead, add the option **path_encoding=None** when creating the OleFileIO - object. This is new in v0.42. Olefile was using the Latin-1 encoding until v0.41, therefore special characters were - not supported. -- **On Python 3.x**, all stream and storage paths are handled by olefile in unicode strings, without encoding. - -## Get the list of streams - -listdir() returns a list of all the streams contained in the OLE file, including those stored in storages. -Each stream is listed itself as a list, as described above. 
- - :::python - print(ole.listdir()) - -Sample result: - - :::python - [['\x01CompObj'], ['\x05DocumentSummaryInformation'], ['\x05SummaryInformation'] - , ['1Table'], ['Macros', 'PROJECT'], ['Macros', 'PROJECTwm'], ['Macros', 'VBA', - 'Module1'], ['Macros', 'VBA', 'ThisDocument'], ['Macros', 'VBA', '_VBA_PROJECT'] - , ['Macros', 'VBA', 'dir'], ['ObjectPool'], ['WordDocument']] - -As an option it is possible to choose if storages should also be listed, with or without streams (new in v0.26): - - :::python - ole.listdir (streams=False, storages=True) - - -## Test if known streams/storages exist: - -exists(path) checks if a given stream or storage exists in the OLE file (new in v0.16). The provided path is case-insensitive. - - :::python - if ole.exists('worddocument'): - print("This is a Word document.") - if ole.exists('macros/vba'): - print("This document seems to contain VBA macros.") - - -## Read data from a stream - -openstream(path) opens a stream as a file-like object. The provided path is case-insensitive. - -The following example extracts the "Pictures" stream from a PPT file: - - :::python - pics = ole.openstream('Pictures') - data = pics.read() - - -## Get information about a stream/storage - -Several methods can provide the size, type and timestamps of a given stream/storage: - -get_size(path) returns the size of a stream in bytes (new in v0.16): - - :::python - s = ole.get_size('WordDocument') - -get_type(path) returns the type of a stream/storage, as one of the following constants: STGTY\_STREAM for a stream, STGTY\_STORAGE for a storage, STGTY\_ROOT for the root entry, and False for a non existing path (new in v0.15). - - :::python - t = ole.get_type('WordDocument') - -get\_ctime(path) and get\_mtime(path) return the creation and modification timestamps of a stream/storage, as a Python datetime object with UTC timezone. 
Please note that these timestamps are only present if the application that created the OLE file explicitly stored them, which is rarely the case. When not present, these methods return None (new in v0.26). - - :::python - c = ole.get_ctime('WordDocument') - m = ole.get_mtime('WordDocument') - -The root storage is a special case: You can get its creation and modification timestamps using the OleFileIO.root attribute (new in v0.26): - - :::python - c = ole.root.getctime() - m = ole.root.getmtime() - -Note: all these methods are case-insensitive. - -## Overwriting a sector - -The write_sect method can overwrite any sector of the file. If the provided data is smaller than the sector size (normally 512 bytes, sometimes 4KB), data is padded with null characters. (new in v0.40) - -Here is an example: - - :::python - ole.write_sect(0x17, b'TEST') - -Note: following the [MS-CFB specifications](http://msdn.microsoft.com/en-us/library/dd942138.aspx), sector 0 is actually the second sector of the file. You may use -1 as index to write the first sector. - - -## Overwriting a stream - -The write_stream method can overwrite an existing stream in the file. The new stream data must be the exact same size as the existing one. For now, write_stream can only write streams of 4KB or larger (stored in the main FAT). - -For example, you may change text in a MS Word document: - - :::python - ole = olefile.OleFileIO('test.doc', write_mode=True) - data = ole.openstream('WordDocument').read() - data = data.replace(b'foo', b'bar') - ole.write_stream('WordDocument', data) - ole.close() - -(new in v0.40) - - - -## Extract metadata - -get_metadata() will check if standard property streams exist, parse all the properties they contain, and return an OleMetadata object with the found properties as attributes (new in v0.24). 
- - :::python - meta = ole.get_metadata() - print('Author:', meta.author) - print('Title:', meta.title) - print('Creation date:', meta.create_time) - # print all metadata: - meta.dump() - -Available attributes include: - - :::text - codepage, title, subject, author, keywords, comments, template, - last_saved_by, revision_number, total_edit_time, last_printed, create_time, - last_saved_time, num_pages, num_words, num_chars, thumbnail, - creating_application, security, codepage_doc, category, presentation_target, - bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips, - scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty, - chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed, - version, dig_sig, content_type, content_status, language, doc_version - -See the source code of the OleMetadata class for more information. - - -## Parse a property stream - -get\_properties(path) can be used to parse any property stream that is not handled by get\_metadata. It returns a dictionary indexed by integers. Each integer is the index of the property, pointing to its value. For example in the standard property stream '\x05SummaryInformation', the document title is property #2, and the subject is #3. - - :::python - p = ole.getproperties('specialprops') - -By default as in the original PIL version, timestamp properties are converted into a number of seconds since Jan 1,1601. With the option convert\_time, you can obtain more convenient Python datetime objects (UTC timezone). 
If some time properties should not be converted (such as total editing time in '\x05SummaryInformation'), the list of indexes can be passed as no_conversion (new in v0.25): - - :::python - p = ole.getproperties('specialprops', convert_time=True, no_conversion=[10]) - - -## Close the OLE file - -Unless your application is a simple script that terminates after processing an OLE file, do not forget to close each OleFileIO object after parsing to close the file on disk. (new in v0.22) - - :::python - ole.close() - -## Use olefile as a script for testing/debugging - -olefile can also be used as a script from the command-line to display the structure of an OLE file and its metadata, for example: - - :::text - olefile.py myfile.doc - -You can use the option -c to check that all streams can be read fully, and -d to generate very verbose debugging information. - --------------------------------------------------------------------------- - -olefile documentation ---------------------- - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- [[OLE_Overview]] -- [[API]] and Usage diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.html remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ - - - - - - - - - -

How to Suggest Improvements, Report Issues or Contribute

-

This is a personal open-source project, developed in my spare time. Any contribution, suggestion, feedback or bug report is welcome.

-

To suggest improvements, report a bug or any issue, please use the issue reporting page, providing all the information and files needed to reproduce the problem.

-

If possible, please attach the debugging output of olefile. To generate it, run the following command:

-
    olefile.py -d -c file >debug.txt 
-

You may also contact the author directly to provide feedback.

-

The code is available in a Mercurial repository on Bitbucket. You may use it to submit enhancements using forks and pull requests.

-
-

olefile documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.md remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.md --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Contribute.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -How to Suggest Improvements, Report Issues or Contribute -======================================================== - -This is a personal open-source project, developed on my spare time. Any contribution, suggestion, feedback or bug report is welcome. - -To **suggest improvements, report a bug or any issue**, please use the [issue reporting page](https://bitbucket.org/decalage/olefileio_pl/issues?status=new&status=open), providing all the information and files to reproduce the problem. - -If possible please join the debugging output of olefile. For this, launch the following command : - - :::text - olefile.py -d -c file >debug.txt - - -You may also [contact the author](http://decalage.info/contact) directly to **provide feedback**. - -The code is available in [a Mercurial repository on Bitbucket](https://bitbucket.org/decalage/olefileio_pl). You may use it to **submit enhancements** using forks and pull requests. - --------------------------------------------------------------------------- - -olefile documentation ---------------------- - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- [[OLE_Overview]] -- [[API]] and Usage diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.html remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,62 +0,0 @@ - - - - - - - - - -

olefile v0.42 documentation

-

This is the home page of the documentation for olefile. The latest version can be found online; a copy is also provided in the doc subfolder of the package.

-

olefile is a Python package to parse, read and write Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc.
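All of these formats are recognized by the same 8-byte header signature defined in Microsoft's [MS-CFB] specification. As a rough standalone illustration of how such files are identified (this sketch does not use olefile itself, and `myfile.doc` is only a placeholder name):

```python
# MS-CFB header signature: the first 8 bytes of every OLE2 / Compound File
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def looks_like_ole(header: bytes) -> bool:
    """Return True if the given leading bytes carry the OLE2 signature."""
    return header[:8] == OLE_MAGIC

# A real check would read the first bytes of the file, e.g.:
# with open('myfile.doc', 'rb') as f:
#     print(looks_like_ole(f.read(8)))
```

olefile's `isOleFile` function performs this kind of check (plus size sanity checks) before parsing, so applications can cheaply filter out non-OLE input.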

-

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the author - Repository - Updates on Twitter

-

Documentation pages

- -

Features

-
    -
  • Parse, read and write any OLE file such as Microsoft Office 97-2003 legacy document formats (Word .doc, Excel .xls, PowerPoint .ppt, Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook messages, StickyNotes, Zeiss AxioVision ZVI files, Olympus FluoView OIB files, etc
  • -
  • List all the streams and storages contained in an OLE file
  • -
  • Open streams as files
  • -
  • Parse and read property streams, containing metadata of the file
  • -
  • Portable, pure Python module, no dependency
  • -
-

olefile can be used as an independent module or with PIL/Pillow.

-

olefile is mostly meant for developers. If you are looking for tools to analyze OLE files or to extract data (especially for security purposes such as malware analysis and forensics), then please also check my python-oletools, which are built upon olefile and provide a higher-level interface.

-

History

-

olefile is based on the OleFileIO module from PIL, the excellent Python Imaging Library, created and maintained by Fredrik Lundh. The olefile API is still compatible with PIL, but since 2005 I have improved the internal implementation significantly, with new features, bugfixes and a more robust design. From 2005 to 2014 the project was called OleFileIO_PL, and in 2014 I changed its name to olefile to celebrate its 9 years and its new write features.

-

As far as I know, this module is the most complete and robust Python implementation for reading MS OLE2 files, and it is portable across several operating systems (please tell me if you know of other similar Python modules).

-

Since 2014 olefile/OleFileIO_PL has been integrated into Pillow, the friendly fork of PIL. olefile will continue to be improved as a separate project, and new versions will be merged into Pillow regularly.

-

Main improvements over the original version of OleFileIO in PIL:

-
    -
  • Compatible with Python 3.x and 2.6+
  • -
  • Many bug fixes
  • -
  • Support for files larger than 6.8MB
  • -
  • Support for 64 bits platforms and big-endian CPUs
  • -
  • Robust: many checks to detect malformed files
  • -
  • Runtime option to choose if malformed files should be parsed or raise exceptions
  • -
  • Improved API
  • -
  • Metadata extraction, stream/storage timestamps (e.g. for document forensics)
  • -
  • Can open file-like objects
  • -
  • Added setup.py and install.bat to ease installation
  • -
  • More convenient slash-based syntax for stream paths
  • -
  • Write features
  • -
-
-

olefile documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.md remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.md --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Home.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,94 +0,0 @@ -olefile v0.42 documentation -=========================== - -This is the home page of the documentation for olefile. The latest version can be found -[online](https://bitbucket.org/decalage/olefileio_pl/wiki), otherwise a copy is provided in the doc subfolder of the package. - -[olefile](http://www.decalage.info/olefile) is a Python package to parse, read and write -[Microsoft OLE2 files](http://en.wikipedia.org/wiki/Compound_File_Binary_Format) -(also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft -Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file -formats, McAfee antivirus quarantine files, etc. 
- - -**Quick links:** -[Home page](http://www.decalage.info/olefile) - -[Download/Install](https://bitbucket.org/decalage/olefileio_pl/wiki/Install) - -[Documentation](https://bitbucket.org/decalage/olefileio_pl/wiki) - -[Report Issues/Suggestions/Questions](https://bitbucket.org/decalage/olefileio_pl/issues?status=new&status=open) - -[Contact the author](http://decalage.info/contact) - -[Repository](https://bitbucket.org/decalage/olefileio_pl) - -[Updates on Twitter](https://twitter.com/decalage2) - -Documentation pages -------------------- - -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- [[OLE_Overview]] -- [[API]] and Usage - - -Features --------- - -- Parse, read and write any OLE file such as Microsoft Office 97-2003 legacy document formats (Word .doc, Excel .xls, - PowerPoint .ppt, Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook messages, StickyNotes, Zeiss - AxioVision ZVI files, Olympus FluoView OIB files, etc -- List all the streams and storages contained in an OLE file -- Open streams as files -- Parse and read property streams, containing metadata of the file -- Portable, pure Python module, no dependency - -olefile can be used as an independent module or with PIL/Pillow. - -olefile is mostly meant for developers. If you are looking for tools to analyze OLE files or to extract data -(especially for security purposes such as malware analysis and forensics), then please also check my -[python-oletools](http://www.decalage.info/python/oletools), which are built upon olefile and provide a higher-level -interface. - - -History -------- - -olefile is based on the OleFileIO module from [PIL](http://www.pythonware.com/products/pil/index.htm), the excellent -Python Imaging Library, created and maintained by Fredrik Lundh. The olefile API is still compatible with PIL, but -since 2005 I have improved the internal implementation significantly, with new features, bugfixes and a more robust -design. 
From 2005 to 2014 the project was called OleFileIO_PL, and in 2014 I changed its name to olefile to celebrate -its 9 years and its new write features. - -As far as I know, this module is the most complete and robust Python implementation to read MS OLE2 files, portable on -several operating systems. (please tell me if you know other similar Python modules) - -Since 2014 olefile/OleFileIO_PL has been integrated into [Pillow](http://python-imaging.github.io/), the friendly fork -of PIL. olefile will continue to be improved as a separate project, and new versions will be merged into Pillow regularly. - -Main improvements over the original version of OleFileIO in PIL: ----------------------------------------------------------------- - -- Compatible with Python 3.x and 2.6+ -- Many bug fixes -- Support for files larger than 6.8MB -- Support for 64 bits platforms and big-endian CPUs -- Robust: many checks to detect malformed files -- Runtime option to choose if malformed files should be parsed or raise exceptions -- Improved API -- Metadata extraction, stream/storage timestamps (e.g. for document forensics) -- Can open file-like objects -- Added setup.py and install.bat to ease installation -- More convenient slash-based syntax for stream paths -- Write features - - --------------------------------------------------------------------------- - -olefile documentation ---------------------- - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- [[OLE_Overview]] -- [[API]] and Usage diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.html remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ - - - - - - - - - -

How to Download and Install olefile

-

Pre-requisites

-

olefile requires Python 2.6, 2.7 or 3.x.

-

For Python 2.5 and older, olefile falls back to an older version (based on OleFileIO_PL 0.26) which might not contain all the enhancements implemented in olefile.

-

Download and Install

-

To use olefile with other Python applications or your own scripts, the simplest solution is to run pip install olefile or easy_install olefile to download and install the package in one go. pip has been included in the standard Python distribution since v2.7.9.

-

To update olefile if a previous version is already installed, run pip install -U olefile.

-

Otherwise you may download and extract the zip archive into a temporary directory and run python setup.py install.

-

On Windows you may simply double-click on install.bat.

-
-

olefile documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.md remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.md --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/Install.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,37 +0,0 @@ -How to Download and Install olefile -=================================== - -Pre-requisites --------------- - -olefile requires Python 2.6, 2.7 or 3.x. - -For Python 2.5 and older, olefile falls back to an older version (based on OleFileIO_PL 0.26) which might not contain -all the enhancements implemented in olefile. - - -Download and Install --------------------- - -To use olefile with other Python applications or your own scripts, the simplest solution is to run **pip install olefile** -or **easy_install olefile**, to download and install the package in one go. Pip is part of the standard Python -distribution since v2.7.9. - -To update olefile if a previous version is already installed, run **pip install -U olefile**. - -Otherwise you may download/extract the [zip archive](https://bitbucket.org/decalage/olefileio_pl/downloads) in a -temporary directory and run **python setup.py install**. - -On Windows you may simply double-click on **install.bat**. 
- --------------------------------------------------------------------------- - -olefile documentation ---------------------- - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- [[OLE_Overview]] -- [[API]] and Usage diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.html remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,40 +0,0 @@ - - - - - - - - - -

License for olefile

-

olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info)

-

All rights reserved.

-

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

-
    -
  • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  • -
  • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  • -
-

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-
-

olefile is based on source code from the OleFileIO module of the Python Imaging Library (PIL) published by Fredrik Lundh under the following license:

-

The Python Imaging Library (PIL) is

-
    -
  • Copyright (c) 1997-2005 by Secret Labs AB
  • -
  • Copyright (c) 1995-2005 by Fredrik Lundh
  • -
-

By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions:

-

Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of Secret Labs AB or the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission.

-

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

-
-

olefile documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.md remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.md --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/License.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,54 +0,0 @@ -License for olefile -=================== - -olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec ([http://www.decalage.info](http://www.decalage.info)) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - ----------- - -olefile is based on source code from the OleFileIO module of the Python Imaging Library (PIL) published by Fredrik Lundh under the following license: - -The Python Imaging Library (PIL) is - -- Copyright (c) 1997-2005 by Secret Labs AB -- Copyright (c) 1995-2005 by Fredrik Lundh - -By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions: - -Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of Secret Labs AB or the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. - -SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- --------------------------------------------------------------------------- - -olefile documentation ---------------------- - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- [[OLE_Overview]] -- [[API]] and Usage diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.html remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ - - - - - - - - - -

About the structure of OLE files

-

This page is part of the documentation for olefile. It provides a brief overview of the structure of Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc.

-

An OLE file can be seen as a mini file system or a Zip archive: It contains streams of data that look like files embedded within the OLE file. Each stream has a name. For example, the main stream of a MS Word document containing its text is named "WordDocument".

-

An OLE file can also contain storages. A storage is a folder that contains streams or other storages. For example, a MS Word document with VBA macros has a storage called "Macros".

-

Special streams can contain properties. A property is a specific value that can be used to store information such as the metadata of a document (title, author, creation date, etc). Property stream names usually start with the character '\x05'.

-

For example, a typical MS Word document may look like this:

-
-

-
-

Go to the API page to see how to use all olefile features to parse OLE files.
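The storage/stream hierarchy described above behaves like a small filesystem. A toy sketch of that idea (plain dicts standing in for storages, bytes for streams; this is illustrative only, not olefile's API), walking the tree to produce `listdir()`-style paths:

```python
# Toy model of an OLE storage tree: storages are dicts, streams are bytes.
# Mirrors the Word example above: a "WordDocument" stream, a "Macros"
# storage, and a property stream whose name starts with '\x05'.
doc = {
    "WordDocument": b"...document text...",
    "Macros": {"VBA": {"ThisDocument": b"...macro code..."}},
    "\x05SummaryInformation": b"...properties...",
}

def walk(storage, prefix=()):
    """Yield the path of every stream, similar to OleFileIO.listdir()."""
    for name, entry in sorted(storage.items()):
        if isinstance(entry, dict):           # a storage: recurse into it
            yield from walk(entry, prefix + (name,))
        else:                                 # a stream: emit its full path
            yield list(prefix + (name,))

paths = list(walk(doc))
```

Real OLE files are parsed with `olefile.OleFileIO`, but the resulting `listdir()` output has exactly this shape: one list of names per stream.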

-
-

olefile documentation

- - - diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.md remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.md --- remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.md 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_Overview.md 1970-01-01 00:00:00.000000000 +0000 @@ -1,29 +0,0 @@ -About the structure of OLE files -================================ - -This page is part of the documentation for [olefile](https://bitbucket.org/decalage/olefileio_pl/wiki). It provides a brief overview of the structure of [Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format)](http://en.wikipedia.org/wiki/Compound_File_Binary_Format), such as Microsoft Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc. - -An OLE file can be seen as a mini file system or a Zip archive: It contains **streams** of data that look like files embedded within the OLE file. Each stream has a name. For example, the main stream of a MS Word document containing its text is named "WordDocument". - -An OLE file can also contain **storages**. A storage is a folder that contains streams or other storages. For example, a MS Word document with VBA macros has a storage called "Macros". - -Special streams can contain **properties**. A property is a specific value that can be used to store information such as the metadata of a document (title, author, creation date, etc). Property stream names usually start with the character '\x05'. - -For example, a typical MS Word document may look like this: - -![](OLE_VBA_sample.png) - -Go to the [[API]] page to see how to use all olefile features to parse OLE files. 
- - --------------------------------------------------------------------------- - -olefile documentation ---------------------- - -- [[Home]] -- [[License]] -- [[Install]] -- [[Contribute]], Suggest Improvements or Report Issues -- [[OLE_Overview]] -- [[API]] and Usage Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_VBA_sample.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/oletools/thirdparty/olefile/doc/OLE_VBA_sample.png differ diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/__init__.py remnux-oletools-0.51a/oletools/thirdparty/olefile/__init__.py --- remnux-oletools-0.51a/oletools/thirdparty/olefile/__init__.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/__init__.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -#!/usr/local/bin/python -# -*- coding: latin-1 -*- -""" -olefile (formerly OleFileIO_PL) - -Module to read/write Microsoft OLE2 files (also called Structured Storage or -Microsoft Compound Document File Format), such as Microsoft Office 97-2003 -documents, Image Composer and FlashPix files, Outlook messages, ... -This version is compatible with Python 2.6+ and 3.x - -Project website: http://www.decalage.info/olefile - -olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info) - -olefile is based on the OleFileIO module from the PIL library v1.1.6 -See: http://www.pythonware.com/products/pil/index.htm - -The Python Imaging Library (PIL) is - Copyright (c) 1997-2005 by Secret Labs AB - Copyright (c) 1995-2005 by Fredrik Lundh - -See source code and LICENSE.txt for information on usage and redistribution. 
-""" - -# first try to import olefile for Python 2.6+/3.x -from .olefile import * -# import metadata not covered by *: -from .olefile import __version__, __author__, __date__ diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/LICENSE.txt remnux-oletools-0.51a/oletools/thirdparty/olefile/LICENSE.txt --- remnux-oletools-0.51a/oletools/thirdparty/olefile/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,56 +0,0 @@ -LICENSE for the olefile package: - -olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec -(http://www.decalage.info) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - ----------- - -olefile is based on source code from the OleFileIO module of the Python -Imaging Library (PIL) published by Fredrik Lundh under the following license: - -The Python Imaging Library (PIL) is -- Copyright (c) 1997-2005 by Secret Labs AB -- Copyright (c) 1995-2005 by Fredrik Lundh - -By obtaining, using, and/or copying this software and/or its associated -documentation, you agree that you have read, understood, and will comply with -the following terms and conditions: - -Permission to use, copy, modify, and distribute this software and its -associated documentation for any purpose and without fee is hereby granted, -provided that the above copyright notice appears in all copies, and that both -that copyright notice and this permission notice appear in supporting -documentation, and that the name of Secret Labs AB or the author not be used -in advertising or publicity pertaining to distribution of the software without -specific, written prior permission. - -SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS -SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN -NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, -INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -PERFORMANCE OF THIS SOFTWARE. diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.html remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,432 +0,0 @@ - - -Python: module olefile - - - - -
 
- 
olefile (version 0.42, 2015-01-24)
index
.\olefile.py
-

# olefile (formerly OleFileIO_PL) version 0.42 2015-01-24
-#
-# Module to read/write Microsoft OLE2 files (also called Structured Storage or
-# Microsoft Compound Document File Format), such as Microsoft Office 97-2003
-# documents, Image Composer and FlashPix files, Outlook messages, ...
-# This version is compatible with Python 2.6+ and 3.x
-#
-# Project website: http://www.decalage.info/olefile
-#
-# olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info)
-#
-# olefile is based on the OleFileIO module from the PIL library v1.1.6
-# See: http://www.pythonware.com/products/pil/index.htm
-#
-# The Python Imaging Library (PIL) is
-# Copyright (c) 1997-2005 by Secret Labs AB
-# Copyright (c) 1995-2005 by Fredrik Lundh
-#
-# See source code and LICENSE.txt for information on usage and redistribution.

-

- - - - - -
 
-Modules
       
array
-datetime
-
io
-os
-
struct
-sys
-

- - - - - -
 
-Classes
       
-
OleFileIO -
OleMetadata -
-

- - - - - - - -
 
-class OleFileIO
   OLE container object

-This class encapsulates the interface to an OLE 2 structured
-storage file.  Use the listdir and openstream methods to
-access the contents of this file.

-Object names are given as a list of strings, one for each subentry
-level.  The root entry should be omitted.  For example, the following
-code extracts all image streams from a Microsoft Image Composer file::

-    ole = OleFileIO("fan.mic")

-    for entry in ole.listdir():
-        if entry[1:2] == ["Image"]:
-            fin = ole.openstream(entry)
-            fout = open(entry[0], "wb")
-            while True:
-                s = fin.read(8192)
-                if not s:
-                    break
-                fout.write(s)

-You can use the viewer application provided with the Python Imaging
-Library to view the resulting files (which happens to be standard
-TIFF files).
 
 Methods defined here:
-
__init__(self, filename=None, raise_defects=40, write_mode=False, debug=False, path_encoding='utf-8')
Constructor for the OleFileIO class.

-:param filename: file to open.

-    - if filename is a string smaller than 1536 bytes, it is the path
-      of the file to open. (bytes or unicode string)
-    - if filename is a string longer than 1535 bytes, it is parsed
-      as the content of an OLE file in memory. (bytes type only)
-    - if filename is a file-like object (with read, seek and tell methods),
-      it is parsed as-is.

-:param raise_defects: minimal level for defects to be raised as exceptions.
-    (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
-    security-oriented application, see source code for details)

-:param write_mode: bool, if True the file is opened in read/write mode instead
-    of read-only by default.

-:param debug: bool, set debug mode

-:param path_encoding: None or str, name of the codec to use for path
-    names (streams and storages), or None for Unicode.
-    Unicode by default on Python 3+, UTF-8 on Python 2.x.
-    (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
- -
close(self)
close the OLE file, to release the file object
- -
dumpdirectory(self)
Dump directory (for debugging only)
- -
dumpfat(self, fat, firstindex=0)
Displays a part of FAT in human-readable form for debugging purpose
- -
dumpsect(self, sector, firstindex=0)
Displays a sector in a human-readable form, for debugging purpose.
- -
exists(self, filename)
Test if given filename exists as a stream or a storage in the OLE
-container.
-Note: filename is case-insensitive.

-:param filename: path of stream in storage tree. (see openstream for syntax)
-:returns: True if object exist, else False.
- -
get_metadata(self)
Parse standard properties streams, return an OleMetadata object
-containing all the available metadata.
-(also stored in the metadata attribute of the OleFileIO object)

-new in version 0.25
- -
get_rootentry_name(self)
Return root entry name. Should usually be 'Root Entry' or 'R' in most
-implementations.
- -
get_size(self, filename)
Return size of a stream in the OLE container, in bytes.

-:param filename: path of stream in storage tree (see openstream for syntax)
-:returns: size in bytes (long integer)
-:exception IOError: if file not found
-:exception TypeError: if this is not a stream.
- -
get_type(self, filename)
Test if given filename exists as a stream or a storage in the OLE
-container, and return its type.

-:param filename: path of stream in storage tree. (see openstream for syntax)
-:returns: False if object does not exist, its entry type (>0) otherwise:

-    - STGTY_STREAM: a stream
-    - STGTY_STORAGE: a storage
-    - STGTY_ROOT: the root entry
- -
getctime(self, filename)
Return creation time of a stream/storage.

-:param filename: path of stream/storage in storage tree. (see openstream for
-    syntax)
-:returns: None if creation time is null, a python datetime object
-    otherwise (UTC timezone)

-new in version 0.26
- -
getmtime(self, filename)
Return modification time of a stream/storage.

-:param filename: path of stream/storage in storage tree. (see openstream for
-    syntax)
-:returns: None if modification time is null, a python datetime object
-    otherwise (UTC timezone)

-new in version 0.26
- -
getproperties(self, filename, convert_time=False, no_conversion=None)
Return properties described in substream.

-:param filename: path of stream in storage tree (see openstream for syntax)
-:param convert_time: bool, if True timestamps will be converted to Python datetime
-:param no_conversion: None or list of int, timestamps not to be converted
-    (for example total editing time is not a real timestamp)

-:returns: a dictionary of values indexed by id (integer)
- -
getsect(self, sect)
Read given sector from file on disk.

-:param sect: int, sector index
-:returns: a string containing the sector data.
- -
listdir(self, streams=True, storages=False)
Return a list of streams and/or storages stored in this file

-:param streams: bool, include streams if True (True by default) - new in v0.26
-:param storages: bool, include storages if True (False by default) - new in v0.26
-    (note: the root storage is never included)
-:returns: list of stream and/or storage paths
- -
loaddirectory(self, sect)
Load the directory.

-:param sect: sector index of directory stream.
- -
loadfat(self, header)
Load the FAT table.
- -
loadfat_sect(self, sect)
Adds the indexes of the given sector to the FAT

-:param sect: string containing the first FAT sector, or array of long integers
-:returns: index of last FAT sector.
- -
loadminifat(self)
Load the MiniFAT table.
- -
open(self, filename, write_mode=False)
Open an OLE2 file in read-only or read/write mode.
-Read and parse the header, FAT and directory.

-:param filename: string-like or file-like object, OLE file to parse

-    - if filename is a string smaller than 1536 bytes, it is the path
-      of the file to open. (bytes or unicode string)
-    - if filename is a string longer than 1535 bytes, it is parsed
-      as the content of an OLE file in memory. (bytes type only)
-    - if filename is a file-like object (with read, seek and tell methods),
-      it is parsed as-is.

-:param write_mode: bool, if True the file is opened in read/write mode instead
-    of read-only by default. (ignored if filename is not a path)
- -
openstream(self, filename)
Open a stream as a read-only file object (BytesIO).
-Note: filename is case-insensitive.

-:param filename: path of stream in storage tree (except root entry), either:

-    - a string using Unix path syntax, for example:
-      'storage_1/storage_1.2/stream'
-    - or a list of storage filenames, path to the desired stream/storage.
-      Example: ['storage_1', 'storage_1.2', 'stream']

-:returns: file object (read-only)
-:exception IOError: if filename not found, or if this is not a stream.
- -
sect2array(self, sect)
convert a sector to an array of 32 bits unsigned integers,
-swapping bytes on big endian CPUs such as PowerPC (old Macs)
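The documented behaviour can be sketched with the `array` module; note the use of the `'I'` type code rather than `'L'`, since `'L'` can be 8 bytes on 64-bit platforms (the 32-bit array bug fixed in v0.19, per the changelog below):

```python
import array
import sys

def sect2array(sector):
    """Interpret raw sector bytes as 32-bit unsigned little-endian
    integers, byte-swapping on big-endian CPUs (illustrative sketch;
    assumes array type 'I' is 4 bytes, true on common platforms)."""
    a = array.array('I', sector)
    if sys.byteorder == 'big':
        a.byteswap()
    return a
```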
- -
write_sect(self, sect, data, padding='\x00')
Write given sector to file on disk.

-:param sect: int, sector index
-:param data: bytes, sector data
-:param padding: single byte, padding character if data < sector size
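The padding behaviour described for `write_sect` can be sketched as a standalone helper (hypothetical, not part of the olefile API):

```python
def pad_sector(data, sector_size=512, padding=b'\x00'):
    """Pad sector data with a repeated single byte up to the sector
    size, as write_sect() does when data is shorter than a sector."""
    if len(data) > sector_size:
        raise ValueError('data is larger than a sector')
    return data + padding * (sector_size - len(data))
```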
- -
write_stream(self, stream_name, data)
Write a stream to disk. For now, it is only possible to replace an
-existing stream by data of the same size.

-:param stream_name: path of stream in storage tree (except root entry), either:

-    - a string using Unix path syntax, for example:
-      'storage_1/storage_1.2/stream'
-    - or a list of storage filenames, path to the desired stream/storage.
-      Example: ['storage_1', 'storage_1.2', 'stream']

-:param data: bytes, data to be written, must be the same size as the original
-    stream.
- -

- - - - - - - -
 
-class OleMetadata
   class to parse and store metadata from standard properties of OLE files.

-Available attributes:
-codepage, title, subject, author, keywords, comments, template,
-last_saved_by, revision_number, total_edit_time, last_printed, create_time,
-last_saved_time, num_pages, num_words, num_chars, thumbnail,
-creating_application, security, codepage_doc, category, presentation_target,
-bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
-scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
-chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
-version, dig_sig, content_type, content_status, language, doc_version

-Note: an attribute is set to None when not present in the properties of the
-OLE file.

-References for SummaryInformation stream:
-- http://msdn.microsoft.com/en-us/library/dd942545.aspx
-- http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
-- http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
-- http://msdn.microsoft.com/en-us/library/aa372045.aspx
-- http://sedna-soft.de/summary-information-stream/
-- http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

-References for DocumentSummaryInformation stream:
-- http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
-- http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
-- http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

-new in version 0.25
 
 Methods defined here:
-
__init__(self)
Constructor for OleMetadata
-All attributes are set to None by default
- -
dump(self)
Dump all metadata, for debugging purposes.
- -
parse_properties(self, olefile)
Parse standard properties of an OLE file, from the streams
-"SummaryInformation" and "DocumentSummaryInformation",
-if present.
-Properties are converted to strings, integers or python datetime objects.
-If a property is not present, its value is set to None.
- -
-Data and other attributes defined here:
-
DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', 'slides', 'notes', 'hidden_slides', 'mm_clips', 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', 'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc', 'link_base', ...]
- -
SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', 'template', 'last_saved_by', 'revision_number', 'total_edit_time', 'last_printed', 'create_time', 'last_saved_time', 'num_pages', 'num_words', 'num_chars', 'thumbnail', 'creating_application', 'security']
- -

- - - - - -
 
-Functions
       
debug = debug_pass(msg)
-
debug_pass(msg)
-
debug_print(msg)
-
filetime2datetime(filetime)
convert FILETIME (64 bits int) to Python datetime.datetime
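The conversion is simple arithmetic: a FILETIME counts 100-nanosecond intervals since 1601-01-01 UTC. A sketch of the documented behaviour:

```python
from datetime import datetime, timedelta

_FILETIME_EPOCH = datetime(1601, 1, 1)  # FILETIME counts from 1601-01-01 UTC

def filetime2datetime(filetime):
    """Convert a 64-bit FILETIME (100-ns intervals since 1601-01-01)
    to a naive datetime in UTC (illustrative sketch)."""
    return _FILETIME_EPOCH + timedelta(microseconds=filetime // 10)

# The Unix epoch is 11644473600 s after the FILETIME epoch:
assert filetime2datetime(116444736000000000) == datetime(1970, 1, 1)
```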
-
i16(c, o=0)
Converts a 2-bytes (16 bits) string to an integer.

-:param c: string containing bytes to convert
-:param o: offset of bytes to convert in string
-
i32(c, o=0)
Converts a 4-bytes (32 bits) string to an integer.

-:param c: string containing bytes to convert
-:param o: offset of bytes to convert in string
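OLE files are little-endian throughout, so both helpers amount to little-endian unsigned unpacking. A `struct`-based sketch of the documented behaviour:

```python
import struct

def i16(c, o=0):
    """Read a little-endian 16-bit unsigned integer at offset o."""
    return struct.unpack_from('<H', c, o)[0]

def i32(c, o=0):
    """Read a little-endian 32-bit unsigned integer at offset o."""
    return struct.unpack_from('<I', c, o)[0]
```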
-
i8(c)
# version for Python 2.x
-
isOleFile(filename)
Test if a file is an OLE container (according to the magic bytes in its header).

-:param filename: string-like or file-like object, OLE file to parse

-    - if filename is a string smaller than 1536 bytes, it is the path
-      of the file to open. (bytes or unicode string)
-    - if filename is a string longer than 1535 bytes, it is parsed
-      as the content of an OLE file in memory. (bytes type only)
-    - if filename is a file-like object (with read and seek methods),
-      it is parsed as-is.

-:returns: True if OLE, False otherwise.
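The magic-bytes test at the core of `isOleFile` can be sketched for an in-memory buffer, using the `MAGIC` value listed in the Data section below (the real function additionally handles paths and file-like objects, and treats strings shorter than `MINIMAL_OLEFILE_SIZE` = 1536 bytes as paths):

```python
MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'  # OLE2 header signature

def looks_like_ole(data):
    """Minimal check of the OLE2 header signature on an in-memory
    buffer (illustrative sketch of what isOleFile() tests)."""
    return data[:len(MAGIC)] == MAGIC
```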
-
set_debug_mode(debug_mode)
Set debug mode on or off, to control display of debugging messages.
-:param debug_mode: True or False
-

- - - - - -
 
-Data
       DEBUG_MODE = False
-DEFAULT_PATH_ENCODING = 'utf-8'
-DEFECT_FATAL = 40
-DEFECT_INCORRECT = 30
-DEFECT_POTENTIAL = 20
-DEFECT_UNSURE = 10
-DIFSECT = 4294967292L
-ENDOFCHAIN = 4294967294L
-FATSECT = 4294967293L
-FREESECT = 4294967295L
-KEEP_UNICODE_NAMES = True
-MAGIC = '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
-MAXREGSECT = 4294967290L
-MAXREGSID = 4294967290L
-MINIMAL_OLEFILE_SIZE = 1536
-NOSTREAM = 4294967295L
-STGTY_EMPTY = 0
-STGTY_LOCKBYTES = 3
-STGTY_PROPERTY = 4
-STGTY_ROOT = 5
-STGTY_STORAGE = 1
-STGTY_STREAM = 2
-UINT32 = 'L'
-VT = {0: 'VT_EMPTY', 1: 'VT_NULL', 2: 'VT_I2', 3: 'VT_I4', 4: 'VT_R4', 5: 'VT_R8', 6: 'VT_CY', 7: 'VT_DATE', 8: 'VT_BSTR', 9: 'VT_DISPATCH', ...}
-VT_BLOB = 65
-VT_BLOB_OBJECT = 70
-VT_BOOL = 11
-VT_BSTR = 8
-VT_CARRAY = 28
-VT_CF = 71
-VT_CLSID = 72
-VT_CY = 6
-VT_DATE = 7
-VT_DECIMAL = 14
-VT_DISPATCH = 9
-VT_EMPTY = 0
-VT_ERROR = 10
-VT_FILETIME = 64
-VT_HRESULT = 25
-VT_I1 = 16
-VT_I2 = 2
-VT_I4 = 3
-VT_I8 = 20
-VT_INT = 22
-VT_LPSTR = 30
-VT_LPWSTR = 31
-VT_NULL = 1
-VT_PTR = 26
-VT_R4 = 4
-VT_R8 = 5
-VT_SAFEARRAY = 27
-VT_STORAGE = 67
-VT_STORED_OBJECT = 69
-VT_STREAM = 66
-VT_STREAMED_OBJECT = 68
-VT_UI1 = 17
-VT_UI2 = 18
-VT_UI4 = 19
-VT_UI8 = 21
-VT_UINT = 23
-VT_UNKNOWN = 13
-VT_USERDEFINED = 29
-VT_VARIANT = 12
-VT_VECTOR = 4096
-VT_VOID = 24
-WORD_CLSID = '00020900-0000-0000-C000-000000000046'
-__author__ = 'Philippe Lagadec'
-__date__ = '2015-01-24'
-__version__ = '0.42'
-keyword = 'VT_UNKNOWN'
-print_function = _Feature((2, 6, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0), 65536)
-var = 13

- - - - - -
 
-Author
       Philippe Lagadec
- \ No newline at end of file diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.py remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.py --- remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/olefile.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,2448 +0,0 @@ -#!/usr/bin/env python - -# olefile (formerly OleFileIO_PL) -# -# Module to read/write Microsoft OLE2 files (also called Structured Storage or -# Microsoft Compound Document File Format), such as Microsoft Office 97-2003 -# documents, Image Composer and FlashPix files, Outlook messages, ... -# This version is compatible with Python 2.6+ and 3.x -# -# Project website: http://www.decalage.info/olefile -# -# olefile is copyright (c) 2005-2016 Philippe Lagadec (http://www.decalage.info) -# -# olefile is based on the OleFileIO module from the PIL library v1.1.6 -# See: http://www.pythonware.com/products/pil/index.htm -# -# The Python Imaging Library (PIL) is -# Copyright (c) 1997-2005 by Secret Labs AB -# Copyright (c) 1995-2005 by Fredrik Lundh -# -# See source code and LICENSE.txt for information on usage and redistribution. - - -# Since OleFileIO_PL v0.30, only Python 2.6+ and 3.x is supported -# This import enables print() as a function rather than a keyword -# (main requirement to be compatible with Python 3.x) -# The comment on the line below should be printed on Python 2.5 or older: -from __future__ import print_function # This version of olefile requires Python 2.6+ or 3.x. - - -__author__ = "Philippe Lagadec" -__date__ = "2016-04-26" -__version__ = '0.44' - -#--- LICENSE ------------------------------------------------------------------ - -# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec -# (http://www.decalage.info) -# -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# ---------- -# PIL License: -# -# olefile is based on source code from the OleFileIO module of the Python -# Imaging Library (PIL) published by Fredrik Lundh under the following license: - -# The Python Imaging Library (PIL) is -# Copyright (c) 1997-2005 by Secret Labs AB -# Copyright (c) 1995-2005 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its associated -# documentation, you agree that you have read, understood, and will comply with -# the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and its -# associated documentation for any purpose and without fee is hereby granted, -# provided that the above copyright notice appears in all copies, and that both -# that copyright notice and this permission notice appear in supporting -# documentation, and that the name of Secret Labs AB or the author(s) not be used -# in advertising or publicity pertaining to distribution of the software -# without specific, written prior permission. -# -# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS -# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. -# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, -# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM -# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR -# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THIS SOFTWARE. 
- -#----------------------------------------------------------------------------- -# CHANGELOG: (only olefile/OleFileIO_PL changes compared to PIL 1.1.6) -# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility -# (all changes flagged with [PL]) -# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise -# exceptions in OleStream.__init__() -# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) -# - added some constants -# - added header values checks -# - added some docstrings -# - getsect: bugfix in case sectors >512 bytes -# - getsect: added conformity checks -# - DEBUG_MODE constant to activate debug display -# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments -# - updated license -# - converted tabs to 4 spaces -# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity -# - improved _unicode() to use Python 2.x unicode support -# - fixed bug in OleDirectoryEntry -# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops -# - fixed OleStream which didn't check stream size -# - added/improved many docstrings and comments -# - moved helper functions _unicode and _clsid out of -# OleFileIO class -# - improved OleFileIO._find() to add Unix path syntax -# - OleFileIO._find() is now case-insensitive -# - added get_type() and get_rootentry_name() -# - rewritten loaddirectory and OleDirectoryEntry -# 2007-11-27 v0.16 PL: - added OleDirectoryEntry.kids_dict -# - added detection of duplicate filenames in storages -# - added detection of duplicate references to streams -# - added get_size() and exists() to OleDirectoryEntry -# - added isOleFile to check header before parsing -# - added __all__ list to control public keywords in pydoc -# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory -# - improved _unicode(), added workarounds for Python <2.3 -# - added set_debug_mode and -d option to set debug mode -# - fixed bugs in OleFileIO.open and OleDirectoryEntry -# - 
added safety check in main for large or binary -# properties -# - allow size>0 for storages for some implementations -# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and -# streams -# - added option '-c' in main to check all streams -# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms -# (thanks to Ben G. and Martijn for reporting the bug) -# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str -# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs -# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn -# (https://bitbucket.org/decalage/olefileio_pl/issue/7) -# - added close method to OleFileIO (fixed issue #2) -# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) -# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python -# datetime -# - main: displays properties with date format -# - new class OleMetadata to parse standard properties -# - added get_metadata method -# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata -# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps -# - OleMetaData: total_edit_time is now a number of seconds, -# not a timestamp -# - getproperties: added support for VT_BOOL, VT_INT, V_UINT -# - getproperties: filter out null chars from strings -# - getproperties: raise non-fatal defects instead of -# exceptions when properties cannot be parsed properly -# 2013-05-27 PL: - getproperties: improved exception handling -# - _raise_defect: added option to set exception type -# - all non-fatal issues are now recorded, and displayed -# when run as a script -# 2013-07-11 v0.26 PL: - added methods to get modification and creation times -# of a directory entry or a storage/stream -# - fixed parsing of direntry timestamps -# 2013-07-24 PL: - new options in listdir to list storages and/or streams -# 2014-02-04 v0.30 PL: - upgraded code to support Python 
3.x by Martin Panter -# - several fixes for Python 2.6 (xrange, MAGIC) -# - reused i32 from Pillow's _binary -# 2014-07-18 v0.31 - preliminary support for 4K sectors -# 2014-07-27 v0.31 PL: - a few improvements in OleFileIO.open (header parsing) -# - Fixed loadfat for large files with 4K sectors (issue #3) -# 2014-07-30 v0.32 PL: - added write_sect to write sectors to disk -# - added write_mode option to OleFileIO.__init__ and open -# 2014-07-31 PL: - fixed padding in write_sect for Python 3, added checks -# - added write_stream to write a stream to disk -# 2014-09-26 v0.40 PL: - renamed OleFileIO_PL to olefile -# 2014-11-09 NE: - added support for Jython (Niko Ehrenfeuchter) -# 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE -# data in a string buffer and file-like objects. -# 2014-11-21 PL: - updated comments according to Pillow's commits -# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1 -# to UTF-8 on Python 2.x (Unicode on Python 3.x) -# - added path_encoding option to override the default -# - fixed a bug in _list when a storage is empty -# 2015-04-17 v0.43 PL: - slight changes in OleDirectoryEntry -# 2015-10-19 - fixed issue #26 in OleFileIO.getproperties -# (using id and type as local variable names) -# 2015-10-29 - replaced debug() with proper logging -# - use optparse to handle command line options -# - improved attribute names in OleFileIO class -# 2015-11-05 - fixed issue #27 by correcting the MiniFAT sector -# cutoff size if invalid. 
-# 2016-02-02 - logging is disabled by default -# 2016-04-26 v0.44 PL: - added enable_logging -# - renamed _OleDirectoryEntry and _OleStream without '_' -# - in OleStream use _raise_defect instead of exceptions -# 2016-04-27 - added support for incomplete streams and incorrect -# directory entries (to read malformed documents) -# 2016-05-04 - fixed slight bug in OleStream - -#----------------------------------------------------------------------------- -# TODO (for version 1.0): -# + get rid of print statements, to simplify Python 2.x and 3.x support -# + add is_stream and is_storage -# + remove leading and trailing slashes where a path is used -# + add functions path_list2str and path_str2list -# + fix how all the methods handle unicode str and/or bytes as arguments -# + add path attrib to _OleDirEntry, set it once and for all in init or -# append_kids (then listdir/_list can be simplified) -# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... -# - add underscore to each private method, to avoid their display in -# pydoc/epydoc documentation - Remove it for classes to be documented -# - replace all raised exceptions with _raise_defect (at least in OleFileIO) -# - merge code from OleStream and OleFileIO.getsect to read sectors -# (maybe add a class for FAT and MiniFAT ?) -# - add method to check all streams (follow sectors chains without storing all -# stream in memory, and report anomalies) -# - use OleDirectoryEntry.kids_dict to improve _find and _list ? 
-# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) -# => if possible avoid converting names to Latin-1 -# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) -# - rewrite OleFileIO.getproperties -# - improve docstrings to show more sample uses -# - see also original notes and FIXME below -# - remove all obsolete FIXMEs -# - OleMetadata: fix version attrib according to -# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx - -# IDEAS: -# - in OleFileIO._open and OleStream, use size=None instead of 0x7FFFFFFF for -# streams with unknown size -# - use arrays of int instead of long integers for FAT/MiniFAT, to improve -# performance and reduce memory usage ? (possible issue with values >2^31) -# - provide tests with unittest (may need write support to create samples) -# - move all debug code (and maybe dump methods) to a separate module, with -# a class which inherits OleFileIO ? -# - fix docstrings to follow epydoc format -# - add support for big endian byte order ? -# - create a simple OLE explorer with wxPython - -# FUTURE EVOLUTIONS to add write support: -# see issue #6 on Bitbucket: -# https://bitbucket.org/decalage/olefileio_pl/issue/6/improve-olefileio_pl-to-write-ole-files - -#----------------------------------------------------------------------------- -# NOTES from PIL 1.1.6: - -# History: -# 1997-01-20 fl Created -# 1997-01-22 fl Fixed 64-bit portability quirk -# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) -# 2004-02-29 fl Changed long hex constants to signed integers -# -# Notes: -# FIXME: sort out sign problem (eliminate long hex constants) -# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] -# FIXME: provide a glob mechanism function (using fnmatchcase) -# -# Literature: -# -# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, -# September 1996. 
-# -# Quotes: -# -# "If this document and functionality of the Software conflict, -# the actual functionality of the Software represents the correct -# functionality" -- Microsoft, in the OLE format specification - -#------------------------------------------------------------------------------ - - -import io -import sys -import struct, array, os.path, datetime, logging - -#=== COMPATIBILITY WORKAROUNDS ================================================ - -#[PL] Define explicitly the public API to avoid private objects in pydoc: -#TODO: add more -# __all__ = ['OleFileIO', 'isOleFile', 'MAGIC'] - -# For Python 3.x, need to redefine long as int: -if str is not bytes: - long = int - -# Need to make sure we use xrange both on Python 2 and 3.x: -try: - # on Python 2 we need xrange: - iterrange = xrange -except: - # no xrange, for Python 3 it was renamed as range: - iterrange = range - -#[PL] workaround to fix an issue with array item size on 64 bits systems: -if array.array('L').itemsize == 4: - # on 32 bits platforms, long integers in an array are 32 bits: - UINT32 = 'L' -elif array.array('I').itemsize == 4: - # on 64 bits platforms, integers in an array are 32 bits: - UINT32 = 'I' -elif array.array('i').itemsize == 4: - # On 64 bit Jython, signed integers ('i') are the only way to store our 32 - # bit values in an array in a *somewhat* reasonable way, as the otherwise - # perfectly suited 'H' (unsigned int, 32 bits) results in a completely - # unusable behaviour. This is most likely caused by the fact that Java - # doesn't have unsigned values, and thus Jython's "array" implementation, - # which is based on "jarray", doesn't have them either. - # NOTE: to trick Jython into converting the values it would normally - # interpret as "signed" into "unsigned", a binary-and operation with - # 0xFFFFFFFF can be used. This way it is possible to use the same comparing - # operations on all platforms / implementations. 
The corresponding code - # lines are flagged with a 'JYTHON-WORKAROUND' tag below. - UINT32 = 'i' -else: - raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...') - - -#[PL] These workarounds were inspired from the Path module -# (see http://www.jorendorff.com/articles/python/path/) -#TODO: test with old Python versions - -# Pre-2.3 workaround for basestring. -try: - basestring -except NameError: - try: - # is Unicode supported (Python >2.0 or >1.6 ?) - basestring = (str, unicode) - except NameError: - basestring = str - -#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode -# if False (default PIL behaviour), all filenames are converted to Latin-1. -KEEP_UNICODE_NAMES = True - -if sys.version_info[0] < 3: - # On Python 2.x, the default encoding for path names is UTF-8: - DEFAULT_PATH_ENCODING = 'utf-8' -else: - # On Python 3.x, the default encoding for path names is Unicode (None): - DEFAULT_PATH_ENCODING = None - - -# === LOGGING ================================================================= - -class NullHandler(logging.Handler): - """ - Log Handler without output, to avoid printing messages if logging is not - configured by the main application. - Python 2.7 has logging.NullHandler, but this is necessary for 2.6: - see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library - """ - def emit(self, record): - pass - -def get_logger(name, level=logging.CRITICAL+1): - """ - Create a suitable logger object for this module. - The goal is not to change settings of the root logger, to avoid getting - other modules' logs on the screen. - If a logger exists with same name, reuse it. (Else it would have duplicate - handlers and messages would be doubled.) - The level is set to CRITICAL+1 by default, to avoid any logging. 
- """ - # First, test if there is already a logger with the same name, else it - # will generate duplicate messages (due to duplicate handlers): - if name in logging.Logger.manager.loggerDict: - #NOTE: another less intrusive but more "hackish" solution would be to - # use getLogger then test if its effective level is not default. - logger = logging.getLogger(name) - # make sure level is OK: - logger.setLevel(level) - return logger - # get a new logger: - logger = logging.getLogger(name) - # only add a NullHandler for this logger, it is up to the application - # to configure its own logging: - logger.addHandler(NullHandler()) - logger.setLevel(level) - return logger - - -# a global logger object used for debugging: -log = get_logger('olefile') - - -def enable_logging(): - """ - Enable logging for this module (disabled by default). - This will set the module-specific logger level to NOTSET, which - means the main application controls the actual logging level. - """ - log.setLevel(logging.NOTSET) - - -#=== CONSTANTS =============================================================== - -# magic bytes that should be at the beginning of every OLE file: -MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' - -#[PL]: added constants for Sector IDs (from AAF specifications) -MAXREGSECT = 0xFFFFFFFA # (-6) maximum SECT -DIFSECT = 0xFFFFFFFC # (-4) denotes a DIFAT sector in a FAT -FATSECT = 0xFFFFFFFD # (-3) denotes a FAT sector in a FAT -ENDOFCHAIN = 0xFFFFFFFE # (-2) end of a virtual stream chain -FREESECT = 0xFFFFFFFF # (-1) unallocated sector - -#[PL]: added constants for Directory Entry IDs (from AAF specifications) -MAXREGSID = 0xFFFFFFFA # (-6) maximum directory entry ID -NOSTREAM = 0xFFFFFFFF # (-1) unallocated directory entry - -#[PL] object types in storage (from AAF specifications) -STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) -STGTY_STORAGE = 1 # element is a storage object -STGTY_STREAM = 2 # element is a stream object -STGTY_LOCKBYTES = 3 # 
element is an ILockBytes object -STGTY_PROPERTY = 4 # element is an IPropertyStorage object -STGTY_ROOT = 5 # element is a root storage - -# Unknown size for a stream (used by OleStream): -UNKNOWN_SIZE = 0x7FFFFFFF - -# -# -------------------------------------------------------------------- -# property types - -VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6; -VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11; -VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17; -VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23; -VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28; -VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64; -VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68; -VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72; -VT_VECTOR=0x1000; - -# map property id to name (for debugging purposes) - -VT = {} -for keyword, var in list(vars().items()): - if keyword[:3] == "VT_": - VT[var] = keyword - -# -# -------------------------------------------------------------------- -# Some common document types (root.clsid fields) - -WORD_CLSID = "00020900-0000-0000-C000-000000000046" -#TODO: check Excel, PPT, ... 
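The constants above (the MAGIC header bytes and the VT_* property-type ids with their reverse-lookup map) can be illustrated standalone. The following sketch is not part of the patched module: `looks_like_ole` is a hypothetical helper, and `_vt_consts` is a hand-picked subset of the VT_* constants, shown only to demonstrate the header test and the id-to-name mapping trick.

```python
# Illustrative standalone sketch (not the olefile code itself).
MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'

def looks_like_ole(data):
    # An OLE2 container starts with these 8 magic bytes.
    return data[:len(MAGIC)] == MAGIC

# Hypothetical subset of the VT_* property-type constants:
_vt_consts = {'VT_EMPTY': 0, 'VT_I2': 2, 'VT_I4': 3,
              'VT_LPSTR': 30, 'VT_FILETIME': 64}
# Same idea as the VT map above: property type id -> name, for debugging.
VT = {value: name for name, value in _vt_consts.items()}

assert looks_like_ole(MAGIC + b'\x00' * 1528)  # minimal 1536-byte file
assert not looks_like_ole(b'PK\x03\x04')       # a ZIP header, not OLE
assert VT[64] == 'VT_FILETIME'
```

The reverse map mirrors what the module builds by scanning `vars()` for names starting with "VT_"; here the subset is spelled out explicitly so the snippet stays self-contained.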
- -#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() -DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect -DEFECT_POTENTIAL = 20 # a potential defect -DEFECT_INCORRECT = 30 # an error according to specifications, but parsing - # can go on -DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is - # impossible - -# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes -# (this is used in isOleFile and OleFile.open) -MINIMAL_OLEFILE_SIZE = 1536 - -#[PL] add useful constants to __all__: -# for key in list(vars().keys()): -# if key.startswith('STGTY_') or key.startswith('DEFECT_'): -# __all__.append(key) - - -#=== FUNCTIONS =============================================================== - -def isOleFile (filename): - """ - Test if a file is an OLE container (according to the magic bytes in its header). - - :param filename: string-like or file-like object, OLE file to parse - - - if filename is a string smaller than 1536 bytes, it is the path - of the file to open. (bytes or unicode string) - - if filename is a string longer than 1535 bytes, it is parsed - as the content of an OLE file in memory. (bytes type only) - - if filename is a file-like object (with read and seek methods), - it is parsed as-is. - - :returns: True if OLE, False otherwise. 
- """ - # check if filename is a string-like or file-like object: - if hasattr(filename, 'read'): - # file-like object: use it directly - header = filename.read(len(MAGIC)) - # just in case, seek back to start of file: - filename.seek(0) - elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE: - # filename is a bytes string containing the OLE file to be parsed: - header = filename[:len(MAGIC)] - else: - # string-like object: filename of file on disk - header = open(filename, 'rb').read(len(MAGIC)) - if header == MAGIC: - return True - else: - return False - - -if bytes is str: - # version for Python 2.x - def i8(c): - return ord(c) -else: - # version for Python 3.x - def i8(c): - return c if c.__class__ is int else c[0] - - -#TODO: replace i16 and i32 with more readable struct.unpack equivalent? - -def i16(c, o = 0): - """ - Converts a 2-bytes (16 bits) string to an integer. - - :param c: string containing bytes to convert - :param o: offset of bytes to convert in string - """ - return i8(c[o]) | (i8(c[o+1])<<8) - - -def i32(c, o = 0): - """ - Converts a 4-bytes (32 bits) string to an integer. - - :param c: string containing bytes to convert - :param o: offset of bytes to convert in string - """ -## return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) -## # [PL]: added int() because "<<" gives long int since Python 2.4 - # copied from Pillow's _binary: - return i8(c[o]) | (i8(c[o+1])<<8) | (i8(c[o+2])<<16) | (i8(c[o+3])<<24) - - -def _clsid(clsid): - """ - Converts a CLSID to a human-readable string. - - :param clsid: string of length 16. - """ - assert len(clsid) == 16 - # if clsid is only made of null bytes, return an empty string: - # (PL: why not simply return the string with zeroes?) 
- if not clsid.strip(b"\0"): - return "" - return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % - ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + - tuple(map(i8, clsid[8:16])))) - - - -def filetime2datetime(filetime): - """ - convert FILETIME (64 bits int) to Python datetime.datetime - """ - # TODO: manage exception when microseconds is too large - # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ - _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) - #log.debug('timedelta days=%d' % (filetime//(10*1000000*3600*24))) - return _FILETIME_null_date + datetime.timedelta(microseconds=filetime//10) - - - -#=== CLASSES ================================================================== - -class OleMetadata: - """ - class to parse and store metadata from standard properties of OLE files. - - Available attributes: - codepage, title, subject, author, keywords, comments, template, - last_saved_by, revision_number, total_edit_time, last_printed, create_time, - last_saved_time, num_pages, num_words, num_chars, thumbnail, - creating_application, security, codepage_doc, category, presentation_target, - bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips, - scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty, - chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed, - version, dig_sig, content_type, content_status, language, doc_version - - Note: an attribute is set to None when not present in the properties of the - OLE file. 
- - References for SummaryInformation stream: - - http://msdn.microsoft.com/en-us/library/dd942545.aspx - - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx - - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx - - http://msdn.microsoft.com/en-us/library/aa372045.aspx - - http://sedna-soft.de/summary-information-stream/ - - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html - - References for DocumentSummaryInformation stream: - - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx - - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx - - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html - - new in version 0.25 - """ - - # attribute names for SummaryInformation stream properties: - # (ordered by property id, starting at 1) - SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', - 'template', 'last_saved_by', 'revision_number', 'total_edit_time', - 'last_printed', 'create_time', 'last_saved_time', 'num_pages', - 'num_words', 'num_chars', 'thumbnail', 'creating_application', - 'security'] - - # attribute names for DocumentSummaryInformation stream properties: - # (ordered by property id, starting at 1) - DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', - 'slides', 'notes', 'hidden_slides', 'mm_clips', - 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', - 'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc', - 'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig', - 'content_type', 'content_status', 'language', 'doc_version'] - - def __init__(self): - """ - Constructor for OleMetadata - All attributes are set to None by default - """ - # properties from SummaryInformation stream - self.codepage = None - self.title = None - self.subject = None - self.author = None - self.keywords = None - 
self.comments = None - self.template = None - self.last_saved_by = None - self.revision_number = None - self.total_edit_time = None - self.last_printed = None - self.create_time = None - self.last_saved_time = None - self.num_pages = None - self.num_words = None - self.num_chars = None - self.thumbnail = None - self.creating_application = None - self.security = None - # properties from DocumentSummaryInformation stream - self.codepage_doc = None - self.category = None - self.presentation_target = None - self.bytes = None - self.lines = None - self.paragraphs = None - self.slides = None - self.notes = None - self.hidden_slides = None - self.mm_clips = None - self.scale_crop = None - self.heading_pairs = None - self.titles_of_parts = None - self.manager = None - self.company = None - self.links_dirty = None - self.chars_with_spaces = None - self.unused = None - self.shared_doc = None - self.link_base = None - self.hlinks = None - self.hlinks_changed = None - self.version = None - self.dig_sig = None - self.content_type = None - self.content_status = None - self.language = None - self.doc_version = None - - - def parse_properties(self, olefile): - """ - Parse standard properties of an OLE file, from the streams - "\x05SummaryInformation" and "\x05DocumentSummaryInformation", - if present. - Properties are converted to strings, integers or python datetime objects. - If a property is not present, its value is set to None. 
- """ - # first set all attributes to None: - for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS): - setattr(self, attrib, None) - if olefile.exists("\x05SummaryInformation"): - # get properties from the stream: - # (converting timestamps to python datetime, except total_edit_time, - # which is property #10) - props = olefile.getproperties("\x05SummaryInformation", - convert_time=True, no_conversion=[10]) - # store them into this object's attributes: - for i in range(len(self.SUMMARY_ATTRIBS)): - # ids for standards properties start at 0x01, until 0x13 - value = props.get(i+1, None) - setattr(self, self.SUMMARY_ATTRIBS[i], value) - if olefile.exists("\x05DocumentSummaryInformation"): - # get properties from the stream: - props = olefile.getproperties("\x05DocumentSummaryInformation", - convert_time=True) - # store them into this object's attributes: - for i in range(len(self.DOCSUM_ATTRIBS)): - # ids for standards properties start at 0x01, until 0x13 - value = props.get(i+1, None) - setattr(self, self.DOCSUM_ATTRIBS[i], value) - - def dump(self): - """ - Dump all metadata, for debugging purposes. - """ - print('Properties from SummaryInformation stream:') - for prop in self.SUMMARY_ATTRIBS: - value = getattr(self, prop) - print('- %s: %s' % (prop, repr(value))) - print('Properties from DocumentSummaryInformation stream:') - for prop in self.DOCSUM_ATTRIBS: - value = getattr(self, prop) - print('- %s: %s' % (prop, repr(value))) - - -#--- OleStream --------------------------------------------------------------- - -class OleStream(io.BytesIO): - """ - OLE2 Stream - - Returns a read-only file object which can be used to read - the contents of a OLE stream (instance of the BytesIO class). - To open a stream, use the openstream method in the OleFile class. - - This function can be used with either ordinary streams, - or ministreams, depending on the offset, sectorsize, and - fat table arguments. 
- - Attributes: - - - size: actual size of data stream, after it was opened. - """ - # FIXME: should store the list of sects obtained by following - # the fat chain, and load new sectors on demand instead of - # loading it all in one go. - - def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio): - """ - Constructor for OleStream class. - - :param fp: file object, the OLE container or the MiniFAT stream - :param sect: sector index of first sector in the stream - :param size: total size of the stream - :param offset: offset in bytes for the first FAT or MiniFAT sector - :param sectorsize: size of one sector - :param fat: array/list of sector indexes (FAT or MiniFAT) - :param filesize: size of OLE file (for debugging) - :param olefileio: OleFileIO object containing this stream - :returns: a BytesIO instance containing the OLE stream - """ - log.debug('OleStream.__init__:') - log.debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' - %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) - self.ole = olefileio - #[PL] To detect malformed documents with FAT loops, we compute the - # expected number of sectors in the stream: - unknown_size = False - if size == UNKNOWN_SIZE: - # this is the case when called from OleFileIO._open(), and stream - # size is not known in advance (for example when reading the - # Directory stream). 
Then we can only guess maximum size: - size = len(fat)*sectorsize - # and we keep a record that size was unknown: - unknown_size = True - log.debug(' stream with UNKNOWN SIZE') - nb_sectors = (size + (sectorsize-1)) // sectorsize - log.debug('nb_sectors = %d' % nb_sectors) - # This number should (at least) be less than the total number of - # sectors in the given FAT: - if nb_sectors > len(fat): - self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large') - # optimization(?): data is first a list of strings, and join() is called - # at the end to concatenate all in one string. - # (this may not be really useful with recent Python versions) - data = [] - # if size is zero, then first sector index should be ENDOFCHAIN: - if size == 0 and sect != ENDOFCHAIN: - log.debug('size == 0 and sect != ENDOFCHAIN:') - self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream') - #[PL] A fixed-length for loop is used instead of an undefined while - # loop to avoid DoS attacks: - for i in range(nb_sectors): - log.debug('Reading stream sector[%d] = %Xh' % (i, sect)) - # Sector index may be ENDOFCHAIN, but only if size was unknown - if sect == ENDOFCHAIN: - if unknown_size: - log.debug('Reached ENDOFCHAIN sector for stream with unknown size') - break - else: - # else this means that the stream is smaller than declared: - log.debug('sect=ENDOFCHAIN before expected size') - self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream') - # sector index should be within FAT: - if sect<0 or sect>=len(fat): - log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat))) - log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors)) -## tmp_data = b"".join(data) -## f = open('test_debug.bin', 'wb') -## f.write(tmp_data) -## f.close() -## log.debug('data read so far: %d bytes' % len(tmp_data)) - self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range') - # stop reading here if the exception is ignored: - 
break - #TODO: merge this code with OleFileIO.getsect() ? - #TODO: check if this works with 4K sectors: - try: - fp.seek(offset + sectorsize * sect) - except: - log.debug('sect=%d, seek=%d, filesize=%d' % - (sect, offset+sectorsize*sect, filesize)) - self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range') - # stop reading here if the exception is ignored: - break - sector_data = fp.read(sectorsize) - # [PL] check if there was enough data: - # Note: if sector is the last of the file, sometimes it is not a - # complete sector (of 512 or 4K), so we may read less than - # sectorsize. - if len(sector_data)!=sectorsize and sect!=(len(fat)-1): - log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' % - (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data))) - log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data))) - self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector') - data.append(sector_data) - # jump to next sector in the FAT: - try: - sect = fat[sect] & 0xFFFFFFFF # JYTHON-WORKAROUND - except IndexError: - # [PL] if pointer is out of the FAT an exception is raised - self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range') - # stop reading here if the exception is ignored: - break - #[PL] Last sector should be an "end of chain" marker: - # if sect != ENDOFCHAIN: - # raise IOError('incorrect last sector index in OLE stream') - data = b"".join(data) - # Data is truncated to the actual stream size: - if len(data) >= size: - log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size)) - data = data[:size] - # actual stream size is stored for future use: - self.size = size - elif unknown_size: - # actual stream size was not known, now we know the size of read - # data: - log.debug('Read data of length %d, the stream size was unknown' % len(data)) - self.size = len(data) - else: - # read data is less than expected: - log.debug('Read data of 
length %d, less than expected stream size %d' % (len(data), size)) - # TODO: provide details in exception message - self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared') - self.size = len(data) - # when all data is read in memory, BytesIO constructor is called - io.BytesIO.__init__(self, data) - # Then the OleStream object can be used as a read-only file object. - - -#--- OleDirectoryEntry ------------------------------------------------------- - -class OleDirectoryEntry: - - """ - OLE2 Directory Entry - """ - #[PL] parsing code moved from OleFileIO.loaddirectory - - # struct to parse directory entries: - # <: little-endian byte order, standard sizes - # (note: this should guarantee that Q returns a 64 bits int) - # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes - # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 - # B: uint8, dir entry type (between 0 and 5) - # B: uint8, color: 0=black, 1=red - # I: uint32, index of left child node in the red-black tree, NOSTREAM if none - # I: uint32, index of right child node in the red-black tree, NOSTREAM if none - # I: uint32, index of child root node if it is a storage, else NOSTREAM - # 16s: CLSID, unique identifier (only used if it is a storage) - # I: uint32, user flags - # Q (was 8s): uint64, creation timestamp or zero - # Q (was 8s): uint64, modification timestamp or zero - # I: uint32, SID of first sector if stream or ministream, SID of 1st sector - # of stream containing ministreams if root entry, 0 otherwise - # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise - # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise - STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII' - # size of a directory entry: 128 bytes - DIRENTRY_SIZE = 128 - assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE - - - def __init__(self, entry, sid, olefile): - """ - Constructor for an OleDirectoryEntry 
object. - Parses a 128-bytes entry from the OLE Directory stream. - - :param entry : string (must be 128 bytes long) - :param sid : index of this directory entry in the OLE file directory - :param olefile: OleFileIO containing this directory entry - """ - self.sid = sid - # ref to olefile is stored for future use - self.olefile = olefile - # kids is a list of children entries, if this entry is a storage: - # (list of OleDirectoryEntry objects) - self.kids = [] - # kids_dict is a dictionary of children entries, indexed by their - # name in lowercase: used to quickly find an entry, and to detect - # duplicates - self.kids_dict = {} - # flag used to detect if the entry is referenced more than once in - # directory: - self.used = False - # decode DirEntry - ( - self.name_raw, # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes - self.namelength, # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 - self.entry_type, - self.color, - self.sid_left, - self.sid_right, - self.sid_child, - clsid, - self.dwUserFlags, - self.createTime, - self.modifyTime, - self.isectStart, - self.sizeLow, - self.sizeHigh - ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry) - if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: - olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') - # only first directory entry can (and should) be root: - if self.entry_type == STGTY_ROOT and sid != 0: - olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry') - if sid == 0 and self.entry_type != STGTY_ROOT: - olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry') - #log.debug(struct.unpack(fmt_entry, entry[:len_entry])) - # name should be at most 31 unicode characters + null character, - # so 64 bytes in total (31*2 + 2): - if self.namelength>64: - olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes') - # if exception not raised, 
namelength is set to the maximum value: - self.namelength = 64 - # only characters without ending null char are kept: - self.name_utf16 = self.name_raw[:(self.namelength-2)] - #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1) - #TODO: check if the name does not contain forbidden characters: - # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'." - # name is converted from UTF-16LE to the path encoding specified in the OleFileIO: - self.name = olefile._decode_utf16_str(self.name_utf16) - - log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) - log.debug(' - type: %d' % self.entry_type) - log.debug(' - sect: %Xh' % self.isectStart) - log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, - self.sid_right, self.sid_child)) - - # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes - # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1 - # or some other value so it cannot be raised as a defect in general: - if olefile.sectorsize == 512: - if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF: - log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % - (olefile.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh)) - olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') - self.size = self.sizeLow - else: - self.size = self.sizeLow + (long(self.sizeHigh)<<32) - log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh)) - - self.clsid = _clsid(clsid) - # a storage should have a null size, BUT some implementations such as - # Word 8 for Mac seem to allow non-null values => Potential defect: - if self.entry_type == STGTY_STORAGE and self.size != 0: - olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0') - # check if stream is not already referenced elsewhere: - if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0: - if self.size < 
olefile.minisectorcutoff \ - and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT - # ministream object - minifat = True - else: - minifat = False - olefile._check_duplicate_stream(self.isectStart, minifat) - - - - def build_storage_tree(self): - """ - Read and build the red-black tree attached to this OleDirectoryEntry - object, if it is a storage. - Note that this method builds a tree of all subentries, so it should - only be called for the root object once. - """ - log.debug('build_storage_tree: SID=%d - %s - sid_child=%d' - % (self.sid, repr(self.name), self.sid_child)) - if self.sid_child != NOSTREAM: - # if child SID is not NOSTREAM, then this entry is a storage. - # Let's walk through the tree of children to fill the kids list: - self.append_kids(self.sid_child) - - # Note from OpenOffice documentation: the safest way is to - # recreate the tree because some implementations may store broken - # red-black trees... - - # in the OLE file, entries are sorted on (length, name). - # for convenience, we sort them on name instead: - # (see rich comparison methods in this class) - self.kids.sort() - - - def append_kids(self, child_sid): - """ - Walk through red-black tree of children of this directory entry to add - all of them to the kids list. (recursive method) - - :param child_sid : index of child directory entry to use, or None when called - first time for the root. (only used during recursion) - """ - log.debug('append_kids: child_sid=%d' % child_sid) - #[PL] this method was added to use simple recursion instead of a complex - # algorithm. 
- # if this is not a storage or a leaf of the tree, nothing to do: - if child_sid == NOSTREAM: - return - # check if child SID is in the proper range: - if child_sid<0 or child_sid>=len(self.olefile.direntries): - self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range') - else: - # get child direntry: - child = self.olefile._load_direntry(child_sid) #direntries[child_sid] - log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' - % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child)) - # the directory entries are organized as a red-black tree. - # (cf. Wikipedia for details) - # First walk through left side of the tree: - self.append_kids(child.sid_left) - # Check if its name is not already used (case-insensitive): - name_lower = child.name.lower() - if name_lower in self.kids_dict: - self.olefile._raise_defect(DEFECT_INCORRECT, - "Duplicate filename in OLE storage") - # Then the child_sid OleDirectoryEntry object is appended to the - # kids list and dictionary: - self.kids.append(child) - self.kids_dict[name_lower] = child - # Check if kid was not already referenced in a storage: - if child.used: - self.olefile._raise_defect(DEFECT_INCORRECT, - 'OLE Entry referenced more than once') - child.used = True - # Finally walk through right side of the tree: - self.append_kids(child.sid_right) - # Afterwards build kid's own tree if it's also a storage: - child.build_storage_tree() - - - def __eq__(self, other): - "Compare entries by name" - return self.name == other.name - - def __lt__(self, other): - "Compare entries by name" - return self.name < other.name - - def __ne__(self, other): - return not self.__eq__(other) - - def __le__(self, other): - return self.__eq__(other) or self.__lt__(other) - - # Reflected __lt__() and __le__() will be used for __gt__() and __ge__() - - #TODO: replace by the same function as MS implementation ? 
- # (order by name length first, then case-insensitive order) - - - def dump(self, tab = 0): - "Dump this entry, and all its subentries (for debug purposes only)" - TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)", - "(property)", "(root)"] - print(" "*tab + repr(self.name), TYPES[self.entry_type], end=' ') - if self.entry_type in (STGTY_STREAM, STGTY_ROOT): - print(self.size, "bytes", end=' ') - print() - if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid: - print(" "*tab + "{%s}" % self.clsid) - - for kid in self.kids: - kid.dump(tab + 2) - - - def getmtime(self): - """ - Return modification time of a directory entry. - - :returns: None if modification time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - if self.modifyTime == 0: - return None - return filetime2datetime(self.modifyTime) - - - def getctime(self): - """ - Return creation time of a directory entry. - - :returns: None if creation time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - if self.createTime == 0: - return None - return filetime2datetime(self.createTime) - - -#--- OleFileIO ---------------------------------------------------------------- - -class OleFileIO: - """ - OLE container object - - This class encapsulates the interface to an OLE 2 structured - storage file. Use the listdir and openstream methods to - access the contents of this file. - - Object names are given as a list of strings, one for each subentry - level. The root entry should be omitted. 
For example, the following - code extracts all image streams from a Microsoft Image Composer file:: - - ole = OleFileIO("fan.mic") - - for entry in ole.listdir(): - if entry[1:2] == "Image": - fin = ole.openstream(entry) - fout = open(entry[0:1], "wb") - while True: - s = fin.read(8192) - if not s: - break - fout.write(s) - - You can use the viewer application provided with the Python Imaging - Library to view the resulting files (which happens to be standard - TIFF files). - """ - - def __init__(self, filename=None, raise_defects=DEFECT_FATAL, - write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING): - """ - Constructor for the OleFileIO class. - - :param filename: file to open. - - - if filename is a string smaller than 1536 bytes, it is the path - of the file to open. (bytes or unicode string) - - if filename is a string longer than 1535 bytes, it is parsed - as the content of an OLE file in memory. (bytes type only) - - if filename is a file-like object (with read, seek and tell methods), - it is parsed as-is. - - :param raise_defects: minimal level for defects to be raised as exceptions. - (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a - security-oriented application, see source code for details) - - :param write_mode: bool, if True the file is opened in read/write mode instead - of read-only by default. - - :param debug: bool, set debug mode (deprecated, not used anymore) - - :param path_encoding: None or str, name of the codec to use for path - names (streams and storages), or None for Unicode. - Unicode by default on Python 3+, UTF-8 on Python 2.x. 
- (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41) - """ - # minimal level for defects to be raised as exceptions: - self._raise_defects_level = raise_defects - # list of defects/issues not raised as exceptions: - # tuples of (exception type, message) - self.parsing_issues = [] - self.write_mode = write_mode - self.path_encoding = path_encoding - self._filesize = None - self.fp = None - if filename: - self.open(filename, write_mode=write_mode) - - - def _raise_defect(self, defect_level, message, exception_type=IOError): - """ - This method should be called for any defect found during file parsing. - It may raise an IOError exception according to the minimal level chosen - for the OleFileIO object. - - :param defect_level: defect level, possible values are: - - - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect - - DEFECT_POTENTIAL : a potential defect - - DEFECT_INCORRECT : an error according to specifications, but parsing can go on - - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible - - :param message: string describing the defect, used with raised exception. - :param exception_type: exception class to be raised, IOError by default - """ - # added by [PL] - if defect_level >= self._raise_defects_level: - log.error(message) - raise exception_type(message) - else: - # just record the issue, no exception raised: - self.parsing_issues.append((exception_type, message)) - log.warning(message) - - - def _decode_utf16_str(self, utf16_str, errors='replace'): - """ - Decode a string encoded in UTF-16 LE format, as found in the OLE - directory or in property streams. Return a string encoded - according to the path_encoding specified for the OleFileIO object. 
- - :param utf16_str: bytes string encoded in UTF-16 LE format - :param errors: str, see python documentation for str.decode() - :return: str, encoded according to path_encoding - """ - unicode_str = utf16_str.decode('UTF-16LE', errors) - if self.path_encoding: - # an encoding has been specified for path names: - return unicode_str.encode(self.path_encoding, errors) - else: - # path_encoding=None, return the Unicode string as-is: - return unicode_str - - - def open(self, filename, write_mode=False): - """ - Open an OLE2 file in read-only or read/write mode. - Read and parse the header, FAT and directory. - - :param filename: string-like or file-like object, OLE file to parse - - - if filename is a string smaller than 1536 bytes, it is the path - of the file to open. (bytes or unicode string) - - if filename is a string longer than 1535 bytes, it is parsed - as the content of an OLE file in memory. (bytes type only) - - if filename is a file-like object (with read, seek and tell methods), - it is parsed as-is. - - :param write_mode: bool, if True the file is opened in read/write mode instead - of read-only by default. (ignored if filename is not a path) - """ - self.write_mode = write_mode - #[PL] check if filename is a string-like or file-like object: - # (it is better to check for a read() method) - if hasattr(filename, 'read'): - #TODO: also check seek and tell methods? 
- # file-like object: use it directly - self.fp = filename - elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE: - # filename is a bytes string containing the OLE file to be parsed: - # convert it to BytesIO - self.fp = io.BytesIO(filename) - else: - # string-like object: filename of file on disk - if self.write_mode: - # open file in mode 'read with update, binary' - # According to https://docs.python.org/2/library/functions.html#open - # 'w' would truncate the file, 'a' may only append on some Unixes - mode = 'r+b' - else: - # read-only mode by default - mode = 'rb' - self.fp = open(filename, mode) - # obtain the filesize by using seek and tell, which should work on most - # file-like objects: - #TODO: do it above, using getsize with filename when possible? - #TODO: fix code to fail with clear exception when filesize cannot be obtained - filesize=0 - self.fp.seek(0, os.SEEK_END) - try: - filesize = self.fp.tell() - finally: - self.fp.seek(0) - self._filesize = filesize - log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize)) - - # lists of streams in FAT and MiniFAT, to detect duplicate references - # (list of indexes of first sectors of each stream) - self._used_streams_fat = [] - self._used_streams_minifat = [] - - header = self.fp.read(512) - - if len(header) != 512 or header[:8] != MAGIC: - log.debug('Magic = %r instead of %r' % (header[:8], MAGIC)) - self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") - - # [PL] header structure according to AAF specifications: - ##Header - ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)] - ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, - ## // 0x1a, 0xe1} for current version - ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/ - ## // GetClassFile uses root directory class id) - ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is - ## // written by reference implementation - 
##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for - ## // 512-byte sectors, 4 for 4 KB sectors - ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering - ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two; - ## // typically 9 indicating 512-byte sectors - ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two; - ## // typically 6 indicating 64-byte mini-sectors - ##USHORT _usReserved; // [22H,02] reserved, must be zero - ##ULONG _ulReserved1; // [24H,04] reserved, must be zero - ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors, - ## // number of SECTs in directory chain for 4 KB - ## // sectors - ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain - ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain - ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must - ## // be zero. The reference implementation - ## // does not support transactions - ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream; - ## // typically 4096 bytes - ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain - ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain - ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain - ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain - ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors - ##}; - - # [PL] header decoding: - # '<' indicates little-endian byte ordering for Intel (cf. 
struct module help) - fmt_header = '<8s16sHHHHHHLLLLLLLLLL' - header_size = struct.calcsize(fmt_header) - log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) - header1 = header[:header_size] - ( - self.header_signature, - self.header_clsid, - self.minor_version, - self.dll_version, - self.byte_order, - self.sector_shift, - self.mini_sector_shift, - self.reserved1, - self.reserved2, - self.num_dir_sectors, - self.num_fat_sectors, - self.first_dir_sector, - self.transaction_signature_number, - self.mini_stream_cutoff_size, - self.first_mini_fat_sector, - self.num_mini_fat_sectors, - self.first_difat_sector, - self.num_difat_sectors - ) = struct.unpack(fmt_header, header1) - log.debug( struct.unpack(fmt_header, header1)) - - if self.header_signature != MAGIC: - # OLE signature should always be present - self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") - if self.header_clsid != bytearray(16): - # according to AAF specs, CLSID should always be zero - self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") - log.debug( "Minor Version = %d" % self.minor_version ) - # TODO: according to MS-CFB, minor version should be 0x003E - log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version ) - if self.dll_version not in [3, 4]: - # version 3: usual format, 512 bytes per sector - # version 4: large format, 4K per sector - self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") - log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order ) - if self.byte_order != 0xFFFE: - # For now only common little-endian documents are handled correctly - self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") - # TODO: add big-endian support for documents created on Mac ? - # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE. 
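The header decoding above can be exercised on a synthetic 76-byte header built with the same `struct` format string. This is an illustrative sketch only (the field values are typical for a 512-byte-sector file, and the buffer is not a complete, valid OLE file):

```python
import struct

MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
fmt_header = '<8s16sHHHHHHLLLLLLLLLL'   # same little-endian format as above

# Pack a synthetic fixed-size header (76 bytes, before the 109 DIFAT entries):
header = struct.pack(fmt_header,
    MAGIC, bytes(16),    # signature, null CLSID
    0x3E, 3,             # minor version, DLL version (3 => 512-byte sectors)
    0xFFFE,              # byte order mark (little-endian)
    9, 6,                # sector shift (2**9=512), mini sector shift (2**6=64)
    0, 0, 0,             # reserved1, reserved2, num_dir_sectors (0 for v3)
    1, 1, 0, 0x1000,     # num_fat_sectors, first_dir_sector, transaction sig, cutoff
    0xFFFFFFFE, 0,       # first_mini_fat_sector (ENDOFCHAIN), num_mini_fat_sectors
    0xFFFFFFFE, 0)       # first_difat_sector (ENDOFCHAIN), num_difat_sectors

fields = struct.unpack(fmt_header, header)
sector_size = 2 ** fields[5]   # fields[5] is sector_shift => 512 here
```

The unpacked tuple follows the same field order as the assignment above, so e.g. `fields[13]` is the mini stream cutoff size.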
- self.sector_size = 2**self.sector_shift - log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size ) - if self.sector_size not in [512, 4096]: - self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header") - if (self.dll_version==3 and self.sector_size!=512) \ - or (self.dll_version==4 and self.sector_size!=4096): - self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header") - self.mini_sector_size = 2**self.mini_sector_shift - log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size ) - if self.mini_sector_size not in [64]: - self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header") - if self.reserved1 != 0 or self.reserved2 != 0: - self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") - log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors ) - # Number of directory sectors (only allowed if DllVersion != 3) - if self.sector_size==512 and self.num_dir_sectors!=0: - self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header") - log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors ) - # num_fat_sectors = number of FAT sectors in the file - log.debug( "First Directory sector = %Xh" % self.first_dir_sector ) - # first_dir_sector = 1st sector containing the directory - log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number ) - # Signature should be zero, BUT some implementations do not follow this - # rule => only a potential defect: - # (according to MS-CFB, may be != 0 for applications supporting file - # transactions) - if self.transaction_signature_number != 0: - self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)") - log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size ) - # MS-CFB: This integer field MUST be set to 0x00001000. 
This field - # specifies the maximum size of a user-defined data stream allocated - # from the mini FAT and mini stream, and that cutoff is 4096 bytes. - # Any user-defined data stream larger than or equal to this cutoff size - # must be allocated as normal sectors from the FAT. - if self.mini_stream_cutoff_size != 0x1000: - self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header") - # if no exception is raised, the cutoff size is fixed to 0x1000 - log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' % - self.mini_stream_cutoff_size) - self.mini_stream_cutoff_size = 0x1000 - # TODO: check if these values are OK - log.debug( "First MiniFAT sector = %Xh" % self.first_mini_fat_sector ) - log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors ) - log.debug( "First DIFAT sector = %Xh" % self.first_difat_sector ) - log.debug( "Number of DIFAT sectors = %d" % self.num_difat_sectors ) - - # calculate the number of sectors in the file - # (-1 because header doesn't count) - self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1 - log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect)) - #TODO: change this test, because an OLE file MAY contain other data - # after the last sector. - - # file clsid - self.header_clsid = _clsid(header[8:24]) - - #TODO: remove redundant attributes, and fix the code which uses them? 
- self.sectorsize = self.sector_size #1 << i16(header, 30) - self.minisectorsize = self.mini_sector_size #1 << i16(header, 32) - self.minisectorcutoff = self.mini_stream_cutoff_size # i32(header, 56) - - # check known streams for duplicate references (these are always in FAT, - # never in MiniFAT): - self._check_duplicate_stream(self.first_dir_sector) - # check MiniFAT only if it is not empty: - if self.num_mini_fat_sectors: - self._check_duplicate_stream(self.first_mini_fat_sector) - # check DIFAT only if it is not empty: - if self.num_difat_sectors: - self._check_duplicate_stream(self.first_difat_sector) - - # Load file allocation tables - self.loadfat(header) - # Load directory. This sets both the direntries list (ordered by sid) - # and the root (ordered by hierarchy) members. - self.loaddirectory(self.first_dir_sector) - self.ministream = None - self.minifatsect = self.first_mini_fat_sector - - - def close(self): - """ - close the OLE file, to release the file object - """ - self.fp.close() - - - def _check_duplicate_stream(self, first_sect, minifat=False): - """ - Checks if a stream has not been already referenced elsewhere. - This method should only be called once for each known stream, and only - if stream size is not null. - - :param first_sect: int, index of first sector of the stream in FAT - :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT - """ - if minifat: - log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect) - used_streams = self._used_streams_minifat - else: - log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect) - # some values can be safely ignored (not a real stream): - if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): - return - used_streams = self._used_streams_fat - #TODO: would it be more efficient using a dict or hash values, instead - # of a list of long ? 
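On the TODO above: a `set` would indeed give O(1) membership tests, where scanning a list is O(n) per stream. A hedged sketch of that alternative (illustrative only, not the olefile implementation; `check_duplicate` is a hypothetical helper):

```python
# Track first-sector indexes already seen; set lookup is O(1) on average.
used_streams = set()

def check_duplicate(first_sect, used=used_streams):
    """Return True if first_sect was already referenced (a duplicate stream)."""
    if first_sect in used:
        return True
    used.add(first_sect)
    return False
```

In olefile the check would still need to ignore the special sentinel values (DIFSECT, FATSECT, ENDOFCHAIN, FREESECT) before consulting the set, exactly as `_check_duplicate_stream` does for the FAT case.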
- if first_sect in used_streams: - self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') - else: - used_streams.append(first_sect) - - - def dumpfat(self, fat, firstindex=0): - """ - Display a part of FAT in human-readable form for debugging purposes - """ - # dictionary to convert special FAT values in human-readable strings - VPL = 8 # values per line (8+1 * 8+1 = 81) - fatnames = { - FREESECT: "..free..", - ENDOFCHAIN: "[ END. ]", - FATSECT: "FATSECT ", - DIFSECT: "DIFSECT " - } - nbsect = len(fat) - nlines = (nbsect+VPL-1)//VPL - print("index", end=" ") - for i in range(VPL): - print("%8X" % i, end=" ") - print() - for l in range(nlines): - index = l*VPL - print("%6X:" % (firstindex+index), end=" ") - for i in range(index, index+VPL): - if i>=nbsect: - break - sect = fat[i] - aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND - if aux in fatnames: - name = fatnames[aux] - else: - if sect == i+1: - name = " --->" - else: - name = "%8X" % sect - print(name, end=" ") - print() - - - def dumpsect(self, sector, firstindex=0): - """ - Display a sector in a human-readable form, for debugging purposes - """ - VPL=8 # number of values per line (8+1 * 8+1 = 81) - tab = array.array(UINT32, sector) - if sys.byteorder == 'big': - tab.byteswap() - nbsect = len(tab) - nlines = (nbsect+VPL-1)//VPL - print("index", end=" ") - for i in range(VPL): - print("%8X" % i, end=" ") - print() - for l in range(nlines): - index = l*VPL - print("%6X:" % (firstindex+index), end=" ") - for i in range(index, index+VPL): - if i>=nbsect: - break - sect = tab[i] - name = "%8X" % sect - print(name, end=" ") - print() - - def sect2array(self, sect): - """ - convert a sector to an array of 32 bits unsigned integers, - swapping bytes on big endian CPUs such as PowerPC (old Macs) - """ - a = array.array(UINT32, sect) - # if CPU is big endian, swap bytes: - if sys.byteorder == 'big': - a.byteswap() - return a - - - def loadfat_sect(self, sect): - """ - Adds the indexes of the given sector to the 
FAT - - :param sect: string containing the first FAT sector, or array of long integers - :returns: index of last FAT sector. - """ - # a FAT sector is an array of ulong integers. - if isinstance(sect, array.array): - # if sect is already an array it is directly used - fat1 = sect - else: - # if it's a raw sector, it is parsed in an array - fat1 = self.sect2array(sect) - # Display the sector contents only if the logging level is debug: - if log.isEnabledFor(logging.DEBUG): - self.dumpsect(sect) - # The FAT is a sector chain starting at the first index of itself. - # initialize isect, just in case: - isect = None - for isect in fat1: - isect = isect & 0xFFFFFFFF # JYTHON-WORKAROUND - log.debug("isect = %X" % isect) - if isect == ENDOFCHAIN or isect == FREESECT: - # the end of the sector chain has been reached - log.debug("found end of sector chain") - break - # read the FAT sector - s = self.getsect(isect) - # parse it as an array of 32 bits integers, and add it to the - # global FAT array - nextfat = self.sect2array(s) - self.fat = self.fat + nextfat - return isect - - - def loadfat(self, header): - """ - Load the FAT table. - """ - # The 1st sector of the file contains sector numbers for the first 109 - # FAT sectors, right after the header which is 76 bytes long. - # (always 109, whatever the sector size: 512 bytes = 76+4*109) - # Additional sectors are described by DIF blocks - - log.debug('Loading the FAT table, starting with the 1st sector after the header') - sect = header[76:512] - log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) ) - #fat = [] - # [PL] FAT is an array of 32 bits unsigned ints, it's more effective - # to use an array than a list in Python. 
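Each FAT entry holds the index of the next sector in a chain, terminated by ENDOFCHAIN. A minimal sketch of following such a chain (hypothetical `walk_chain` helper, not olefile code; real code should also guard against loops and out-of-range indexes, which olefile handles elsewhere):

```python
from array import array

ENDOFCHAIN = 0xFFFFFFFE  # same sentinel value as in olefile

def walk_chain(fat, start):
    """Follow a FAT sector chain and return the ordered list of sector indexes."""
    chain = []
    sect = start
    while sect != ENDOFCHAIN:
        chain.append(sect)
        sect = fat[sect]   # each FAT entry points to the next sector in the chain
    return chain

# FAT describing a stream stored in sectors 2 -> 5 -> 3, as 32-bit unsigned ints:
fat = array('I', [ENDOFCHAIN, ENDOFCHAIN, 5, ENDOFCHAIN, ENDOFCHAIN, 3])
```

This mirrors why the FAT is kept as an `array` of 32-bit unsigned integers: chain walking is pure index lookups.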
- # It's initialized as empty first: - self.fat = array.array(UINT32) - self.loadfat_sect(sect) - #self.dumpfat(self.fat) -## for i in range(0, len(sect), 4): -## ix = i32(sect, i) -## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF: -## if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF: -## break -## s = self.getsect(ix) -## #fat = fat + [i32(s, i) for i in range(0, len(s), 4)] -## fat = fat + array.array(UINT32, s) - if self.num_difat_sectors != 0: - log.debug('DIFAT is used, because file size > 6.8MB.') - # [PL] There's a DIFAT because file is larger than 6.8MB - # some checks just in case: - if self.num_fat_sectors <= 109: - # there must be at least 109 blocks in header and the rest in - # DIFAT, so number of sectors must be >109. - self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') - if self.first_difat_sector >= self.nb_sect: - # initial DIFAT block index must be valid - self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') - log.debug( "DIFAT analysis..." 
) - # We compute the necessary number of DIFAT sectors : - # Number of pointers per DIFAT sector = (sectorsize/4)-1 - # (-1 because the last pointer is the next DIFAT sector number) - nb_difat_sectors = (self.sectorsize//4)-1 - # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) - nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors - log.debug( "nb_difat = %d" % nb_difat ) - if self.num_difat_sectors != nb_difat: - raise IOError('incorrect DIFAT') - isect_difat = self.first_difat_sector - for i in iterrange(nb_difat): - log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) - #TODO: check if corresponding FAT SID = DIFSECT - sector_difat = self.getsect(isect_difat) - difat = self.sect2array(sector_difat) - # Display the sector contents only if the logging level is debug: - if log.isEnabledFor(logging.DEBUG): - self.dumpsect(sector_difat) - self.loadfat_sect(difat[:nb_difat_sectors]) - # last DIFAT pointer is next DIFAT sector: - isect_difat = difat[nb_difat_sectors] - log.debug( "next DIFAT sector: %X" % isect_difat ) - # checks: - if isect_difat not in [ENDOFCHAIN, FREESECT]: - # last DIFAT pointer value must be ENDOFCHAIN or FREESECT - raise IOError('incorrect end of DIFAT') -## if len(self.fat) != self.num_fat_sectors: -## # FAT should contain num_fat_sectors blocks -## print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors)) -## raise IOError('incorrect DIFAT') - else: - log.debug('No DIFAT, because file size < 6.8MB.') - # since FAT is read from fixed-size sectors, it may contain more values - # than the actual number of sectors in the file. 
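The DIFAT sector-count formula above can be checked with a small worked example for 512-byte sectors, where each DIFAT sector holds 127 FAT pointers plus one pointer to the next DIFAT sector (illustrative sketch; `difat_sector_count` is a hypothetical helper):

```python
def difat_sector_count(num_fat_sectors, sector_size=512):
    """DIFAT sectors needed for the FAT sectors not covered by the header.

    The header holds the first 109 FAT sector numbers; each DIFAT sector
    holds (sector_size//4)-1 more, the last slot pointing to the next DIFAT.
    """
    per_difat = sector_size // 4 - 1           # 127 pointers per 512-byte DIFAT sector
    extra_fat = num_fat_sectors - 109          # FAT sectors beyond the header's 109
    return (extra_fat + per_difat - 1) // per_difat   # ceiling division

# 110 FAT sectors -> 1 extra entry -> 1 DIFAT sector;
# 237 FAT sectors -> 128 extra entries -> 2 DIFAT sectors.
```

This matches the `nb_difat` computation in `loadfat`, which raises `IOError('incorrect DIFAT')` when the header's `num_difat_sectors` disagrees.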
- # Keep only the relevant sector indexes: - if len(self.fat) > self.nb_sect: - log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect)) - self.fat = self.fat[:self.nb_sect] - log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect)) - # Display the FAT contents only if the logging level is debug: - if log.isEnabledFor(logging.DEBUG): - log.debug('\nFAT:') - self.dumpfat(self.fat) - - - def loadminifat(self): - """ - Load the MiniFAT table. - """ - # MiniFAT is stored in a standard sub-stream, pointed to by a header - # field. - # NOTE: there are two sizes to take into account for this stream: - # 1) Stream size is calculated according to the number of sectors - # declared in the OLE header. This allocated stream may be more than - # needed to store the actual sector indexes. - # (self.num_mini_fat_sectors is the number of sectors of size self.sector_size) - stream_size = self.num_mini_fat_sectors * self.sector_size - # 2) Actually used size is calculated by dividing the MiniStream size - # (given by root entry size) by the size of mini sectors, *4 for - # 32 bits indexes: - nb_minisectors = (self.root.size + self.mini_sector_size-1) // self.mini_sector_size - used_size = nb_minisectors * 4 - log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % - (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors)) - if used_size > stream_size: - # This is not really a problem, but may indicate a wrong implementation: - self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') - # In any case, first read stream_size: - s = self._open(self.minifatsect, stream_size, force_FAT=True).read() - #[PL] Old code replaced by an array: - #self.minifat = [i32(s, i) for i in range(0, len(s), 4)] - self.minifat = self.sect2array(s) - # Then shrink the array to used size, to avoid indexes out of MiniStream: - 
log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) - self.minifat = self.minifat[:nb_minisectors] - log.debug('loadminifat(): len=%d' % len(self.minifat)) - # Display the FAT contents only if the logging level is debug: - if log.isEnabledFor(logging.DEBUG): - log.debug('\nMiniFAT:') - self.dumpfat(self.minifat) - - def getsect(self, sect): - """ - Read given sector from file on disk. - - :param sect: int, sector index - :returns: a string containing the sector data. - """ - # From [MS-CFB]: A sector number can be converted into a byte offset - # into the file by using the following formula: - # (sector number + 1) x Sector Size. - # This implies that sector #0 of the file begins at byte offset Sector - # Size, not at 0. - - # [PL] the original code in PIL was wrong when sectors are 4KB instead of - # 512 bytes: - #self.fp.seek(512 + self.sectorsize * sect) - #[PL]: added safety checks: - #print("getsect(%X)" % sect) - try: - self.fp.seek(self.sectorsize * (sect+1)) - except: - log.debug('getsect(): sect=%X, seek=%d, filesize=%d' % - (sect, self.sectorsize*(sect+1), self._filesize)) - self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') - sector = self.fp.read(self.sectorsize) - if len(sector) != self.sectorsize: - log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' % - (sect, len(sector), self.sectorsize)) - self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') - return sector - - - def write_sect(self, sect, data, padding=b'\x00'): - """ - Write given sector to file on disk. 
- - :param sect: int, sector index - :param data: bytes, sector data - :param padding: single byte, padding character if data < sector size - """ - if not isinstance(data, bytes): - raise TypeError("write_sect: data must be a bytes string") - if not isinstance(padding, bytes) or len(padding)!=1: - raise TypeError("write_sect: padding must be a bytes string of 1 char") - #TODO: we could allow padding=None for no padding at all - try: - self.fp.seek(self.sectorsize * (sect+1)) - except: - log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' % - (sect, self.sectorsize*(sect+1), self._filesize)) - self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') - if len(data) < self.sectorsize: - # add padding - data += padding * (self.sectorsize - len(data)) - elif len(data) > self.sectorsize: - raise ValueError("Data is larger than sector size") - self.fp.write(data) - - - def loaddirectory(self, sect): - """ - Load the directory. - - :param sect: sector index of directory stream. - """ - log.debug('Loading the Directory:') - # The directory is stored in a standard - # substream, independent of its size. 
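The seek arithmetic used by both `getsect` and `write_sect` follows the [MS-CFB] rule that sector #0 begins right after the header, i.e. at byte offset `sector_size`, not 0. A one-line sketch (hypothetical `sector_offset` helper):

```python
def sector_offset(sect, sector_size=512):
    """Byte offset of sector #sect, per [MS-CFB]: (sector number + 1) x sector size."""
    return (sect + 1) * sector_size

# Sector 0 starts right after the 512-byte header; with 4 KB sectors the
# same formula applies with sector_size=4096.
```

This is exactly why the original PIL code, which seeked to `512 + sectorsize * sect`, was wrong for 4 KB sectors.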
- - # open directory stream as a read-only file: - # (stream size is not known in advance) - self.directory_fp = self._open(sect) - - #[PL] to detect malformed documents and avoid DoS attacks, the maximum - # number of directory entries can be calculated: - max_entries = self.directory_fp.size // 128 - log.debug('loaddirectory: size=%d, max_entries=%d' % - (self.directory_fp.size, max_entries)) - - # Create list of directory entries - #self.direntries = [] - # We start with a list of "None" objects - self.direntries = [None] * max_entries -## for sid in iterrange(max_entries): -## entry = fp.read(128) -## if not entry: -## break -## self.direntries.append(OleDirectoryEntry(entry, sid, self)) - # load root entry: - root_entry = self._load_direntry(0) - # Root entry is the first entry: - self.root = self.direntries[0] - # TODO: read ALL directory entries (ignore bad entries?) - # TODO: adapt build_storage_tree to avoid duplicate reads - # for i in range(1, max_entries): - # self._load_direntry(i) - # read and build all storage trees, starting from the root: - self.root.build_storage_tree() - - - def _load_direntry (self, sid): - """ - Load a directory entry from the directory. - This method should only be called once for each storage/stream when - loading the directory. - - :param sid: index of storage/stream in the directory. - :returns: an OleDirectoryEntry object - - :exception IOError: if the entry has already been referenced. 
- """ - # check if SID is OK: - if sid<0 or sid>=len(self.direntries): - self._raise_defect(DEFECT_FATAL, "OLE directory index out of range") - # check if entry was already referenced: - if self.direntries[sid] is not None: - self._raise_defect(DEFECT_INCORRECT, - "double reference for OLE stream/storage") - # if exception not raised, return the object - return self.direntries[sid] - self.directory_fp.seek(sid * 128) - entry = self.directory_fp.read(128) - self.direntries[sid] = OleDirectoryEntry(entry, sid, self) - return self.direntries[sid] - - - def dumpdirectory(self): - """ - Dump directory (for debugging only) - """ - self.root.dump() - - - def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False): - """ - Open a stream, either in FAT or MiniFAT according to its size. - (openstream helper) - - :param start: index of first sector - :param size: size of stream (or nothing if size is unknown) - :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT - according to size. If True, it will always be opened in FAT. 
- """ - log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' % - (start, size, str(force_FAT))) - # stream size is compared to the mini_stream_cutoff_size threshold: - if size < self.minisectorcutoff and not force_FAT: - # ministream object - if not self.ministream: - # load MiniFAT if it wasn't already done: - self.loadminifat() - # The first sector index of the miniFAT stream is stored in the - # root directory entry: - size_ministream = self.root.size - log.debug('Opening MiniStream: sect=%Xh, size=%d' % - (self.root.isectStart, size_ministream)) - self.ministream = self._open(self.root.isectStart, - size_ministream, force_FAT=True) - return OleStream(fp=self.ministream, sect=start, size=size, - offset=0, sectorsize=self.minisectorsize, - fat=self.minifat, filesize=self.ministream.size, - olefileio=self) - else: - # standard stream - return OleStream(fp=self.fp, sect=start, size=size, - offset=self.sectorsize, - sectorsize=self.sectorsize, fat=self.fat, - filesize=self._filesize, - olefileio=self) - - - def _list(self, files, prefix, node, streams=True, storages=False): - """ - listdir helper - - :param files: list of files to fill in - :param prefix: current location in storage tree (list of names) - :param node: current node (OleDirectoryEntry object) - :param streams: bool, include streams if True (True by default) - new in v0.26 - :param storages: bool, include storages if True (False by default) - new in v0.26 - (note: the root storage is never included) - """ - prefix = prefix + [node.name] - for entry in node.kids: - if entry.entry_type == STGTY_STORAGE: - # this is a storage - if storages: - # add it to the list - files.append(prefix[1:] + [entry.name]) - # check its kids - self._list(files, prefix, entry, streams, storages) - elif entry.entry_type == STGTY_STREAM: - # this is a stream - if streams: - # add it to the list - files.append(prefix[1:] + [entry.name]) - else: - self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an 
entry which is not a stream nor a storage.') - - - def listdir(self, streams=True, storages=False): - """ - Return a list of streams and/or storages stored in this file - - :param streams: bool, include streams if True (True by default) - new in v0.26 - :param storages: bool, include storages if True (False by default) - new in v0.26 - (note: the root storage is never included) - :returns: list of stream and/or storage paths - """ - files = [] - self._list(files, [], self.root, streams, storages) - return files - - - def _find(self, filename): - """ - Returns directory entry of given filename. (openstream helper) - Note: this method is case-insensitive. - - :param filename: path of stream in storage tree (except root entry), either: - - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - or a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] - - :returns: sid of requested filename - :exception IOError: if file not found - """ - - # if filename is a string instead of a list, split it on slashes to - # convert to a list: - if isinstance(filename, basestring): - filename = filename.split('/') - # walk across storage tree, following given path: - node = self.root - for name in filename: - for kid in node.kids: - if kid.name.lower() == name.lower(): - break - else: - raise IOError("file not found") - node = kid - return node.sid - - - def openstream(self, filename): - """ - Open a stream as a read-only file object (BytesIO). - Note: filename is case-insensitive. - - :param filename: path of stream in storage tree (except root entry), either: - - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - or a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] - - :returns: file object (read-only) - :exception IOError: if filename not found, or if this is not a stream. 
- """ - sid = self._find(filename) - entry = self.direntries[sid] - if entry.entry_type != STGTY_STREAM: - raise IOError("this file is not a stream") - return self._open(entry.isectStart, entry.size) - - - def write_stream(self, stream_name, data): - """ - Write a stream to disk. For now, it is only possible to replace an - existing stream by data of the same size. - - :param stream_name: path of stream in storage tree (except root entry), either: - - - a string using Unix path syntax, for example: - 'storage_1/storage_1.2/stream' - - or a list of storage filenames, path to the desired stream/storage. - Example: ['storage_1', 'storage_1.2', 'stream'] - - :param data: bytes, data to be written, must be the same size as the original - stream. - """ - if not isinstance(data, bytes): - raise TypeError("write_stream: data must be a bytes string") - sid = self._find(stream_name) - entry = self.direntries[sid] - if entry.entry_type != STGTY_STREAM: - raise IOError("this is not a stream") - size = entry.size - if size != len(data): - raise ValueError("write_stream: data must be the same size as the existing stream") - if size < self.minisectorcutoff: - raise NotImplementedError("Writing a stream in MiniFAT is not implemented yet") - sect = entry.isectStart - # number of sectors to write - nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize - log.debug('nb_sectors = %d' % nb_sectors) - for i in range(nb_sectors): -## try: -## self.fp.seek(offset + self.sectorsize * sect) -## except: -## log.debug('sect=%d, seek=%d' % -## (sect, offset+self.sectorsize*sect)) -## raise IOError('OLE sector index out of range') - # extract one sector from data, the last one being smaller: - if i<(nb_sectors-1): - data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize] - #TODO: comment this if it works - assert(len(data_sector)==self.sectorsize) - else: - data_sector = data [i*self.sectorsize:] - #TODO: comment this if it works - log.debug('write_stream: size=%d sectorsize=%d 
data_sector=%Xh size%%sectorsize=%d' - % (size, self.sectorsize, len(data_sector), size % self.sectorsize)) - assert(len(data_sector) % self.sectorsize==size % self.sectorsize) - self.write_sect(sect, data_sector) -## self.fp.write(data_sector) - # jump to next sector in the FAT: - try: - sect = self.fat[sect] - except IndexError: - # [PL] if pointer is out of the FAT an exception is raised - raise IOError('incorrect OLE FAT, sector index out of range') - #[PL] Last sector should be a "end of chain" marker: - if sect != ENDOFCHAIN: - raise IOError('incorrect last sector index in OLE stream') - - - def get_type(self, filename): - """ - Test if given filename exists as a stream or a storage in the OLE - container, and return its type. - - :param filename: path of stream in storage tree. (see openstream for syntax) - :returns: False if object does not exist, its entry type (>0) otherwise: - - - STGTY_STREAM: a stream - - STGTY_STORAGE: a storage - - STGTY_ROOT: the root entry - """ - try: - sid = self._find(filename) - entry = self.direntries[sid] - return entry.entry_type - except: - return False - - - def getmtime(self, filename): - """ - Return modification time of a stream/storage. - - :param filename: path of stream/storage in storage tree. (see openstream for - syntax) - :returns: None if modification time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - sid = self._find(filename) - entry = self.direntries[sid] - return entry.getmtime() - - - def getctime(self, filename): - """ - Return creation time of a stream/storage. - - :param filename: path of stream/storage in storage tree. 
(see openstream for - syntax) - :returns: None if creation time is null, a python datetime object - otherwise (UTC timezone) - - new in version 0.26 - """ - sid = self._find(filename) - entry = self.direntries[sid] - return entry.getctime() - - - def exists(self, filename): - """ - Test if given filename exists as a stream or a storage in the OLE - container. - Note: filename is case-insensitive. - - :param filename: path of stream in storage tree. (see openstream for syntax) - :returns: True if object exist, else False. - """ - try: - sid = self._find(filename) - return True - except: - return False - - - def get_size(self, filename): - """ - Return size of a stream in the OLE container, in bytes. - - :param filename: path of stream in storage tree (see openstream for syntax) - :returns: size in bytes (long integer) - :exception IOError: if file not found - :exception TypeError: if this is not a stream. - """ - sid = self._find(filename) - entry = self.direntries[sid] - if entry.entry_type != STGTY_STREAM: - #TODO: Should it return zero instead of raising an exception ? - raise TypeError('object is not an OLE stream') - return entry.size - - - def get_rootentry_name(self): - """ - Return root entry name. Should usually be 'Root Entry' or 'R' in most - implementations. - """ - return self.root.name - - - def getproperties(self, filename, convert_time=False, no_conversion=None): - """ - Return properties described in substream. 
- - :param filename: path of stream in storage tree (see openstream for syntax) - :param convert_time: bool, if True timestamps will be converted to Python datetime - :param no_conversion: None or list of int, timestamps not to be converted - (for example total editing time is not a real timestamp) - - :returns: a dictionary of values indexed by id (integer) - """ - #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx - # make sure no_conversion is a list, just to simplify code below: - if no_conversion == None: - no_conversion = [] - # stream path as a string to report exceptions: - streampath = filename - if not isinstance(streampath, str): - streampath = '/'.join(streampath) - - fp = self.openstream(filename) - - data = {} - - try: - # header - s = fp.read(28) - clsid = _clsid(s[8:24]) - - # format id - s = fp.read(20) - fmtid = _clsid(s[:16]) - fp.seek(i32(s, 16)) - - # get section - s = b"****" + fp.read(i32(fp.read(4))-4) - # number of properties: - num_props = i32(s, 4) - except BaseException as exc: - # catch exception while parsing property header, and only raise - # a DEFECT_INCORRECT then return an empty dict, because this is not - # a fatal error when parsing the whole file - msg = 'Error while parsing properties header in stream %s: %s' % ( - repr(streampath), exc) - self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) - return data - - for i in range(num_props): - property_id = 0 # just in case of an exception - try: - property_id = i32(s, 8+i*8) - offset = i32(s, 12+i*8) - property_type = i32(s, offset) - - log.debug('property id=%d: type=%d offset=%X' % (property_id, property_type, offset)) - - # test for common types first (should perhaps use - # a dictionary instead?) 
- - if property_type == VT_I2: # 16-bit signed integer - value = i16(s, offset+4) - if value >= 32768: - value = value - 65536 - elif property_type == VT_UI2: # 2-byte unsigned integer - value = i16(s, offset+4) - elif property_type in (VT_I4, VT_INT, VT_ERROR): - # VT_I4: 32-bit signed integer - # VT_ERROR: HRESULT, similar to 32-bit signed integer, - # see http://msdn.microsoft.com/en-us/library/cc230330.aspx - value = i32(s, offset+4) - elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer - value = i32(s, offset+4) # FIXME - elif property_type in (VT_BSTR, VT_LPSTR): - # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx - # size is a 32 bits integer, including the null terminator, and - # possibly trailing or embedded null chars - #TODO: if codepage is unicode, the string should be converted as such - count = i32(s, offset+4) - value = s[offset+8:offset+8+count-1] - # remove all null chars: - value = value.replace(b'\x00', b'') - elif property_type == VT_BLOB: - # binary large object (BLOB) - # see http://msdn.microsoft.com/en-us/library/dd942282.aspx - count = i32(s, offset+4) - value = s[offset+8:offset+8+count] - elif property_type == VT_LPWSTR: - # UnicodeString - # see http://msdn.microsoft.com/en-us/library/dd942313.aspx - # "the string should NOT contain embedded or additional trailing - # null characters." - count = i32(s, offset+4) - value = self._decode_utf16_str(s[offset+8:offset+8+count*2]) - elif property_type == VT_FILETIME: - value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) - # FILETIME is a 64-bit int: "number of 100ns periods - # since Jan 1,1601". 
-                    if convert_time and property_id not in no_conversion:
-                        log.debug('Converting property #%d to python datetime, value=%d=%fs'
-                                %(property_id, value, float(value)/10000000))
-                        # convert FILETIME to Python datetime.datetime
-                        # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
-                        _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
-                        log.debug('timedelta days=%d' % (value//(10*1000000*3600*24)))
-                        value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10)
-                    else:
-                        # legacy code kept for backward compatibility: returns a
-                        # number of seconds since Jan 1,1601
-                        value = value // 10000000 # seconds
-                elif property_type == VT_UI1: # 1-byte unsigned integer
-                    value = i8(s[offset+4])
-                elif property_type == VT_CLSID:
-                    value = _clsid(s[offset+4:offset+20])
-                elif property_type == VT_CF:
-                    # PropertyIdentifier or ClipboardData??
-                    # see http://msdn.microsoft.com/en-us/library/dd941945.aspx
-                    count = i32(s, offset+4)
-                    value = s[offset+8:offset+8+count]
-                elif property_type == VT_BOOL:
-                    # VARIANT_BOOL, 16 bits bool, 0x0000=False, 0xFFFF=True
-                    # see http://msdn.microsoft.com/en-us/library/cc237864.aspx
-                    value = bool(i16(s, offset+4))
-                else:
-                    value = None # everything else yields "None"
-                    log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
-
-                # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
-                # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
-                # see http://msdn.microsoft.com/en-us/library/dd942033.aspx
-
-                # FIXME: add support for VT_VECTOR
-                # VT_VECTOR is a 32-bit uint giving the number of items, followed by
-                # the items in sequence. The VT_VECTOR value is combined with the
-                # type of items, e.g.
VT_VECTOR|VT_BSTR - # see http://msdn.microsoft.com/en-us/library/dd942011.aspx - - #print("%08x" % property_id, repr(value), end=" ") - #print("(%s)" % VT[i32(s, offset) & 0xFFF]) - - data[property_id] = value - except BaseException as exc: - # catch exception while parsing each property, and only raise - # a DEFECT_INCORRECT, because parsing can go on - msg = 'Error while parsing property id %d in stream %s: %s' % ( - property_id, repr(streampath), exc) - self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) - - return data - - def get_metadata(self): - """ - Parse standard properties streams, return an OleMetadata object - containing all the available metadata. - (also stored in the metadata attribute of the OleFileIO object) - - new in version 0.25 - """ - self.metadata = OleMetadata() - self.metadata.parse_properties(self) - return self.metadata - -# -# -------------------------------------------------------------------- -# This script can be used to dump the directory of any OLE2 structured -# storage file. 
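The FILETIME arithmetic in `getproperties` above (a 64-bit count of 100 ns ticks since 1601-01-01, divided by 10 to get microseconds) can be exercised standalone. This is a minimal sketch; `filetime_to_datetime` is an illustrative helper name, not part of the olefile API, but the conversion matches the code above:

```python
import datetime

# Epoch of the Windows FILETIME format, as used in getproperties() above.
FILETIME_EPOCH = datetime.datetime(1601, 1, 1, 0, 0, 0)

def filetime_to_datetime(value):
    # A FILETIME counts 100 ns ticks since 1601-01-01 (UTC);
    # integer-dividing by 10 turns ticks into microseconds.
    return FILETIME_EPOCH + datetime.timedelta(microseconds=value // 10)

# 11644473600 seconds separate 1601-01-01 from the Unix epoch:
print(filetime_to_datetime(116444736000000000))  # 1970-01-01 00:00:00
```

The legacy branch in the code above instead divides by 10**7 and returns plain seconds since 1601, which is why `convert_time=True` is needed to get `datetime` objects.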
- -if __name__ == "__main__": - - import sys, optparse - - DEFAULT_LOG_LEVEL = "warning" # Default log level - LOG_LEVELS = { - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL - } - - usage = 'usage: %prog [options] [filename2 ...]' - parser = optparse.OptionParser(usage=usage) - parser.add_option("-c", action="store_true", dest="check_streams", - help='check all streams (for debugging purposes)') - parser.add_option("-d", action="store_true", dest="debug_mode", - help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)') - parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%default)") - - (options, args) = parser.parse_args() - - print('olefile version %s %s - http://www.decalage.info/en/olefile\n' % (__version__, __date__)) - - # Print help if no arguments are passed - if len(args) == 0: - print(__doc__) - parser.print_help() - sys.exit() - - if options.debug_mode: - options.loglevel = 'debug' - - # setup logging to the console - logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') - - # also enable the module's logger: - enable_logging() - - for filename in args: - try: - ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT) - print("-" * 68) - print(filename) - print("-" * 68) - ole.dumpdirectory() - for streamname in ole.listdir(): - if streamname[-1][0] == "\005": - print("%r: properties" % streamname) - try: - props = ole.getproperties(streamname, convert_time=True) - props = sorted(props.items()) - for k, v in props: - #[PL]: avoid to display too large or binary values: - if isinstance(v, (basestring, bytes)): - if len(v) > 50: - v = v[:50] - if isinstance(v, bytes): - # quick and dirty binary check: - for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, - 
21,22,23,24,25,26,27,28,29,30,31): - if c in bytearray(v): - v = '(binary data)' - break - print(" ", k, v) - except: - log.exception('Error while parsing property stream %r' % streamname) - - if options.check_streams: - # Read all streams to check if there are errors: - print('\nChecking streams...') - for streamname in ole.listdir(): - # print name using repr() to convert binary chars to \xNN: - print('-', repr('/'.join(streamname)),'-', end=' ') - st_type = ole.get_type(streamname) - if st_type == STGTY_STREAM: - print('size %d' % ole.get_size(streamname)) - # just try to read stream in memory: - ole.openstream(streamname) - else: - print('NOT a stream : type=%d' % st_type) - print() - -## for streamname in ole.listdir(): -## # print name using repr() to convert binary chars to \xNN: -## print('-', repr('/'.join(streamname)),'-', end=' ') -## print(ole.getmtime(streamname)) -## print() - - print('Modification/Creation times of all directory entries:') - for entry in ole.direntries: - if entry is not None: - print('- %s: mtime=%s ctime=%s' % (entry.name, - entry.getmtime(), entry.getctime())) - print() - - # parse and display metadata: - try: - meta = ole.get_metadata() - meta.dump() - except: - log.exception('Error while parsing metadata') - print() - #[PL] Test a few new methods: - root = ole.get_rootentry_name() - print('Root entry name: "%s"' % root) - if ole.exists('worddocument'): - print("This is a Word document.") - print("type of stream 'WordDocument':", ole.get_type('worddocument')) - print("size :", ole.get_size('worddocument')) - if ole.exists('macros/vba'): - print("This document may contain VBA macros.") - - # print parsing issues: - print('\nNon-fatal issues raised during parsing:') - if ole.parsing_issues: - for exctype, msg in ole.parsing_issues: - print('- %s: %s' % (exctype.__name__, msg)) - else: - print('None') - except: - log.exception('Error while parsing file %r' % filename) - -# this code was developed while listening to The Wedding 
Present "Sea Monsters" diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/README.html remnux-oletools-0.51a/oletools/thirdparty/olefile/README.html --- remnux-oletools-0.51a/oletools/thirdparty/olefile/README.html 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/README.html 1970-01-01 00:00:00.000000000 +0000 @@ -1,81 +0,0 @@ -

olefile (formerly OleFileIO_PL)

-

olefile is a Python package to parse, read and write Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office 97-2003 documents, vbaProject.bin in MS Office 2007+ files, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc.

-

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the author - Repository - Updates on Twitter

-

News

-

Follow all updates and news on Twitter: https://twitter.com/decalage2

-
    -
  • 2016-02-02 v0.43: fixed issues #26 and #27, better handling of malformed files, use python logging.
  • -
  • 2015-01-25 v0.42: improved handling of special characters in stream/storage names on Python 2.x (using UTF-8 instead of Latin-1), fixed bug in listdir with empty storages.
  • -
  • 2014-11-25 v0.41: OleFileIO.open and isOleFile now support OLE files stored in byte strings, fixed installer for python 3, added support for Jython (Niko Ehrenfeuchter)
  • -
  • 2014-10-01 v0.40: renamed OleFileIO_PL to olefile, added initial write support for streams >4K, updated doc and license, improved the setup script.
  • -
  • 2014-07-27 v0.31: fixed support for large files with 4K sectors, thanks to Niko Ehrenfeuchter, Martijn Berger and Dave Jones. Added test scripts from Pillow (by hugovk). Fixed setup for Python 3 (Martin Panter)
  • -
  • 2014-02-04 v0.30: now compatible with Python 3.x, thanks to Martin Panter who did most of the hard work.
  • -
  • 2013-07-24 v0.26: added methods to parse stream/storage timestamps, improved listdir to include storages, fixed parsing of direntry timestamps
  • -
  • 2013-05-27 v0.25: improved metadata extraction, properties parsing and exception handling, fixed issue #12
  • -
  • 2013-05-07 v0.24: new features to extract metadata (get_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime
  • -
  • 2012-10-09: published python-oletools, a package of analysis tools based on OleFileIO_PL
  • -
  • 2012-09-11 v0.23: added support for file-like objects, fixed issue #8
  • -
  • 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method)
  • -
  • 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking
  • -
  • 2010-01-24 v0.21: fixed support for big-endian CPUs, such as PowerPC Macs.
  • -
  • 2009-12-11 v0.20: small bugfix in OleFileIO.open when filename is not plain str.
  • -
  • 2009-12-10 v0.19: fixed support for 64 bits platforms (thanks to Ben G. and Martijn for reporting the bug)
  • -
  • see changelog in source code for more info.
  • -
-

Download/Install

-

If you have pip or setuptools installed (pip is included in Python 2.7.9+), you may simply run pip install olefile or easy_install olefile for the first installation.

-

To update olefile, run pip install -U olefile.

-

Otherwise, see https://bitbucket.org/decalage/olefileio_pl/wiki/Install

-

Features

-
    -
  • Parse, read and write any OLE file such as Microsoft Office 97-2003 legacy document formats (Word .doc, Excel .xls, PowerPoint .ppt, Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook messages, StickyNotes, Zeiss AxioVision ZVI files, Olympus FluoView OIB files, etc
  • -
  • List all the streams and storages contained in an OLE file
  • -
  • Open streams as files
  • -
  • Parse and read property streams, containing metadata of the file
  • -
  • Portable, pure Python module, no dependency
  • -
-
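All OLE2 / Compound File Binary files start with the same fixed 8-byte header signature, which is how olefile recognizes them before parsing. As a minimal stdlib-only sketch (the function name `looks_like_ole` is illustrative; in real code use `olefile.isOleFile`), a file can be pre-screened like this:

```python
# OLE2 / Compound File Binary header signature ("magic" bytes).
OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"

def looks_like_ole(path):
    # Read the first 8 bytes and compare them with the CFB signature;
    # olefile.isOleFile does essentially this check on files or byte strings.
    with open(path, "rb") as f:
        return f.read(8) == OLE_MAGIC
```

This only screens the header; a positive match still has to be parsed (e.g. with `OleFileIO`) before the streams and storages listed above can be read.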

olefile can be used as an independent package or with PIL/Pillow.

-

olefile is mostly meant for developers. If you are looking for tools to analyze OLE files or to extract data (especially for security purposes such as malware analysis and forensics), then please also check my python-oletools, which are built upon olefile and provide a higher-level interface.

-

History

-

olefile is based on the OleFileIO module from PIL, the excellent Python Imaging Library, created and maintained by Fredrik Lundh. The olefile API is still compatible with PIL, but since 2005 I have improved the internal implementation significantly, with new features, bugfixes and a more robust design. From 2005 to 2014 the project was called OleFileIO_PL, and in 2014 I changed its name to olefile to celebrate its 9 years and its new write features.

-

As far as I know, olefile is the most complete and robust Python implementation to read MS OLE2 files, portable on several operating systems. (please tell me if you know other similar Python modules)

-

Since 2014 olefile/OleFileIO_PL has been integrated into Pillow, the friendly fork of PIL. olefile will continue to be improved as a separate project, and new versions will be merged into Pillow regularly.

-

Main improvements over the original version of OleFileIO in PIL:

-
    -
  • Compatible with Python 3.x and 2.6+
  • -
  • Many bug fixes
  • -
  • Support for files larger than 6.8MB
  • -
  • Support for 64 bits platforms and big-endian CPUs
  • -
  • Robust: many checks to detect malformed files
  • -
  • Runtime option to choose if malformed files should be parsed or raise exceptions
  • -
  • Improved API
  • -
  • Metadata extraction, stream/storage timestamps (e.g. for document forensics)
  • -
  • Can open file-like objects
  • -
  • Added setup.py and install.bat to ease installation
  • -
  • More convenient slash-based syntax for stream paths
  • -
  • Write features
  • -
-

Documentation

-

Please see the online documentation for more information, especially the OLE overview and the API page which describe how to use olefile in Python applications. A copy of the same documentation is also provided in the doc subfolder of the olefile package.

-

Real-life examples

-

A real-life example: using OleFileIO_PL for malware analysis and forensics.

-

See also this paper about python tools for forensics, which features olefile.

-

License

-

olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec (http://www.decalage.info)

-

All rights reserved.

-

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

-
    -
  • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  • -
  • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  • -
-

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-
-

olefile is based on source code from the OleFileIO module of the Python Imaging Library (PIL) published by Fredrik Lundh under the following license:

-

The Python Imaging Library (PIL) is

-
    -
  • Copyright (c) 1997-2005 by Secret Labs AB
  • -
  • Copyright (c) 1995-2005 by Fredrik Lundh
  • -
-

By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions:

-

Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of Secret Labs AB or the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission.

-

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

diff -Nru remnux-oletools-0.51a/oletools/thirdparty/olefile/README.rst remnux-oletools-0.51a/oletools/thirdparty/olefile/README.rst --- remnux-oletools-0.51a/oletools/thirdparty/olefile/README.rst 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/olefile/README.rst 1970-01-01 00:00:00.000000000 +0000 @@ -1,226 +0,0 @@ -olefile (formerly OleFileIO\_PL) -================================ - -`olefile `__ is a Python package to -parse, read and write `Microsoft OLE2 -files `__ -(also called Structured Storage, Compound File Binary Format or Compound -Document File Format), such as Microsoft Office 97-2003 documents, -vbaProject.bin in MS Office 2007+ files, Image Composer and FlashPix -files, Outlook messages, StickyNotes, several Microscopy file formats, -McAfee antivirus quarantine files, etc. - -**Quick links:** `Home page `__ - -`Download/Install `__ -- `Documentation `__ - -`Report -Issues/Suggestions/Questions `__ -- `Contact the author `__ - -`Repository `__ - `Updates -on Twitter `__ - -News ----- - -Follow all updates and news on Twitter: https://twitter.com/decalage2 - -- **2016-02-02 v0.43**: fixed issues - `#26 `__ - and - `#27 `__, - better handling of malformed files, use python logging. -- 2015-01-25 v0.42: improved handling of special characters in - stream/storage names on Python 2.x (using UTF-8 instead of Latin-1), - fixed bug in listdir with empty storages. -- 2014-11-25 v0.41: OleFileIO.open and isOleFile now support OLE files - stored in byte strings, fixed installer for python 3, added support - for Jython (Niko Ehrenfeuchter) -- 2014-10-01 v0.40: renamed OleFileIO\_PL to olefile, added initial - write support for streams >4K, updated doc and license, improved the - setup script. -- 2014-07-27 v0.31: fixed support for large files with 4K sectors, - thanks to Niko Ehrenfeuchter, Martijn Berger and Dave Jones. Added - test scripts from Pillow (by hugovk). 
Fixed setup for Python 3 - (Martin Panter) -- 2014-02-04 v0.30: now compatible with Python 3.x, thanks to Martin - Panter who did most of the hard work. -- 2013-07-24 v0.26: added methods to parse stream/storage timestamps, - improved listdir to include storages, fixed parsing of direntry - timestamps -- 2013-05-27 v0.25: improved metadata extraction, properties parsing - and exception handling, fixed `issue - #12 `__ -- 2013-05-07 v0.24: new features to extract metadata (get\_metadata - method and OleMetadata class), improved getproperties to convert - timestamps to Python datetime -- 2012-10-09: published - `python-oletools `__, a - package of analysis tools based on OleFileIO\_PL -- 2012-09-11 v0.23: added support for file-like objects, fixed `issue - #8 `__ -- 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 - (added close method) -- 2011-10-20: code hosted on bitbucket to ease contributions and bug - tracking -- 2010-01-24 v0.21: fixed support for big-endian CPUs, such as PowerPC - Macs. -- 2009-12-11 v0.20: small bugfix in OleFileIO.open when filename is not - plain str. -- 2009-12-10 v0.19: fixed support for 64 bits platforms (thanks to Ben - G. and Martijn for reporting the bug) -- see changelog in source code for more info. - -Download/Install ----------------- - -If you have pip or setuptools installed (pip is included in Python -2.7.9+), you may simply run **pip install olefile** or **easy\_install -olefile** for the first installation. - -To update olefile, run **pip install -U olefile**. 
- -Otherwise, see https://bitbucket.org/decalage/olefileio\_pl/wiki/Install - -Features --------- - -- Parse, read and write any OLE file such as Microsoft Office 97-2003 - legacy document formats (Word .doc, Excel .xls, PowerPoint .ppt, - Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook - messages, StickyNotes, Zeiss AxioVision ZVI files, Olympus FluoView - OIB files, etc -- List all the streams and storages contained in an OLE file -- Open streams as files -- Parse and read property streams, containing metadata of the file -- Portable, pure Python module, no dependency - -olefile can be used as an independent package or with PIL/Pillow. - -olefile is mostly meant for developers. If you are looking for tools to -analyze OLE files or to extract data (especially for security purposes -such as malware analysis and forensics), then please also check my -`python-oletools `__, which -are built upon olefile and provide a higher-level interface. - -History -------- - -olefile is based on the OleFileIO module from -`PIL `__, the -excellent Python Imaging Library, created and maintained by Fredrik -Lundh. The olefile API is still compatible with PIL, but since 2005 I -have improved the internal implementation significantly, with new -features, bugfixes and a more robust design. From 2005 to 2014 the -project was called OleFileIO\_PL, and in 2014 I changed its name to -olefile to celebrate its 9 years and its new write features. - -As far as I know, olefile is the most complete and robust Python -implementation to read MS OLE2 files, portable on several operating -systems. (please tell me if you know other similar Python modules) - -Since 2014 olefile/OleFileIO\_PL has been integrated into -`Pillow `__, the friendly fork of PIL. -olefile will continue to be improved as a separate project, and new -versions will be merged into Pillow regularly. 
- -Main improvements over the original version of OleFileIO in PIL: ----------------------------------------------------------------- - -- Compatible with Python 3.x and 2.6+ -- Many bug fixes -- Support for files larger than 6.8MB -- Support for 64 bits platforms and big-endian CPUs -- Robust: many checks to detect malformed files -- Runtime option to choose if malformed files should be parsed or raise - exceptions -- Improved API -- Metadata extraction, stream/storage timestamps (e.g. for document - forensics) -- Can open file-like objects -- Added setup.py and install.bat to ease installation -- More convenient slash-based syntax for stream paths -- Write features - -Documentation -------------- - -Please see the `online -documentation `__ for -more information, especially the `OLE -overview `__ -and the `API -page `__ which -describe how to use olefile in Python applications. A copy of the same -documentation is also provided in the doc subfolder of the olefile -package. - -Real-life examples ------------------- - -A real-life example: `using OleFileIO\_PL for malware analysis and -forensics `__. - -See also `this -paper `__ -about python tools for forensics, which features olefile. - -License -------- - -olefile (formerly OleFileIO\_PL) is copyright (c) 2005-2016 Philippe -Lagadec (http://www.decalage.info) - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -- Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -- Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------- - -olefile is based on source code from the OleFileIO module of the Python -Imaging Library (PIL) published by Fredrik Lundh under the following -license: - -The Python Imaging Library (PIL) is - -- Copyright (c) 1997-2005 by Secret Labs AB -- Copyright (c) 1995-2005 by Fredrik Lundh - -By obtaining, using, and/or copying this software and/or its associated -documentation, you agree that you have read, understood, and will comply -with the following terms and conditions: - -Permission to use, copy, modify, and distribute this software and its -associated documentation for any purpose and without fee is hereby -granted, provided that the above copyright notice appears in all copies, -and that both that copyright notice and this permission notice appear in -supporting documentation, and that the name of Secret Labs AB or the -author not be used in advertising or publicity pertaining to -distribution of the software without specific, written prior permission. - -SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO -THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND -FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR -ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER -RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF -CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff -Nru remnux-oletools-0.51a/oletools/thirdparty/prettytable/CHANGELOG remnux-oletools-0.51a/oletools/thirdparty/prettytable/CHANGELOG --- remnux-oletools-0.51a/oletools/thirdparty/prettytable/CHANGELOG 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/prettytable/CHANGELOG 1970-01-01 00:00:00.000000000 +0000 @@ -1,142 +0,0 @@ -########## PrettyTable 0.7 - Feb 17, 2013 ########### - -* Improved Python 2 and 3 compatibility (2.4-3.2). -* Improved support for non-Latin characters. Table widths should - now be calculated correctly for tables with e.g. Japanese text. -* Table contents can now be read in from a .csv file -* Table contents can now be read in from a DB-API compatible cursor -* Table contents can now be read in from a string containing a - HTML table (thanks to Christoph Robbert for submitting this patch!) -* new valign attribute controls vertical alignment of text when - some cells in a row have multiple lines of text and others don't. - (thanks to Google Code user maartendb for submitting this patch!) -* hrules attribute can now be set to HEADER, which draws a rule only - under the header row -* new vrules attribute controls drawing of vertical rules and can - be set to FRAME, ALL or NONE -* new header_style attribute controls formatting of text in table - headers and can be set to "cap", "title", "upper", "lower" or None -* Fixed a simple bug regarding validation of max_width (thanks to - Anthony Toole for pointing out this bug and providing a patch). -* Fixed a simple bug regarding initialisation of int_format value - for new tables (thanks to Ingo Schmiegel for pointing out this - bug!) 
-* Fixed a bug regarding some constructor keywords, such as "border", - being ignored (thanks to Google Code user antonio.s.messina for - reporting this bug). - -########## PrettyTable 0.6 - May 5, 2012 ########## - -* Code is now simultaneously compatible with Python 2 and 3 -* Replaced all setter methods with managed attributes -* All styling options can now be set persistently as managed attributes -* Added "add_style" method to make setting style options easily -* Added "del_row", "clear_rows" and "clear" methods to facilitate - removal of data from table. -* Added "copy" method to facilitate cloning of a table. -* Removed caching functionality, which added complexity and fragility - for relatively little gain -* Removed methods that just printed strings produced by get_string and - get_html_string - just use inbuilt print! -* Improved unicode support (thanks to Google Code user ru.w31rd0 for - patch!) -* Added support for decimal and floating point number formatting - support (thanks to Google Code user willfurnass for the suggestion!) -* Added support for using a custom key sorting methods (thanks to - Google Code user amannijhawan for the suggestion!) -* Added support for line breaks in data (suggested and implemented by - Klein Stephane) -* Added support for max column widths (thanks to Tibor Arpas for the - suggestion!) -* Fixed table slicing -* Fixed bug where closing tags in HTML tables were not printed - (thanks to Google Code user kehander for reporting this bug!) -* Fixed HTML table sorting bug (thanks to Google Code user dougbeal - for reporting this bug!) -* Fixed bug whereby changing field_names did not recompute widths - (thanks to Google Code user denilsonsa for reporting this bug!) - -########## PrettyTable 0.5 - May 26, 2009 ########## - -* Fixed a bug whereby printing with headers=False and border=False - would introduce an extraneous newline. Thanks to Alexander Lamaison - for reporting this bug. 
-* When printing with headers=False, column widths will now be reduced - as appropriate in columns where the field name is wider than the - data. Thanks to Alexander Lamaison for suggesting this behaviour. -* Support for Unicode has improved. Thanks to Chris Clark for - submitting this improvement. -* The value of the "border" argument now correctly controls the - presence of a border when printing HTML tables with print_html or - get_html_string, instead of being incorrectly ignored. Thanks to - Chris Clark for fixing this. -* The print_html and get_html_string methods now accept an - "attributes" argument which is a dictionary of name/value pairs to be - placed inside the tag (so you can, e.g. set class, name or id - values in order to style your table with CSS). Thanks to Chris Clark - for submitting this feature. -* The print_html and get_html_string methods now, by default, do their - best to match the various formatting options in their HTML output. - They use inline CSS to adjust the alignment of data in columns, the - padding widths of columns and in some cases the border settings. You - can give either method a "format=False" attribute to turn this - behaviour off if you want to do your own styling. With "format=False" - the methods print a "bare bones" table, similar to the default - behaviour in 0.4. - -########## PrettyTable 0.4 - May 13, 2009 ########## - -* Added "add_column" method to enable building tables up column-by-column. -* Added "print_HTML" and "get_HTML_string" methods to enable HTML table - production. -* Added "set_border_chars" method to enable control over characters used to - draw the table border. -* Added "set_left_padding" and "set_right_padding" methods to allow - independent padding control for both sides of a column. -* Added "sortby" option to enable column sorting. -* Added "header" option to enable switching off field name printing at top of - table. 
-* Modified "hrules" option to enable greater control over presence of
- horizontal lines.
-* Added "border" option to enable switching off all line printing.
-
-Thanks to Tim Cera, Chris Clark, Alexander Lamaison for suggesting and helping
-to test many of the new features in this release.
-
-########## PrettyTable 0.3 - May 01, 2009 ##########
-
-* Added "padding_width" option to control the number of spaces between the
- vertical line rules at the edges of a column and its content. This can be
- set as a keyword argument to the constructor or after instantiation using
- the "set_padding_width" method. The value is set to 1 by default. If your
- table is too wide for a small screen with this value, setting it to 0 might
- help you squeeze it in.
-
-Thanks to Chris Clark for contributing a patch against 0.2.1 to add this
-feature!
-
-########## PrettyTable 0.2.1 - April 29, 2009 ##########
-
-* Caching no longer breaks when using the "printt(fields=[...])" syntax. The
- list of fields was not hashable and hence could not be used as a dictionary
- key. I fixed this using the output of the "cPickle" module's "dumps"
- function as the dictionary key instead.
-* Horizontal lines are now the appropriate length when the above syntax is
- used.
-
-Thanks to Julien Koesten for reporting these bugs and testing the fixes almost
-immediately after the release of 0.2!
-
-########## PrettyTable 0.2 - April 29, 2009 ##########
-
-* Added "get_string" method.
-* Added "__str__" method (which just calls "get_string") to enable nice
- "print x" syntax.
-* Can now pass field names as a constructor argument.
-* Return values of "get_string" are cached in a dictionary that is only
- cleared after a call to "add_row" or something else which invalidates the
- cache.
- -########## PrettyTable 0.1 - February 26, 2009 ######### - -* Original release diff -Nru remnux-oletools-0.51a/oletools/thirdparty/prettytable/COPYING remnux-oletools-0.51a/oletools/thirdparty/prettytable/COPYING --- remnux-oletools-0.51a/oletools/thirdparty/prettytable/COPYING 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/prettytable/COPYING 1970-01-01 00:00:00.000000000 +0000 @@ -1,30 +0,0 @@ -# Copyright (c) 2009-2013 Luke Maurits -# All rights reserved. -# With contributions from: -# * Chris Clark -# * Christoph Robbert -# * Klein Stephane -# * "maartendb" -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. diff -Nru remnux-oletools-0.51a/oletools/thirdparty/prettytable/prettytable.py remnux-oletools-0.51a/oletools/thirdparty/prettytable/prettytable.py --- remnux-oletools-0.51a/oletools/thirdparty/prettytable/prettytable.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/prettytable/prettytable.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1475 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2009-2013, Luke Maurits -# All rights reserved. -# With contributions from: -# * Chris Clark -# * Klein Stephane -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - -__version__ = "0.7.2" - -import copy -import csv -import random -import re -import sys -import textwrap -import itertools -import unicodedata - -py3k = sys.version_info[0] >= 3 -if py3k: - unicode = str - basestring = str - itermap = map - iterzip = zip - uni_chr = chr - from html.parser import HTMLParser -else: - itermap = itertools.imap - iterzip = itertools.izip - uni_chr = unichr - from HTMLParser import HTMLParser - -if py3k and sys.version_info[1] >= 2: - from html import escape -else: - from cgi import escape - -# hrule styles -FRAME = 0 -ALL = 1 -NONE = 2 -HEADER = 3 - -# Table styles -DEFAULT = 10 -MSWORD_FRIENDLY = 11 -PLAIN_COLUMNS = 12 -RANDOM = 20 - -_re = re.compile("\033\[[0-9;]*m") - -def _get_size(text): - lines = text.split("\n") - height = len(lines) - width = max([_str_block_width(line) for line in lines]) - return (width, height) - -class PrettyTable(object): - - def __init__(self, field_names=None, **kwargs): - - """Return a new PrettyTable instance - - Arguments: - - encoding - Unicode encoding scheme used to decode any encoded input - field_names - list or tuple of field names - fields - list or tuple of field names to include in displays - start - index of first data row to include in output - end - index of last data row to include in output PLUS ONE (list slice style) - header - print a header showing field names (True or False) - header_style - stylisation to apply to field names in header ("cap", "title", 
"upper", "lower" or None) - border - print a border around the table (True or False) - hrules - controls printing of horizontal rules after rows. Allowed values: FRAME, HEADER, ALL, NONE - vrules - controls printing of vertical rules between columns. Allowed values: FRAME, ALL, NONE - int_format - controls formatting of integer data - float_format - controls formatting of floating point data - padding_width - number of spaces on either side of column data (only used if left and right paddings are None) - left_padding_width - number of spaces on left hand side of column data - right_padding_width - number of spaces on right hand side of column data - vertical_char - single character string used to draw vertical lines - horizontal_char - single character string used to draw horizontal lines - junction_char - single character string used to draw line junctions - sortby - name of field to sort rows by - sort_key - sorting key function, applied to data points before sorting - valign - default valign for each row (None, "t", "m" or "b") - reversesort - True or False to sort in descending or ascending order""" - - self.encoding = kwargs.get("encoding", "UTF-8") - - # Data - self._field_names = [] - self._align = {} - self._valign = {} - self._max_width = {} - self._rows = [] - if field_names: - self.field_names = field_names - else: - self._widths = [] - - # Options - self._options = "start end fields header border sortby reversesort sort_key attributes format hrules vrules".split() - self._options.extend("int_format float_format padding_width left_padding_width right_padding_width".split()) - self._options.extend("vertical_char horizontal_char junction_char header_style valign xhtml print_empty".split()) - for option in self._options: - if option in kwargs: - self._validate_option(option, kwargs[option]) - else: - kwargs[option] = None - - self._start = kwargs["start"] or 0 - self._end = kwargs["end"] or None - self._fields = kwargs["fields"] or None - - if 
kwargs["header"] in (True, False): - self._header = kwargs["header"] - else: - self._header = True - self._header_style = kwargs["header_style"] or None - if kwargs["border"] in (True, False): - self._border = kwargs["border"] - else: - self._border = True - self._hrules = kwargs["hrules"] or FRAME - self._vrules = kwargs["vrules"] or ALL - - self._sortby = kwargs["sortby"] or None - if kwargs["reversesort"] in (True, False): - self._reversesort = kwargs["reversesort"] - else: - self._reversesort = False - self._sort_key = kwargs["sort_key"] or (lambda x: x) - - self._int_format = kwargs["int_format"] or {} - self._float_format = kwargs["float_format"] or {} - self._padding_width = kwargs["padding_width"] or 1 - self._left_padding_width = kwargs["left_padding_width"] or None - self._right_padding_width = kwargs["right_padding_width"] or None - - self._vertical_char = kwargs["vertical_char"] or self._unicode("|") - self._horizontal_char = kwargs["horizontal_char"] or self._unicode("-") - self._junction_char = kwargs["junction_char"] or self._unicode("+") - - if kwargs["print_empty"] in (True, False): - self._print_empty = kwargs["print_empty"] - else: - self._print_empty = True - self._format = kwargs["format"] or False - self._xhtml = kwargs["xhtml"] or False - self._attributes = kwargs["attributes"] or {} - - def _unicode(self, value): - if not isinstance(value, basestring): - value = str(value) - if not isinstance(value, unicode): - value = unicode(value, self.encoding, "strict") - return value - - def _justify(self, text, width, align): - excess = width - _str_block_width(text) - if align == "l": - return text + excess * " " - elif align == "r": - return excess * " " + text - else: - if excess % 2: - # Uneven padding - # Put more space on right if text is of odd length... 
- if _str_block_width(text) % 2: - return (excess//2)*" " + text + (excess//2 + 1)*" " - # and more space on left if text is of even length - else: - return (excess//2 + 1)*" " + text + (excess//2)*" " - # Why distribute extra space this way? To match the behaviour of - # the inbuilt str.center() method. - else: - # Equal padding on either side - return (excess//2)*" " + text + (excess//2)*" " - - def __getattr__(self, name): - - if name == "rowcount": - return len(self._rows) - elif name == "colcount": - if self._field_names: - return len(self._field_names) - elif self._rows: - return len(self._rows[0]) - else: - return 0 - else: - raise AttributeError(name) - - def __getitem__(self, index): - - new = PrettyTable() - new.field_names = self.field_names - for attr in self._options: - setattr(new, "_"+attr, getattr(self, "_"+attr)) - setattr(new, "_align", getattr(self, "_align")) - if isinstance(index, slice): - for row in self._rows[index]: - new.add_row(row) - elif isinstance(index, int): - new.add_row(self._rows[index]) - else: - raise Exception("Index %s is invalid, must be an integer or slice" % str(index)) - return new - - if py3k: - def __str__(self): - return self.__unicode__() - else: - def __str__(self): - return self.__unicode__().encode(self.encoding) - - def __unicode__(self): - return self.get_string() - - ############################## - # ATTRIBUTE VALIDATORS # - ############################## - - # The method _validate_option is all that should be used elsewhere in the code base to validate options. - # It will call the appropriate validation method for that option. The individual validation methods should - # never need to be called directly (although nothing bad will happen if they *are*). - # Validation happens in TWO places. - # Firstly, in the property setters defined in the ATTRIBUTE MANAGMENT section. 
- # Secondly, in the _get_options method, where keyword arguments are mixed with persistent settings
-
- def _validate_option(self, option, val):
- if option in ("field_names",):
- self._validate_field_names(val)
- elif option in ("start", "end", "max_width", "padding_width", "left_padding_width", "right_padding_width", "format"):
- self._validate_nonnegative_int(option, val)
- elif option in ("sortby",):
- self._validate_field_name(option, val)
- elif option in ("sort_key",):
- self._validate_function(option, val)
- elif option in ("hrules",):
- self._validate_hrules(option, val)
- elif option in ("vrules",):
- self._validate_vrules(option, val)
- elif option in ("fields",):
- self._validate_all_field_names(option, val)
- elif option in ("header", "border", "reversesort", "xhtml", "print_empty"):
- self._validate_true_or_false(option, val)
- elif option in ("header_style",):
- self._validate_header_style(val)
- elif option in ("int_format",):
- self._validate_int_format(option, val)
- elif option in ("float_format",):
- self._validate_float_format(option, val)
- elif option in ("vertical_char", "horizontal_char", "junction_char"):
- self._validate_single_char(option, val)
- elif option in ("attributes",):
- self._validate_attributes(option, val)
- else:
- raise Exception("Unrecognised option: %s!"
% option) - - def _validate_field_names(self, val): - # Check for appropriate length - if self._field_names: - try: - assert len(val) == len(self._field_names) - except AssertionError: - raise Exception("Field name list has incorrect number of values, (actual) %d!=%d (expected)" % (len(val), len(self._field_names))) - if self._rows: - try: - assert len(val) == len(self._rows[0]) - except AssertionError: - raise Exception("Field name list has incorrect number of values, (actual) %d!=%d (expected)" % (len(val), len(self._rows[0]))) - # Check for uniqueness - try: - assert len(val) == len(set(val)) - except AssertionError: - raise Exception("Field names must be unique!") - - def _validate_header_style(self, val): - try: - assert val in ("cap", "title", "upper", "lower", None) - except AssertionError: - raise Exception("Invalid header style, use cap, title, upper, lower or None!") - - def _validate_align(self, val): - try: - assert val in ["l","c","r"] - except AssertionError: - raise Exception("Alignment %s is invalid, use l, c or r!" % val) - - def _validate_valign(self, val): - try: - assert val in ["t","m","b",None] - except AssertionError: - raise Exception("Alignment %s is invalid, use t, m, b or None!" % val) - - def _validate_nonnegative_int(self, name, val): - try: - assert int(val) >= 0 - except AssertionError: - raise Exception("Invalid value for %s: %s!" % (name, self._unicode(val))) - - def _validate_true_or_false(self, name, val): - try: - assert val in (True, False) - except AssertionError: - raise Exception("Invalid value for %s! Must be True or False." % name) - - def _validate_int_format(self, name, val): - if val == "": - return - try: - assert type(val) in (str, unicode) - assert val.isdigit() - except AssertionError: - raise Exception("Invalid value for %s! Must be an integer format string." % name) - - def _validate_float_format(self, name, val): - if val == "": - return - try: - assert type(val) in (str, unicode) - assert "." 
in val - bits = val.split(".") - assert len(bits) <= 2 - assert bits[0] == "" or bits[0].isdigit() - assert bits[1] == "" or bits[1].isdigit() - except AssertionError: - raise Exception("Invalid value for %s! Must be a float format string." % name) - - def _validate_function(self, name, val): - try: - assert hasattr(val, "__call__") - except AssertionError: - raise Exception("Invalid value for %s! Must be a function." % name) - - def _validate_hrules(self, name, val): - try: - assert val in (ALL, FRAME, HEADER, NONE) - except AssertionError: - raise Exception("Invalid value for %s! Must be ALL, FRAME, HEADER or NONE." % name) - - def _validate_vrules(self, name, val): - try: - assert val in (ALL, FRAME, NONE) - except AssertionError: - raise Exception("Invalid value for %s! Must be ALL, FRAME, or NONE." % name) - - def _validate_field_name(self, name, val): - try: - assert (val in self._field_names) or (val is None) - except AssertionError: - raise Exception("Invalid field name: %s!" % val) - - def _validate_all_field_names(self, name, val): - try: - for x in val: - self._validate_field_name(name, x) - except AssertionError: - raise Exception("fields must be a sequence of field names!") - - def _validate_single_char(self, name, val): - try: - assert _str_block_width(val) == 1 - except AssertionError: - raise Exception("Invalid value for %s! Must be a string of length 1." 
% name) - - def _validate_attributes(self, name, val): - try: - assert isinstance(val, dict) - except AssertionError: - raise Exception("attributes must be a dictionary of name/value pairs!") - - ############################## - # ATTRIBUTE MANAGEMENT # - ############################## - - def _get_field_names(self): - return self._field_names - """The names of the fields - - Arguments: - - fields - list or tuple of field names""" - def _set_field_names(self, val): - val = [self._unicode(x) for x in val] - self._validate_option("field_names", val) - if self._field_names: - old_names = self._field_names[:] - self._field_names = val - if self._align and old_names: - for old_name, new_name in zip(old_names, val): - self._align[new_name] = self._align[old_name] - for old_name in old_names: - if old_name not in self._align: - self._align.pop(old_name) - else: - for field in self._field_names: - self._align[field] = "c" - if self._valign and old_names: - for old_name, new_name in zip(old_names, val): - self._valign[new_name] = self._valign[old_name] - for old_name in old_names: - if old_name not in self._valign: - self._valign.pop(old_name) - else: - for field in self._field_names: - self._valign[field] = "t" - field_names = property(_get_field_names, _set_field_names) - - def _get_align(self): - return self._align - def _set_align(self, val): - self._validate_align(val) - for field in self._field_names: - self._align[field] = val - align = property(_get_align, _set_align) - - def _get_valign(self): - return self._valign - def _set_valign(self, val): - self._validate_valign(val) - for field in self._field_names: - self._valign[field] = val - valign = property(_get_valign, _set_valign) - - def _get_max_width(self): - return self._max_width - def _set_max_width(self, val): - self._validate_option("max_width", val) - for field in self._field_names: - self._max_width[field] = val - max_width = property(_get_max_width, _set_max_width) - - def _get_fields(self): - """List or 
tuple of field names to include in displays - - Arguments: - - fields - list or tuple of field names to include in displays""" - return self._fields - def _set_fields(self, val): - self._validate_option("fields", val) - self._fields = val - fields = property(_get_fields, _set_fields) - - def _get_start(self): - """Start index of the range of rows to print - - Arguments: - - start - index of first data row to include in output""" - return self._start - - def _set_start(self, val): - self._validate_option("start", val) - self._start = val - start = property(_get_start, _set_start) - - def _get_end(self): - """End index of the range of rows to print - - Arguments: - - end - index of last data row to include in output PLUS ONE (list slice style)""" - return self._end - def _set_end(self, val): - self._validate_option("end", val) - self._end = val - end = property(_get_end, _set_end) - - def _get_sortby(self): - """Name of field by which to sort rows - - Arguments: - - sortby - field name to sort by""" - return self._sortby - def _set_sortby(self, val): - self._validate_option("sortby", val) - self._sortby = val - sortby = property(_get_sortby, _set_sortby) - - def _get_reversesort(self): - """Controls direction of sorting (ascending vs descending) - - Arguments: - - reveresort - set to True to sort by descending order, or False to sort by ascending order""" - return self._reversesort - def _set_reversesort(self, val): - self._validate_option("reversesort", val) - self._reversesort = val - reversesort = property(_get_reversesort, _set_reversesort) - - def _get_sort_key(self): - """Sorting key function, applied to data points before sorting - - Arguments: - - sort_key - a function which takes one argument and returns something to be sorted""" - return self._sort_key - def _set_sort_key(self, val): - self._validate_option("sort_key", val) - self._sort_key = val - sort_key = property(_get_sort_key, _set_sort_key) - - def _get_header(self): - """Controls printing of table 
header with field names - - Arguments: - - header - print a header showing field names (True or False)""" - return self._header - def _set_header(self, val): - self._validate_option("header", val) - self._header = val - header = property(_get_header, _set_header) - - def _get_header_style(self): - """Controls stylisation applied to field names in header - - Arguments: - - header_style - stylisation to apply to field names in header ("cap", "title", "upper", "lower" or None)""" - return self._header_style - def _set_header_style(self, val): - self._validate_header_style(val) - self._header_style = val - header_style = property(_get_header_style, _set_header_style) - - def _get_border(self): - """Controls printing of border around table - - Arguments: - - border - print a border around the table (True or False)""" - return self._border - def _set_border(self, val): - self._validate_option("border", val) - self._border = val - border = property(_get_border, _set_border) - - def _get_hrules(self): - """Controls printing of horizontal rules after rows - - Arguments: - - hrules - horizontal rules style. Allowed values: FRAME, ALL, HEADER, NONE""" - return self._hrules - def _set_hrules(self, val): - self._validate_option("hrules", val) - self._hrules = val - hrules = property(_get_hrules, _set_hrules) - - def _get_vrules(self): - """Controls printing of vertical rules between columns - - Arguments: - - vrules - vertical rules style. 
Allowed values: FRAME, ALL, NONE""" - return self._vrules - def _set_vrules(self, val): - self._validate_option("vrules", val) - self._vrules = val - vrules = property(_get_vrules, _set_vrules) - - def _get_int_format(self): - """Controls formatting of integer data - Arguments: - - int_format - integer format string""" - return self._int_format - def _set_int_format(self, val): -# self._validate_option("int_format", val) - for field in self._field_names: - self._int_format[field] = val - int_format = property(_get_int_format, _set_int_format) - - def _get_float_format(self): - """Controls formatting of floating point data - Arguments: - - float_format - floating point format string""" - return self._float_format - def _set_float_format(self, val): -# self._validate_option("float_format", val) - for field in self._field_names: - self._float_format[field] = val - float_format = property(_get_float_format, _set_float_format) - - def _get_padding_width(self): - """The number of empty spaces between a column's edge and its content - - Arguments: - - padding_width - number of spaces, must be a positive integer""" - return self._padding_width - def _set_padding_width(self, val): - self._validate_option("padding_width", val) - self._padding_width = val - padding_width = property(_get_padding_width, _set_padding_width) - - def _get_left_padding_width(self): - """The number of empty spaces between a column's left edge and its content - - Arguments: - - left_padding - number of spaces, must be a positive integer""" - return self._left_padding_width - def _set_left_padding_width(self, val): - self._validate_option("left_padding_width", val) - self._left_padding_width = val - left_padding_width = property(_get_left_padding_width, _set_left_padding_width) - - def _get_right_padding_width(self): - """The number of empty spaces between a column's right edge and its content - - Arguments: - - right_padding - number of spaces, must be a positive integer""" - return 
self._right_padding_width - def _set_right_padding_width(self, val): - self._validate_option("right_padding_width", val) - self._right_padding_width = val - right_padding_width = property(_get_right_padding_width, _set_right_padding_width) - - def _get_vertical_char(self): - """The character used when printing table borders to draw vertical lines - - Arguments: - - vertical_char - single character string used to draw vertical lines""" - return self._vertical_char - def _set_vertical_char(self, val): - val = self._unicode(val) - self._validate_option("vertical_char", val) - self._vertical_char = val - vertical_char = property(_get_vertical_char, _set_vertical_char) - - def _get_horizontal_char(self): - """The character used when printing table borders to draw horizontal lines - - Arguments: - - horizontal_char - single character string used to draw horizontal lines""" - return self._horizontal_char - def _set_horizontal_char(self, val): - val = self._unicode(val) - self._validate_option("horizontal_char", val) - self._horizontal_char = val - horizontal_char = property(_get_horizontal_char, _set_horizontal_char) - - def _get_junction_char(self): - """The character used when printing table borders to draw line junctions - - Arguments: - - junction_char - single character string used to draw line junctions""" - return self._junction_char - def _set_junction_char(self, val): - val = self._unicode(val) - self._validate_option("junction_char", val) - self._junction_char = val - junction_char = property(_get_junction_char, _set_junction_char) - - def _get_format(self): - """Controls whether or not HTML tables are formatted to match styling options - - Arguments: - - format - True or False""" - return self._format - def _set_format(self, val): - self._validate_option("format", val) - self._format = val - format = property(_get_format, _set_format) - - def _get_print_empty(self): - """Controls whether or not empty tables produce a header and frame or just an empty string - -
Arguments: - - print_empty - True or False""" - return self._print_empty - def _set_print_empty(self, val): - self._validate_option("print_empty", val) - self._print_empty = val - print_empty = property(_get_print_empty, _set_print_empty) - - def _get_attributes(self): - """A dictionary of HTML attribute name/value pairs to be included in the
<table> tag when printing HTML - - Arguments: - - attributes - dictionary of attributes""" - return self._attributes - def _set_attributes(self, val): - self._validate_option("attributes", val) - self._attributes = val - attributes = property(_get_attributes, _set_attributes) - - ############################## - # OPTION MIXER # - ############################## - - def _get_options(self, kwargs): - - options = {} - for option in self._options: - if option in kwargs: - self._validate_option(option, kwargs[option]) - options[option] = kwargs[option] - else: - options[option] = getattr(self, "_"+option) - return options - - ############################## - # PRESET STYLE LOGIC # - ############################## - - def set_style(self, style): - - if style == DEFAULT: - self._set_default_style() - elif style == MSWORD_FRIENDLY: - self._set_msword_style() - elif style == PLAIN_COLUMNS: - self._set_columns_style() - elif style == RANDOM: - self._set_random_style() - else: - raise Exception("Invalid pre-set style!") - - def _set_default_style(self): - - self.header = True - self.border = True - self._hrules = FRAME - self._vrules = ALL - self.padding_width = 1 - self.left_padding_width = 1 - self.right_padding_width = 1 - self.vertical_char = "|" - self.horizontal_char = "-" - self.junction_char = "+" - - def _set_msword_style(self): - - self.header = True - self.border = True - self._hrules = NONE - self.padding_width = 1 - self.left_padding_width = 1 - self.right_padding_width = 1 - self.vertical_char = "|" - - def _set_columns_style(self): - - self.header = True - self.border = False - self.padding_width = 1 - self.left_padding_width = 0 - self.right_padding_width = 8 - - def _set_random_style(self): - - # Just for fun!
- self.header = random.choice((True, False)) - self.border = random.choice((True, False)) - self._hrules = random.choice((ALL, FRAME, HEADER, NONE)) - self._vrules = random.choice((ALL, FRAME, NONE)) - self.left_padding_width = random.randint(0,5) - self.right_padding_width = random.randint(0,5) - self.vertical_char = random.choice("~!@#$%^&*()_+|-=\{}[];':\",./;<>?") - self.horizontal_char = random.choice("~!@#$%^&*()_+|-=\{}[];':\",./;<>?") - self.junction_char = random.choice("~!@#$%^&*()_+|-=\{}[];':\",./;<>?") - - ############################## - # DATA INPUT METHODS # - ############################## - - def add_row(self, row): - - """Add a row to the table - - Arguments: - - row - row of data, should be a list with as many elements as the table - has fields""" - - if self._field_names and len(row) != len(self._field_names): - raise Exception("Row has incorrect number of values, (actual) %d!=%d (expected)" %(len(row),len(self._field_names))) - if not self._field_names: - self.field_names = [("Field %d" % (n+1)) for n in range(0,len(row))] - self._rows.append(list(row)) - - def del_row(self, row_index): - - """Delete a row from the table - - Arguments: - - row_index - The index of the row you want to delete. Indexing starts at 0.""" - - if row_index > len(self._rows)-1: - raise Exception("Can't delete row at index %d, table only has %d rows!" % (row_index, len(self._rows))) - del self._rows[row_index] - - def add_column(self, fieldname, column, align="c", valign="t"): - - """Add a column to the table.
- - Arguments: - - fieldname - name of the field to contain the new column of data - column - column of data, should be a list with as many elements as the - table has rows - align - desired alignment for this column - "l" for left, "c" for centre and "r" for right - valign - desired vertical alignment for new columns - "t" for top, "m" for middle and "b" for bottom""" - - if len(self._rows) in (0, len(column)): - self._validate_align(align) - self._validate_valign(valign) - self._field_names.append(fieldname) - self._align[fieldname] = align - self._valign[fieldname] = valign - for i in range(0, len(column)): - if len(self._rows) < i+1: - self._rows.append([]) - self._rows[i].append(column[i]) - else: - raise Exception("Column length %d does not match number of rows %d!" % (len(column), len(self._rows))) - - def clear_rows(self): - - """Delete all rows from the table but keep the current field names""" - - self._rows = [] - - def clear(self): - - """Delete all rows and field names from the table, maintaining nothing but styling options""" - - self._rows = [] - self._field_names = [] - self._widths = [] - - ############################## - # MISC PUBLIC METHODS # - ############################## - - def copy(self): - return copy.deepcopy(self) - - ############################## - # MISC PRIVATE METHODS # - ############################## - - def _format_value(self, field, value): - if isinstance(value, int) and field in self._int_format: - value = self._unicode(("%%%sd" % self._int_format[field]) % value) - elif isinstance(value, float) and field in self._float_format: - value = self._unicode(("%%%sf" % self._float_format[field]) % value) - return self._unicode(value) - - def _compute_widths(self, rows, options): - if options["header"]: - widths = [_get_size(field)[0] for field in self._field_names] - else: - widths = len(self.field_names) * [0] - for row in rows: - for index, value in enumerate(row): - fieldname = self.field_names[index] - if fieldname in 
self.max_width: - widths[index] = max(widths[index], min(_get_size(value)[0], self.max_width[fieldname])) - else: - widths[index] = max(widths[index], _get_size(value)[0]) - self._widths = widths - - def _get_padding_widths(self, options): - - if options["left_padding_width"] is not None: - lpad = options["left_padding_width"] - else: - lpad = options["padding_width"] - if options["right_padding_width"] is not None: - rpad = options["right_padding_width"] - else: - rpad = options["padding_width"] - return lpad, rpad - - def _get_rows(self, options): - """Return only those data rows that should be printed, based on slicing and sorting. - - Arguments: - - options - dictionary of option settings.""" - - # Make a copy of only those rows in the slice range - rows = copy.deepcopy(self._rows[options["start"]:options["end"]]) - # Sort if necessary - if options["sortby"]: - sortindex = self._field_names.index(options["sortby"]) - # Decorate - rows = [[row[sortindex]]+row for row in rows] - # Sort - rows.sort(reverse=options["reversesort"], key=options["sort_key"]) - # Undecorate - rows = [row[1:] for row in rows] - return rows - - def _format_row(self, row, options): - return [self._format_value(field, value) for (field, value) in zip(self._field_names, row)] - - def _format_rows(self, rows, options): - return [self._format_row(row, options) for row in rows] - - ############################## - # PLAIN TEXT STRING METHODS # - ############################## - - def get_string(self, **kwargs): - - """Return string representation of table in current state. - - Arguments: - - start - index of first data row to include in output - end - index of last data row to include in output PLUS ONE (list slice style) - fields - names of fields (columns) to include - header - print a header showing field names (True or False) - border - print a border around the table (True or False) - hrules - controls printing of horizontal rules after rows. 
Allowed values: ALL, FRAME, HEADER, NONE - vrules - controls printing of vertical rules between columns. Allowed values: FRAME, ALL, NONE - int_format - controls formatting of integer data - float_format - controls formatting of floating point data - padding_width - number of spaces on either side of column data (only used if left and right paddings are None) - left_padding_width - number of spaces on left hand side of column data - right_padding_width - number of spaces on right hand side of column data - vertical_char - single character string used to draw vertical lines - horizontal_char - single character string used to draw horizontal lines - junction_char - single character string used to draw line junctions - sortby - name of field to sort rows by - sort_key - sorting key function, applied to data points before sorting - reversesort - True or False to sort in descending or ascending order - print_empty - if True, stringify just the header for an empty table, if False return an empty string """ - - options = self._get_options(kwargs) - - lines = [] - - # Don't think too hard about an empty table - # Is this the desired behaviour? Maybe we should still print the header? - if self.rowcount == 0 and (not options["print_empty"] or not options["border"]): - return "" - - # Get the rows we need to print, taking into account slicing, sorting, etc.
- rows = self._get_rows(options) - - # Turn all data in all rows into Unicode, formatted as desired - formatted_rows = self._format_rows(rows, options) - - # Compute column widths - self._compute_widths(formatted_rows, options) - - # Add header or top of border - self._hrule = self._stringify_hrule(options) - if options["header"]: - lines.append(self._stringify_header(options)) - elif options["border"] and options["hrules"] in (ALL, FRAME): - lines.append(self._hrule) - - # Add rows - for row in formatted_rows: - lines.append(self._stringify_row(row, options)) - - # Add bottom of border - if options["border"] and options["hrules"] == FRAME: - lines.append(self._hrule) - - return self._unicode("\n").join(lines) - - def _stringify_hrule(self, options): - - if not options["border"]: - return "" - lpad, rpad = self._get_padding_widths(options) - if options['vrules'] in (ALL, FRAME): - bits = [options["junction_char"]] - else: - bits = [options["horizontal_char"]] - # For tables with no data or fieldnames - if not self._field_names: - bits.append(options["junction_char"]) - return "".join(bits) - for field, width in zip(self._field_names, self._widths): - if options["fields"] and field not in options["fields"]: - continue - bits.append((width+lpad+rpad)*options["horizontal_char"]) - if options['vrules'] == ALL: - bits.append(options["junction_char"]) - else: - bits.append(options["horizontal_char"]) - if options["vrules"] == FRAME: - bits.pop() - bits.append(options["junction_char"]) - return "".join(bits) - - def _stringify_header(self, options): - - bits = [] - lpad, rpad = self._get_padding_widths(options) - if options["border"]: - if options["hrules"] in (ALL, FRAME): - bits.append(self._hrule) - bits.append("\n") - if options["vrules"] in (ALL, FRAME): - bits.append(options["vertical_char"]) - else: - bits.append(" ") - # For tables with no data or field names - if not self._field_names: - if options["vrules"] in (ALL, FRAME): - 
bits.append(options["vertical_char"]) - else: - bits.append(" ") - for field, width, in zip(self._field_names, self._widths): - if options["fields"] and field not in options["fields"]: - continue - if self._header_style == "cap": - fieldname = field.capitalize() - elif self._header_style == "title": - fieldname = field.title() - elif self._header_style == "upper": - fieldname = field.upper() - elif self._header_style == "lower": - fieldname = field.lower() - else: - fieldname = field - bits.append(" " * lpad + self._justify(fieldname, width, self._align[field]) + " " * rpad) - if options["border"]: - if options["vrules"] == ALL: - bits.append(options["vertical_char"]) - else: - bits.append(" ") - # If vrules is FRAME, then we just appended a space at the end - # of the last field, when we really want a vertical character - if options["border"] and options["vrules"] == FRAME: - bits.pop() - bits.append(options["vertical_char"]) - if options["border"] and options["hrules"] != NONE: - bits.append("\n") - bits.append(self._hrule) - return "".join(bits) - - def _stringify_row(self, row, options): - - for index, field, value, width, in zip(range(0,len(row)), self._field_names, row, self._widths): - # Enforce max widths - lines = value.split("\n") - new_lines = [] - for line in lines: - if _str_block_width(line) > width: - line = textwrap.fill(line, width) - new_lines.append(line) - lines = new_lines - value = "\n".join(lines) - row[index] = value - - row_height = 0 - for c in row: - h = _get_size(c)[1] - if h > row_height: - row_height = h - - bits = [] - lpad, rpad = self._get_padding_widths(options) - for y in range(0, row_height): - bits.append([]) - if options["border"]: - if options["vrules"] in (ALL, FRAME): - bits[y].append(self.vertical_char) - else: - bits[y].append(" ") - - for field, value, width, in zip(self._field_names, row, self._widths): - - valign = self._valign[field] - lines = value.split("\n") - dHeight = row_height - len(lines) - if dHeight: - if 
valign == "m": - lines = [""] * int(dHeight / 2) + lines + [""] * (dHeight - int(dHeight / 2)) - elif valign == "b": - lines = [""] * dHeight + lines - else: - lines = lines + [""] * dHeight - - y = 0 - for l in lines: - if options["fields"] and field not in options["fields"]: - continue - - bits[y].append(" " * lpad + self._justify(l, width, self._align[field]) + " " * rpad) - if options["border"]: - if options["vrules"] == ALL: - bits[y].append(self.vertical_char) - else: - bits[y].append(" ") - y += 1 - - # If vrules is FRAME, then we just appended a space at the end - # of the last field, when we really want a vertical character - for y in range(0, row_height): - if options["border"] and options["vrules"] == FRAME: - bits[y].pop() - bits[y].append(options["vertical_char"]) - - if options["border"] and options["hrules"]== ALL: - bits[row_height-1].append("\n") - bits[row_height-1].append(self._hrule) - - for y in range(0, row_height): - bits[y] = "".join(bits[y]) - - return "\n".join(bits) - - ############################## - # HTML STRING METHODS # - ############################## - - def get_html_string(self, **kwargs): - - """Return string representation of HTML formatted version of table in current state. - - Arguments: - - start - index of first data row to include in output - end - index of last data row to include in output PLUS ONE (list slice style) - fields - names of fields (columns) to include - header - print a header showing field names (True or False) - border - print a border around the table (True or False) - hrules - controls printing of horizontal rules after rows. Allowed values: ALL, FRAME, HEADER, NONE - vrules - controls printing of vertical rules between columns. 
Allowed values: FRAME, ALL, NONE - int_format - controls formatting of integer data - float_format - controls formatting of floating point data - padding_width - number of spaces on either side of column data (only used if left and right paddings are None) - left_padding_width - number of spaces on left hand side of column data - right_padding_width - number of spaces on right hand side of column data - sortby - name of field to sort rows by - sort_key - sorting key function, applied to data points before sorting - attributes - dictionary of name/value pairs to include as HTML attributes in the
<table> tag - xhtml - print
<br/> tags if True,
<br> tags if false""" - - options = self._get_options(kwargs) - - if options["format"]: - string = self._get_formatted_html_string(options) - else: - string = self._get_simple_html_string(options) - - return string - - def _get_simple_html_string(self, options): - - lines = [] - if options["xhtml"]: - linebreak = "
<br/>" - else: - linebreak = "
<br>" - - open_tag = [] - open_tag.append("<table") - if options["attributes"]: - for attr_name in options["attributes"]: - open_tag.append(" %s=\"%s\"" % (attr_name, options["attributes"][attr_name])) - open_tag.append(">") - lines.append("".join(open_tag)) - - # Headers - if options["header"]: - lines.append("    <tr>") - for field in self._field_names: - if options["fields"] and field not in options["fields"]: - continue - lines.append("        <th>%s</th>" % escape(field).replace("\n", linebreak)) - lines.append("    </tr>") - - # Data - rows = self._get_rows(options) - formatted_rows = self._format_rows(rows, options) - for row in formatted_rows: - lines.append("    <tr>") - for field, datum in zip(self._field_names, row): - if options["fields"] and field not in options["fields"]: - continue - lines.append("        <td>%s</td>" % escape(datum).replace("\n", linebreak)) - lines.append("    </tr>") - - lines.append("</table>") - - return self._unicode("\n").join(lines) - - def _get_formatted_html_string(self, options): - - lines = [] - lpad, rpad = self._get_padding_widths(options) - if options["xhtml"]: - linebreak = "
<br/>" - else: - linebreak = "
" - - open_tag = [] - open_tag.append("") - lines.append("".join(open_tag)) - - # Headers - if options["header"]: - lines.append(" ") - for field in self._field_names: - if options["fields"] and field not in options["fields"]: - continue - lines.append(" %s" % (lpad, rpad, escape(field).replace("\n", linebreak))) - lines.append(" ") - - # Data - rows = self._get_rows(options) - formatted_rows = self._format_rows(rows, options) - aligns = [] - valigns = [] - for field in self._field_names: - aligns.append({ "l" : "left", "r" : "right", "c" : "center" }[self._align[field]]) - valigns.append({"t" : "top", "m" : "middle", "b" : "bottom"}[self._valign[field]]) - for row in formatted_rows: - lines.append(" ") - for field, datum, align, valign in zip(self._field_names, row, aligns, valigns): - if options["fields"] and field not in options["fields"]: - continue - lines.append(" %s" % (lpad, rpad, align, valign, escape(datum).replace("\n", linebreak))) - lines.append(" ") - lines.append("") - - return self._unicode("\n").join(lines) - -############################## -# UNICODE WIDTH FUNCTIONS # -############################## - -def _char_block_width(char): - # Basic Latin, which is probably the most common case - #if char in xrange(0x0021, 0x007e): - #if char >= 0x0021 and char <= 0x007e: - if 0x0021 <= char <= 0x007e: - return 1 - # Chinese, Japanese, Korean (common) - if 0x4e00 <= char <= 0x9fff: - return 2 - # Hangul - if 0xac00 <= char <= 0xd7af: - return 2 - # Combining? 
- if unicodedata.combining(uni_chr(char)): - return 0 - # Hiragana and Katakana - if 0x3040 <= char <= 0x309f or 0x30a0 <= char <= 0x30ff: - return 2 - # Full-width Latin characters - if 0xff01 <= char <= 0xff60: - return 2 - # CJK punctuation - if 0x3000 <= char <= 0x303e: - return 2 - # Backspace and delete - if char in (0x0008, 0x007f): - return -1 - # Other control characters - elif char in (0x0000, 0x001f): - return 0 - # Take a guess - return 1 - -def _str_block_width(val): - - return sum(itermap(_char_block_width, itermap(ord, _re.sub("", val)))) - -############################## -# TABLE FACTORIES # -############################## - -def from_csv(fp, field_names = None, **kwargs): - - dialect = csv.Sniffer().sniff(fp.read(1024)) - fp.seek(0) - reader = csv.reader(fp, dialect) - - table = PrettyTable(**kwargs) - if field_names: - table.field_names = field_names - else: - if py3k: - table.field_names = [x.strip() for x in next(reader)] - else: - table.field_names = [x.strip() for x in reader.next()] - - for row in reader: - table.add_row([x.strip() for x in row]) - - return table - -def from_db_cursor(cursor, **kwargs): - - if cursor.description: - table = PrettyTable(**kwargs) - table.field_names = [col[0] for col in cursor.description] - for row in cursor.fetchall(): - table.add_row(row) - return table - -class TableHandler(HTMLParser): - - def __init__(self, **kwargs): - HTMLParser.__init__(self) - self.kwargs = kwargs - self.tables = [] - self.last_row = [] - self.rows = [] - self.max_row_width = 0 - self.active = None - self.last_content = "" - self.is_last_row_header = False - - def handle_starttag(self,tag, attrs): - self.active = tag - if tag == "th": - self.is_last_row_header = True - - def handle_endtag(self,tag): - if tag in ["th", "td"]: - stripped_content = self.last_content.strip() - self.last_row.append(stripped_content) - if tag == "tr": - self.rows.append( - (self.last_row, self.is_last_row_header)) - self.max_row_width = 
max(self.max_row_width, len(self.last_row)) - self.last_row = [] - self.is_last_row_header = False - if tag == "table": - table = self.generate_table(self.rows) - self.tables.append(table) - self.rows = [] - self.last_content = " " - self.active = None - - - def handle_data(self, data): - self.last_content += data - - def generate_table(self, rows): - """ - Generates a PrettyTable object from a list of rows. - """ - table = PrettyTable(**self.kwargs) - for row in self.rows: - if len(row[0]) < self.max_row_width: - appends = self.max_row_width - len(row[0]) - for i in range(appends): - row[0].append("-") - - if row[1]: - self.make_fields_unique(row[0]) - table.field_names = row[0] - else: - table.add_row(row[0]) - return table - - def make_fields_unique(self, fields): - """ - Iterates over the row and makes each field name unique. - """ - for i in range(0, len(fields)): - for j in range(i+1, len(fields)): - if fields[i] == fields[j]: - fields[j] += "'" - -def from_html(html_code, **kwargs): - """ - Generates a list of PrettyTables from a string of HTML code. Each <table> in - the HTML becomes one PrettyTable object. - """ - - parser = TableHandler(**kwargs) - parser.feed(html_code) - return parser.tables - -def from_html_one(html_code, **kwargs): - """ - Generates a PrettyTable from a string of HTML code which contains only a - single
<table> - """ - - tables = from_html(html_code, **kwargs) - try: - assert len(tables) == 1 - except AssertionError: - raise Exception("More than one <table>
in provided HTML code! Use from_html instead.") - return tables[0] - -############################## -# MAIN (TEST FUNCTION) # -############################## - -def main(): - - x = PrettyTable(["City name", "Area", "Population", "Annual Rainfall"]) - x.sortby = "Population" - x.reversesort = True - x.int_format["Area"] = "04d" - x.float_format = "6.1f" - x.align["City name"] = "l" # Left align city names - x.add_row(["Adelaide", 1295, 1158259, 600.5]) - x.add_row(["Brisbane", 5905, 1857594, 1146.4]) - x.add_row(["Darwin", 112, 120900, 1714.7]) - x.add_row(["Hobart", 1357, 205556, 619.5]) - x.add_row(["Sydney", 2058, 4336374, 1214.8]) - x.add_row(["Melbourne", 1566, 3806092, 646.9]) - x.add_row(["Perth", 5386, 1554769, 869.4]) - print(x) - -if __name__ == "__main__": - main() diff -Nru remnux-oletools-0.51a/oletools/thirdparty/prettytable/README remnux-oletools-0.51a/oletools/thirdparty/prettytable/README --- remnux-oletools-0.51a/oletools/thirdparty/prettytable/README 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/prettytable/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,498 +0,0 @@ -TUTORIAL ON HOW TO USE THE PRETTYTABLE 0.6+ API - -*** This tutorial is distributed with PrettyTable and is meant to serve -as a "quick start" guide for the lazy or impatient. It is not an -exhaustive description of the whole API, and it is not guaranteed to be -100% up to date. For more complete and update documentation, check the -PrettyTable wiki at http://code.google.com/p/prettytable/w/list *** - -= Getting your data into (and out of) the table = - -Let's suppose you have a shiny new PrettyTable: - -from prettytable import PrettyTable -x = PrettyTable() - -and you want to put some data into it. You have a few options. - -== Row by row == - -You can add data one row at a time. 
To do this you can set the field names -first using the `field_names` attribute, and then add the rows one at a time -using the `add_row` method: - -x.field_names = ["City name", "Area", "Population", "Annual Rainfall"] -x.add_row(["Adelaide",1295, 1158259, 600.5]) -x.add_row(["Brisbane",5905, 1857594, 1146.4]) -x.add_row(["Darwin", 112, 120900, 1714.7]) -x.add_row(["Hobart", 1357, 205556, 619.5]) -x.add_row(["Sydney", 2058, 4336374, 1214.8]) -x.add_row(["Melbourne", 1566, 3806092, 646.9]) -x.add_row(["Perth", 5386, 1554769, 869.4]) - -== Column by column == - -You can add data one column at a time as well. To do this you use the -`add_column` method, which takes two arguments - a string which is the name for -the field the column you are adding corresponds to, and a list or tuple which -contains the column data: - -x.add_column("City name", -["Adelaide","Brisbane","Darwin","Hobart","Sydney","Melbourne","Perth"]) -x.add_column("Area", [1295, 5905, 112, 1357, 2058, 1566, 5386]) -x.add_column("Population", [1158259, 1857594, 120900, 205556, 4336374, 3806092, -1554769]) -x.add_column("Annual Rainfall",[600.5, 1146.4, 1714.7, 619.5, 1214.8, 646.9, -869.4]) - -== Mixing and matching == - -If you really want to, you can even mix and match `add_row` and `add_column` -and build some of your table in one way and some of it in the other. There's a -unit test which makes sure that doing things this way will always work out -nicely as if you'd done it using just one of the two approaches. Tables built -this way are kind of confusing for other people to read, though, so don't do -this unless you have a good reason. - -== Importing data from a CSV file == - -If you have your table data in a comma separated values file (.csv), you can -read this data into a PrettyTable like this: - -from prettytable import from_csv -fp = open("myfile.csv", "r") -mytable = from_csv(fp) -fp.close() - -== Importing data from an HTML string == - -If you have a string containing an HTML <table>
, you can read this data into a -PrettyTable like this: - -from prettytable import from_html -mytable = from_html(html_string) - -== Importing data from a database cursor == - -If you have your table data in a database which you can access using a library -which conforms to the Python DB-API (e.g. an SQLite database accessible using -the sqlite3 module), then you can build a PrettyTable using a cursor object, -like this: - -import sqlite3 -from prettytable import from_db_cursor - -connection = sqlite3.connect("mydb.db") -cursor = connection.cursor() -cursor.execute("SELECT field1, field2, field3 FROM my_table") -mytable = from_db_cursor(cursor) - -== Deleting data == - -There are three ways to delete data from a PrettyTable, in increasing order of -completeness: - - * The `del_row` method takes an integer index of a single row to delete. - * The `clear_rows` method takes no arguments and deletes all the rows in the -table - but keeps the field names as they were so that you can repopulate -it with the same kind of data. - * The `clear` method takes no arguments and deletes all rows and all field -names. It's not quite the same as creating a fresh table instance, though - -style related settings, discussed later, are maintained. - -= Displaying your table in ASCII form = - -PrettyTable's main goal is to let you print tables in an attractive ASCII form, -like this: - -+-----------+------+------------+-----------------+ -| City name | Area | Population | Annual Rainfall | -+-----------+------+------------+-----------------+ -| Adelaide | 1295 | 1158259 | 600.5 | -| Brisbane | 5905 | 1857594 | 1146.4 | -| Darwin | 112 | 120900 | 1714.7 | -| Hobart | 1357 | 205556 | 619.5 | -| Melbourne | 1566 | 3806092 | 646.9 | -| Perth | 5386 | 1554769 | 869.4 | -| Sydney | 2058 | 4336374 | 1214.8 | -+-----------+------+------------+-----------------+ - -You can print tables like this to `stdout` or get string representations of -them.
- -== Printing == - -To print a table in ASCII form, you can just do this: - -print x - -in Python 2.x or: - -print(x) - -in Python 3.x. - -The old x.printt() method from versions 0.5 and earlier has been removed. - -To pass options changing the look of the table, use the get_string() method -documented below: - -print x.get_string() - -== Stringing == - -If you don't want to actually print your table in ASCII form but just get a -string containing what _would_ be printed if you use "print x", you can use -the `get_string` method: - -mystring = x.get_string() - -This string is guaranteed to look exactly the same as what would be printed by -doing "print x". You can now do all the usual things you can do with a -string, like write your table to a file or insert it into a GUI. - -== Controlling which data gets displayed == - -If you like, you can restrict the output of `print x` or `x.get_string` to -only the fields or rows you like. - -The `fields` argument to these methods takes a list of field names to be -printed: - -print x.get_string(fields=["City name", "Population"]) - -gives: - -+-----------+------------+ -| City name | Population | -+-----------+------------+ -| Adelaide | 1158259 | -| Brisbane | 1857594 | -| Darwin | 120900 | -| Hobart | 205556 | -| Melbourne | 3806092 | -| Perth | 1554769 | -| Sydney | 4336374 | -+-----------+------------+ - -The `start` and `end` arguments take the index of the first and last row to -print respectively. 
Note that the indexing works like Python list slicing - to -print the 2nd, 3rd and 4th rows of the table, set `start` to 1 (the first row -is row 0, so the second is row 1) and set `end` to 4 (the index of the 4th row, -plus 1): - -print x.get_string(start=1,end=4) - -prints: - -+-----------+------+------------+-----------------+ -| City name | Area | Population | Annual Rainfall | -+-----------+------+------------+-----------------+ -| Brisbane | 5905 | 1857594 | 1146.4 | -| Darwin | 112 | 120900 | 1714.7 | -| Hobart | 1357 | 205556 | 619.5 | -+-----------+------+------------+-----------------+ - -== Changing the alignment of columns == - -By default, all columns in a table are centre aligned. - -=== All columns at once === - -You can change the alignment of all the columns in a table at once by assigning -a one character string to the `align` attribute. The allowed strings are "l", -"r" and "c" for left, right and centre alignment, respectively: - -x.align = "r" -print x - -gives: - -+-----------+------+------------+-----------------+ -| City name | Area | Population | Annual Rainfall | -+-----------+------+------------+-----------------+ -| Adelaide | 1295 | 1158259 | 600.5 | -| Brisbane | 5905 | 1857594 | 1146.4 | -| Darwin | 112 | 120900 | 1714.7 | -| Hobart | 1357 | 205556 | 619.5 | -| Melbourne | 1566 | 3806092 | 646.9 | -| Perth | 5386 | 1554769 | 869.4 | -| Sydney | 2058 | 4336374 | 1214.8 | -+-----------+------+------------+-----------------+ - -=== One column at a time === - -You can also change the alignment of individual columns based on the -corresponding field name by treating the `align` attribute as if it were a -dictionary. 
- -x.align["City name"] = "l" -x.align["Area"] = "c" -x.align["Population"] = "r" -x.align["Annual Rainfall"] = "c" -print x - -gives: - -+-----------+------+------------+-----------------+ -| City name | Area | Population | Annual Rainfall | -+-----------+------+------------+-----------------+ -| Adelaide | 1295 | 1158259 | 600.5 | -| Brisbane | 5905 | 1857594 | 1146.4 | -| Darwin | 112 | 120900 | 1714.7 | -| Hobart | 1357 | 205556 | 619.5 | -| Melbourne | 1566 | 3806092 | 646.9 | -| Perth | 5386 | 1554769 | 869.4 | -| Sydney | 2058 | 4336374 | 1214.8 | -+-----------+------+------------+-----------------+ - -== Sorting your table by a field == - -You can make sure that your ASCII tables are produced with the data sorted by -one particular field by giving `get_string` a `sortby` keyword argument, which - must be a string containing the name of one field. - -For example, to print the example table we built earlier of Australian capital -city data, so that the most populated city comes last, we can do this: - -print x.get_string(sortby="Population") - -to get - -+-----------+------+------------+-----------------+ -| City name | Area | Population | Annual Rainfall | -+-----------+------+------------+-----------------+ -| Darwin | 112 | 120900 | 1714.7 | -| Hobart | 1357 | 205556 | 619.5 | -| Adelaide | 1295 | 1158259 | 600.5 | -| Perth | 5386 | 1554769 | 869.4 | -| Brisbane | 5905 | 1857594 | 1146.4 | -| Melbourne | 1566 | 3806092 | 646.9 | -| Sydney | 2058 | 4336374 | 1214.8 | -+-----------+------+------------+-----------------+ - -If we want the most populated city to come _first_, we can also give a -`reversesort=True` argument. - -If you _always_ want your tables to be sorted in a certain way, you can make -the setting long term like this: - -x.sortby = "Population" -print x -print x -print x - -All three tables printed by this code will be sorted by population (you could -do `x.reversesort = True` as well, if you wanted). 
The behaviour will persist
-until you turn it off:
-
-x.sortby = None
-
-If you want to specify a custom sorting function, you can use the `sort_key`
-keyword argument. Pass this a function which accepts two lists of values
-and returns a negative or positive value depending on whether the first list
-should appear before or after the second one. If your table has n columns,
-each list will have n+1 elements. Each list corresponds to one row of the
-table. The first element will be whatever data is in the relevant row, in
-the column specified by the `sortby` argument. The remaining n elements
-are the data in each of the table's columns, in order, including a repeated
-instance of the data in the `sortby` column.
-
-= Changing the appearance of your table - the easy way =
-
-By default, PrettyTable produces ASCII tables that look like the ones used in
-SQL database shells. But it can print them in a variety of other formats as
-well. If the format you want to use is common, PrettyTable makes this very
-easy for you to do using the `set_style` method. If you want to produce an
-uncommon table, you'll have to work slightly harder (see later).
-
-== Setting a table style ==
-
-You can set the style for your table using the `set_style` method before any
-calls to `print` or `get_string`. Here's how to print a table in a format
-which works nicely with Microsoft Word's "Convert to table" feature:
-
-from prettytable import MSWORD_FRIENDLY
-x.set_style(MSWORD_FRIENDLY)
-print x
-
-In addition to `MSWORD_FRIENDLY` there are currently two other in-built styles
-you can use for your tables:
-
- * `DEFAULT` - The default look, used to undo any style changes you may have
-made
- * `PLAIN_COLUMN` - A borderless style that works well with command line
-programs for columnar data
-
-Other styles are likely to appear in future releases.
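The `sort_key` contract described earlier is a classic old-style comparison function over row lists. A stdlib sketch of those semantics (not PrettyTable's internals - just the list shape the text describes, with the `sortby` column's value prepended to each row), using the tutorial's city data:

```python
from functools import cmp_to_key

# Rows from the tutorial's table: (City name, Area, Population, Annual Rainfall).
rows = [
    ["Darwin", 112, 120900, 1714.7],
    ["Adelaide", 1295, 1158259, 600.5],
    ["Brisbane", 5905, 1857594, 1146.4],
]

# Build the documented shape: element 0 holds the sortby column's value
# ("Annual Rainfall", index 3), followed by every column in order - so each
# list has n+1 elements for an n-column table.
keyed = [[row[3]] + row for row in rows]

def compare(a, b):
    # Negative -> first list sorts before the second; positive -> after.
    return -1 if a[0] < b[0] else (1 if a[0] > b[0] else 0)

keyed.sort(key=cmp_to_key(compare))
ordered = [entry[1] for entry in keyed]  # recover the city names
print(ordered)  # ['Adelaide', 'Brisbane', 'Darwin']
```

Sorting by the prepended element reorders the cities from driest to wettest; a `sort_key` function receives exactly these n+1-element lists.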
- -= Changing the appearance of your table - the hard way = - -If you want to display your table in a style other than one of the in-built -styles listed above, you'll have to set things up the hard way. - -Don't worry, it's not really that hard! - -== Style options == - -PrettyTable has a number of style options which control various aspects of how -tables are displayed. You have the freedom to set each of these options -individually to whatever you prefer. The `set_style` method just does this -automatically for you. - -The options are these: - - * `border` - A boolean option (must be `True` or `False`). Controls whether - or not a border is drawn around the table. - * `header` - A boolean option (must be `True` or `False`). Controls whether - or not the first row of the table is a header showing the names of all the - fields. - * `hrules` - Controls printing of horizontal rules after rows. Allowed - values: FRAME, HEADER, ALL, NONE - note that these are variables defined - inside the `prettytable` module so make sure you import them or use - `prettytable.FRAME` etc. - * `vrules` - Controls printing of vertical rules between columns. Allowed - values: FRAME, ALL, NONE. - * `int_format` - A string which controls the way integer data is printed. - This works like: print "%d" % data - * `float_format` - A string which controls the way floating point data is - printed. This works like: print "%f" % data - * `padding_width` - Number of spaces on either side of column data (only used - if left and right paddings are None). - * `left_padding_width` - Number of spaces on left hand side of column data. - * `right_padding_width` - Number of spaces on right hand side of column data. - * `vertical_char` - Single character string used to draw vertical lines. - Default is `|`. - * `horizontal_char` - Single character string used to draw horizontal lines. - Default is `-`. - * `junction_char` - Single character string used to draw line junctions. - Default is `+`. 
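Since `horizontal_char`, `vertical_char` and `junction_char` fully determine the frame, a short sketch shows how a horizontal rule like the ones in the tables above can be assembled from them (a toy illustration of the layout, not PrettyTable's own code; the `hrule` helper and the width list are assumptions for this example):

```python
def hrule(column_widths, horizontal_char="-", junction_char="+"):
    # Each column contributes its width plus 2 padding characters,
    # bounded on both sides by junction characters.
    return junction_char + junction_char.join(
        horizontal_char * (w + 2) for w in column_widths
    ) + junction_char

# Widths taken from the "City name | Area | Population | Annual Rainfall" table.
print(hrule([9, 4, 10, 15]))
# +-----------+------+------------+-----------------+
```

Swapping the two characters (e.g. `hrule([9, 4], "=", "*")`) reproduces the effect of changing `horizontal_char` and `junction_char` on the printed frame.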
-
-You can set the style options to your own settings in two ways:
-
-== Setting style options for the long term ==
-
-If you want to print your table with a different style several times, you can
-set your option for the "long term" just by changing the appropriate
-attributes. If you never want your tables to have borders you can do this:
-
-x.border = False
-print x
-print x
-print x
-
-None of the three tables printed by this will have borders, even if you do
-things like add extra rows in between them. The lack of borders will last until
-you do:
-
-x.border = True
-
-to turn them on again. This sort of long term setting is exactly how
-`set_style` works. `set_style` just sets a bunch of attributes to pre-set
-values for you.
-
-Note that if you know what style options you want at the moment you are
-creating your table, you can specify them using keyword arguments to the
-constructor. For example, the following two code blocks are equivalent:
-
-x = PrettyTable()
-x.border = False
-x.header = False
-x.padding_width = 5
-
-x = PrettyTable(border=False, header=False, padding_width=5)
-
-== Changing style options just once ==
-
-If you don't want to make long term style changes by changing an attribute like
-in the previous section, you can make changes that last for just one
-`get_string` by giving those methods keyword arguments. To print two
-"normal" tables with one borderless table between them, you could do this:
-
-print x
-print x.get_string(border=False)
-print x
-
-= Displaying your table in HTML form =
-
-PrettyTable will also print your tables in HTML form, as `<table>`s. Just like
-in ASCII form, you can actually print your table - just use `print_html()` - or
-get a string representation - just use `get_html_string()`. HTML printing
-supports the `fields`, `start`, `end`, `sortby` and `reversesort` arguments in
-exactly the same way as ASCII printing.
-
-== Styling HTML tables ==
-
-By default, PrettyTable outputs HTML for "vanilla" tables. The HTML code is
-quite simple. It looks like this:
-
-<table>
-    <tr>
-        <th>City name</th>
-        <th>Area</th>
-        <th>Population</th>
-        <th>Annual Rainfall</th>
-    </tr>
-    <tr>
-        <td>Adelaide</td>
-        <td>1295</td>
-        <td>1158259</td>
-        <td>600.5</td>
-    </tr>
-    <tr>
-        <td>Brisbane</td>
-        <td>5905</td>
-        <td>1857594</td>
-        <td>1146.4</td>
-    </tr>
-    ...
-    ...
-    ...
-</table>
-
-If you like, you can ask PrettyTable to do its best to mimic the style options
-that your table has set using inline CSS. This is done by giving a
-`format=True` keyword argument to either the `print_html` or `get_html_string`
-methods. Note that if you _always_ want to print formatted HTML you can do:
-
-x.format = True
-
-and the setting will persist until you turn it off.
-
-Just like with ASCII tables, if you want to change the table's style for just
-one `print_html` or one `get_html_string` you can pass those methods keyword
-arguments - exactly like `print` and `get_string`.
-
-== Setting HTML attributes ==
-
-You can provide a dictionary of HTML attribute name/value pairs to the
-`print_html` and `get_html_string` methods using the `attributes` keyword
-argument. This lets you specify common HTML attributes like `name`, `id` and
-`class` that can be used for linking to your tables or customising their
-appearance using CSS. For example:
-
-x.print_html(attributes={"name":"my_table", "class":"red_table"})
-
-will print:
-
-<table name="my_table" class="red_table">
-    <tr>
-        <th>City name</th>
-        <th>Area</th>
-        <th>Population</th>
-        <th>Annual Rainfall</th>
-    </tr>
-    ...
-    ...
-    ...
-</table>
- -= Miscellaneous things = - -== Copying a table == - -You can call the `copy` method on a PrettyTable object without arguments to -return an identical independent copy of the table. - -If you want a copy of a PrettyTable object with just a subset of the rows, -you can use list slicing notation: - -new_table = old_table[0:5] diff -Nru remnux-oletools-0.51a/oletools/thirdparty/pyparsing/LICENSE remnux-oletools-0.51a/oletools/thirdparty/pyparsing/LICENSE --- remnux-oletools-0.51a/oletools/thirdparty/pyparsing/LICENSE 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/pyparsing/LICENSE 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff -Nru remnux-oletools-0.51a/oletools/thirdparty/pyparsing/pyparsing.py remnux-oletools-0.51a/oletools/thirdparty/pyparsing/pyparsing.py --- remnux-oletools-0.51a/oletools/thirdparty/pyparsing/pyparsing.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/pyparsing/pyparsing.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,3764 +0,0 @@ -# module pyparsing.py -# -# Copyright (c) 2003-2013 Paul T. McGuire -# -# Permission is hereby granted, free of charge, to any person obtaining -# a copy of this software and associated documentation files (the -# "Software"), to deal in the Software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to -# the following conditions: -# -# The above copyright notice and this permission notice shall be -# included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# - -__doc__ = \ -""" -pyparsing module - Classes and methods to define and execute parsing grammars - -The pyparsing module is an alternative approach to creating and executing simple grammars, -vs. the traditional lex/yacc approach, or the use of regular expressions. With pyparsing, you -don't need to learn a new syntax for defining grammars or matching expressions - the parsing module -provides a library of classes that you use to construct the grammar directly in Python. 
-
-Here is a program to parse "Hello, World!" (or any greeting of the form C{"<salutation>, <addressee>!"})::
-
-    from pyparsing import Word, alphas
-
-    # define grammar of a greeting
-    greet = Word( alphas ) + "," + Word( alphas ) + "!"
-
-    hello = "Hello, World!"
-    print (hello, "->", greet.parseString( hello ))
-
-The program outputs the following::
-
-    Hello, World! -> ['Hello', ',', 'World', '!']
-
-The Python representation of the grammar is quite readable, owing to the self-explanatory
-class names, and the use of '+', '|' and '^' operators.
-
-The parsed results returned from C{parseString()} can be accessed as a nested list, a dictionary, or an
-object with named attributes.
-
-The pyparsing module handles some of the problems that are typically vexing when writing text parsers:
- - extra or missing whitespace (the above program will also handle "Hello,World!", "Hello  ,  World  !", etc.)
- - quoted strings
- - embedded comments
-"""
-
-__version__ = "2.0.3"
-__versionTime__ = "16 Aug 2014 00:12"
-__author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
-
-import string
-from weakref import ref as wkref
-import copy
-import sys
-import warnings
-import re
-import sre_constants
-import collections
-import pprint
-#~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) )
-
-__all__ = [
-'And', 'CaselessKeyword', 'CaselessLiteral', 'CharsNotIn', 'Combine', 'Dict', 'Each', 'Empty',
-'FollowedBy', 'Forward', 'GoToColumn', 'Group', 'Keyword', 'LineEnd', 'LineStart', 'Literal',
-'MatchFirst', 'NoMatch', 'NotAny', 'OneOrMore', 'OnlyOnce', 'Optional', 'Or',
-'ParseBaseException', 'ParseElementEnhance', 'ParseException', 'ParseExpression', 'ParseFatalException',
-'ParseResults', 'ParseSyntaxException', 'ParserElement', 'QuotedString', 'RecursiveGrammarException',
-'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter', 'Upcase',
-'White', 'Word', 'WordEnd', 'WordStart', 'ZeroOrMore',
-'alphanums', 'alphas', 'alphas8bit', 'anyCloseTag', 
'anyOpenTag', 'cStyleComment', 'col', -'commaSeparatedList', 'commonHTMLEntity', 'countedArray', 'cppStyleComment', 'dblQuotedString', -'dblSlashComment', 'delimitedList', 'dictOf', 'downcaseTokens', 'empty', 'hexnums', -'htmlComment', 'javaStyleComment', 'keepOriginalText', 'line', 'lineEnd', 'lineStart', 'lineno', -'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral', -'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables', -'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity', -'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd', -'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute', -'indentedBlock', 'originalTextFor', 'ungroup', 'infixNotation','locatedExpr', -] - -PY_3 = sys.version.startswith('3') -if PY_3: - _MAX_INT = sys.maxsize - basestring = str - unichr = chr - _ustr = str - - # build list of single arg builtins, that can be used as parse actions - singleArgBuiltins = [sum, len, sorted, reversed, list, tuple, set, any, all, min, max] - -else: - _MAX_INT = sys.maxint - range = xrange - - def _ustr(obj): - """Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries - str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It - then < returns the unicode object | encodes it with the default encoding | ... >. - """ - if isinstance(obj,unicode): - return obj - - try: - # If this works, then _ustr(obj) has the same behaviour as str(obj), so - # it won't break any existing code. - return str(obj) - - except UnicodeEncodeError: - # The Python docs (http://docs.python.org/ref/customization.html#l2h-182) - # state that "The return value must be a string object". However, does a - # unicode object (being a subclass of basestring) count as a "string - # object"? - # If so, then return a unicode object: - return unicode(obj) - # Else encode it... but how? 
There are many choices... :) - # Replace unprintables with escape codes? - #return unicode(obj).encode(sys.getdefaultencoding(), 'backslashreplace_errors') - # Replace unprintables with question marks? - #return unicode(obj).encode(sys.getdefaultencoding(), 'replace') - # ... - - # build list of single arg builtins, tolerant of Python version, that can be used as parse actions - singleArgBuiltins = [] - import __builtin__ - for fname in "sum len sorted reversed list tuple set any all min max".split(): - try: - singleArgBuiltins.append(getattr(__builtin__,fname)) - except AttributeError: - continue - -_generatorType = type((y for y in range(1))) - -def _xml_escape(data): - """Escape &, <, >, ", ', etc. in a string of data.""" - - # ampersand must be replaced first - from_symbols = '&><"\'' - to_symbols = ('&'+s+';' for s in "amp gt lt quot apos".split()) - for from_,to_ in zip(from_symbols, to_symbols): - data = data.replace(from_, to_) - return data - -class _Constants(object): - pass - -alphas = string.ascii_lowercase + string.ascii_uppercase -nums = "0123456789" -hexnums = nums + "ABCDEFabcdef" -alphanums = alphas + nums -_bslash = chr(92) -printables = "".join(c for c in string.printable if c not in string.whitespace) - -class ParseBaseException(Exception): - """base exception class for all parsing runtime exceptions""" - # Performance tuning: we construct a *lot* of these, so keep this - # constructor as small and fast as possible - def __init__( self, pstr, loc=0, msg=None, elem=None ): - self.loc = loc - if msg is None: - self.msg = pstr - self.pstr = "" - else: - self.msg = msg - self.pstr = pstr - self.parserElement = elem - - def __getattr__( self, aname ): - """supported attributes by name are: - - lineno - returns the line number of the exception text - - col - returns the column number of the exception text - - line - returns the line containing the exception text - """ - if( aname == "lineno" ): - return lineno( self.loc, self.pstr ) - elif( aname in 
("col", "column") ): - return col( self.loc, self.pstr ) - elif( aname == "line" ): - return line( self.loc, self.pstr ) - else: - raise AttributeError(aname) - - def __str__( self ): - return "%s (at char %d), (line:%d, col:%d)" % \ - ( self.msg, self.loc, self.lineno, self.column ) - def __repr__( self ): - return _ustr(self) - def markInputline( self, markerString = ">!<" ): - """Extracts the exception line from the input string, and marks - the location of the exception with a special symbol. - """ - line_str = self.line - line_column = self.column - 1 - if markerString: - line_str = "".join((line_str[:line_column], - markerString, line_str[line_column:])) - return line_str.strip() - def __dir__(self): - return "loc msg pstr parserElement lineno col line " \ - "markInputline __str__ __repr__".split() - -class ParseException(ParseBaseException): - """exception thrown when parse expressions don't match class; - supported attributes by name are: - - lineno - returns the line number of the exception text - - col - returns the column number of the exception text - - line - returns the line containing the exception text - """ - pass - -class ParseFatalException(ParseBaseException): - """user-throwable exception thrown when inconsistent parse content - is found; stops all parsing immediately""" - pass - -class ParseSyntaxException(ParseFatalException): - """just like C{L{ParseFatalException}}, but thrown internally when an - C{L{ErrorStop}} ('-' operator) indicates that parsing is to stop immediately because - an unbacktrackable syntax error has been found""" - def __init__(self, pe): - super(ParseSyntaxException, self).__init__( - pe.pstr, pe.loc, pe.msg, pe.parserElement) - -#~ class ReparseException(ParseBaseException): - #~ """Experimental class - parse actions can raise this exception to cause - #~ pyparsing to reparse the input string: - #~ - with a modified input string, and/or - #~ - with a modified start location - #~ Set the values of the ReparseException in 
the constructor, and raise the - #~ exception in a parse action to cause pyparsing to use the new string/location. - #~ Setting the values as None causes no change to be made. - #~ """ - #~ def __init_( self, newstring, restartLoc ): - #~ self.newParseText = newstring - #~ self.reparseLoc = restartLoc - -class RecursiveGrammarException(Exception): - """exception thrown by C{validate()} if the grammar could be improperly recursive""" - def __init__( self, parseElementList ): - self.parseElementTrace = parseElementList - - def __str__( self ): - return "RecursiveGrammarException: %s" % self.parseElementTrace - -class _ParseResultsWithOffset(object): - def __init__(self,p1,p2): - self.tup = (p1,p2) - def __getitem__(self,i): - return self.tup[i] - def __repr__(self): - return repr(self.tup) - def setOffset(self,i): - self.tup = (self.tup[0],i) - -class ParseResults(object): - """Structured parse results, to provide multiple means of access to the parsed data: - - as a list (C{len(results)}) - - by list index (C{results[0], results[1]}, etc.) 
- - by attribute (C{results.}) - """ - def __new__(cls, toklist, name=None, asList=True, modal=True ): - if isinstance(toklist, cls): - return toklist - retobj = object.__new__(cls) - retobj.__doinit = True - return retobj - - # Performance tuning: we construct a *lot* of these, so keep this - # constructor as small and fast as possible - def __init__( self, toklist, name=None, asList=True, modal=True, isinstance=isinstance ): - if self.__doinit: - self.__doinit = False - self.__name = None - self.__parent = None - self.__accumNames = {} - if isinstance(toklist, list): - self.__toklist = toklist[:] - elif isinstance(toklist, _generatorType): - self.__toklist = list(toklist) - else: - self.__toklist = [toklist] - self.__tokdict = dict() - - if name is not None and name: - if not modal: - self.__accumNames[name] = 0 - if isinstance(name,int): - name = _ustr(name) # will always return a str, but use _ustr for consistency - self.__name = name - if not (isinstance(toklist, (type(None), basestring, list)) and toklist in (None,'',[])): - if isinstance(toklist,basestring): - toklist = [ toklist ] - if asList: - if isinstance(toklist,ParseResults): - self[name] = _ParseResultsWithOffset(toklist.copy(),0) - else: - self[name] = _ParseResultsWithOffset(ParseResults(toklist[0]),0) - self[name].__name = name - else: - try: - self[name] = toklist[0] - except (KeyError,TypeError,IndexError): - self[name] = toklist - - def __getitem__( self, i ): - if isinstance( i, (int,slice) ): - return self.__toklist[i] - else: - if i not in self.__accumNames: - return self.__tokdict[i][-1][0] - else: - return ParseResults([ v[0] for v in self.__tokdict[i] ]) - - def __setitem__( self, k, v, isinstance=isinstance ): - if isinstance(v,_ParseResultsWithOffset): - self.__tokdict[k] = self.__tokdict.get(k,list()) + [v] - sub = v[0] - elif isinstance(k,int): - self.__toklist[k] = v - sub = v - else: - self.__tokdict[k] = self.__tokdict.get(k,list()) + [_ParseResultsWithOffset(v,0)] - sub = v - if 
isinstance(sub,ParseResults): - sub.__parent = wkref(self) - - def __delitem__( self, i ): - if isinstance(i,(int,slice)): - mylen = len( self.__toklist ) - del self.__toklist[i] - - # convert int to slice - if isinstance(i, int): - if i < 0: - i += mylen - i = slice(i, i+1) - # get removed indices - removed = list(range(*i.indices(mylen))) - removed.reverse() - # fixup indices in token dictionary - for name in self.__tokdict: - occurrences = self.__tokdict[name] - for j in removed: - for k, (value, position) in enumerate(occurrences): - occurrences[k] = _ParseResultsWithOffset(value, position - (position > j)) - else: - del self.__tokdict[i] - - def __contains__( self, k ): - return k in self.__tokdict - - def __len__( self ): return len( self.__toklist ) - def __bool__(self): return len( self.__toklist ) > 0 - __nonzero__ = __bool__ - def __iter__( self ): return iter( self.__toklist ) - def __reversed__( self ): return iter( self.__toklist[::-1] ) - def iterkeys( self ): - """Returns all named result keys.""" - if hasattr(self.__tokdict, "iterkeys"): - return self.__tokdict.iterkeys() - else: - return iter(self.__tokdict) - - def itervalues( self ): - """Returns all named result values.""" - return (self[k] for k in self.iterkeys()) - - def iteritems( self ): - return ((k, self[k]) for k in self.iterkeys()) - - if PY_3: - keys = iterkeys - values = itervalues - items = iteritems - else: - def keys( self ): - """Returns all named result keys.""" - return list(self.iterkeys()) - - def values( self ): - """Returns all named result values.""" - return list(self.itervalues()) - - def items( self ): - """Returns all named result keys and values as a list of tuples.""" - return list(self.iteritems()) - - def haskeys( self ): - """Since keys() returns an iterator, this method is helpful in bypassing - code that looks for the existence of any defined results names.""" - return bool(self.__tokdict) - - def pop( self, *args, **kwargs): - """Removes and returns item at 
specified index (default=last). - Supports both list and dict semantics for pop(). If passed no - argument or an integer argument, it will use list semantics - and pop tokens from the list of parsed tokens. If passed a - non-integer argument (most likely a string), it will use dict - semantics and pop the corresponding value from any defined - results names. A second default return value argument is - supported, just as in dict.pop().""" - if not args: - args = [-1] - for k,v in kwargs.items(): - if k == 'default': - args = (args[0], v) - else: - raise TypeError("pop() got an unexpected keyword argument '%s'" % k) - if (isinstance(args[0], int) or - len(args) == 1 or - args[0] in self): - index = args[0] - ret = self[index] - del self[index] - return ret - else: - defaultvalue = args[1] - return defaultvalue - - def get(self, key, defaultValue=None): - """Returns named result matching the given key, or if there is no - such name, then returns the given C{defaultValue} or C{None} if no - C{defaultValue} is specified.""" - if key in self: - return self[key] - else: - return defaultValue - - def insert( self, index, insStr ): - """Inserts new element at location index in the list of parsed tokens.""" - self.__toklist.insert(index, insStr) - # fixup indices in token dictionary - for name in self.__tokdict: - occurrences = self.__tokdict[name] - for k, (value, position) in enumerate(occurrences): - occurrences[k] = _ParseResultsWithOffset(value, position + (position > index)) - - def append( self, item ): - """Add single element to end of ParseResults list of elements.""" - self.__toklist.append(item) - - def extend( self, itemseq ): - """Add sequence of elements to end of ParseResults list of elements.""" - if isinstance(itemseq, ParseResults): - self += itemseq - else: - self.__toklist.extend(itemseq) - - def clear( self ): - """Clear all elements and results names.""" - del self.__toklist[:] - self.__tokdict.clear() - - def __getattr__( self, name ): - try: - return 
self[name] - except KeyError: - return "" - - if name in self.__tokdict: - if name not in self.__accumNames: - return self.__tokdict[name][-1][0] - else: - return ParseResults([ v[0] for v in self.__tokdict[name] ]) - else: - return "" - - def __add__( self, other ): - ret = self.copy() - ret += other - return ret - - def __iadd__( self, other ): - if other.__tokdict: - offset = len(self.__toklist) - addoffset = ( lambda a: (a<0 and offset) or (a+offset) ) - otheritems = other.__tokdict.items() - otherdictitems = [(k, _ParseResultsWithOffset(v[0],addoffset(v[1])) ) - for (k,vlist) in otheritems for v in vlist] - for k,v in otherdictitems: - self[k] = v - if isinstance(v[0],ParseResults): - v[0].__parent = wkref(self) - - self.__toklist += other.__toklist - self.__accumNames.update( other.__accumNames ) - return self - - def __radd__(self, other): - if isinstance(other,int) and other == 0: - return self.copy() - - def __repr__( self ): - return "(%s, %s)" % ( repr( self.__toklist ), repr( self.__tokdict ) ) - - def __str__( self ): - out = [] - for i in self.__toklist: - if isinstance(i, ParseResults): - out.append(_ustr(i)) - else: - out.append(repr(i)) - return '[' + ', '.join(out) + ']' - - def _asStringList( self, sep='' ): - out = [] - for item in self.__toklist: - if out and sep: - out.append(sep) - if isinstance( item, ParseResults ): - out += item._asStringList() - else: - out.append( _ustr(item) ) - return out - - def asList( self ): - """Returns the parse results as a nested list of matching tokens, all converted to strings.""" - out = [] - for res in self.__toklist: - if isinstance(res,ParseResults): - out.append( res.asList() ) - else: - out.append( res ) - return out - - def asDict( self ): - """Returns the named parse results as dictionary.""" - if PY_3: - return dict( self.items() ) - else: - return dict( self.iteritems() ) - - def copy( self ): - """Returns a new copy of a C{ParseResults} object.""" - ret = ParseResults( self.__toklist ) - 
ret.__tokdict = self.__tokdict.copy()
-        ret.__parent = self.__parent
-        ret.__accumNames.update( self.__accumNames )
-        ret.__name = self.__name
-        return ret
-
-    def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ):
-        """Returns the parse results as XML. Tags are created for tokens and lists that have defined results names."""
-        nl = "\n"
-        out = []
-        namedItems = dict((v[1],k) for (k,vlist) in self.__tokdict.items()
-                                                            for v in vlist)
-        nextLevelIndent = indent + "  "
-
-        # collapse out indents if formatting is not desired
-        if not formatted:
-            indent = ""
-            nextLevelIndent = ""
-            nl = ""
-
-        selfTag = None
-        if doctag is not None:
-            selfTag = doctag
-        else:
-            if self.__name:
-                selfTag = self.__name
-
-        if not selfTag:
-            if namedItemsOnly:
-                return ""
-            else:
-                selfTag = "ITEM"
-
-        out += [ nl, indent, "<", selfTag, ">" ]
-
-        worklist = self.__toklist
-        for i,res in enumerate(worklist):
-            if isinstance(res,ParseResults):
-                if i in namedItems:
-                    out += [ res.asXML(namedItems[i],
-                                        namedItemsOnly and doctag is None,
-                                        nextLevelIndent,
-                                        formatted)]
-                else:
-                    out += [ res.asXML(None,
-                                        namedItemsOnly and doctag is None,
-                                        nextLevelIndent,
-                                        formatted)]
-            else:
-                # individual token, see if there is a name for it
-                resTag = None
-                if i in namedItems:
-                    resTag = namedItems[i]
-                if not resTag:
-                    if namedItemsOnly:
-                        continue
-                    else:
-                        resTag = "ITEM"
-                xmlBodyText = _xml_escape(_ustr(res))
-                out += [ nl, nextLevelIndent, "<", resTag, ">",
-                                xmlBodyText,
-                                                "</", resTag, ">" ]
-
-        out += [ nl, indent, "</", selfTag, ">" ]
-        return "".join(out)
-
-    def __lookup(self,sub):
-        for k,vlist in self.__tokdict.items():
-            for v,loc in vlist:
-                if sub is v:
-                    return k
-        return None
-
-    def getName(self):
-        """Returns the results name for this token expression."""
-        if self.__name:
-            return self.__name
-        elif self.__parent:
-            par = self.__parent()
-            if par:
-                return par.__lookup(self)
-            else:
-                return None
-        elif (len(self) == 1 and
-               len(self.__tokdict) == 1 and
-               
self.__tokdict.values()[0][0][1] in (0,-1)): - return self.__tokdict.keys()[0] - else: - return None - - def dump(self,indent='',depth=0): - """Diagnostic method for listing out the contents of a C{ParseResults}. - Accepts an optional C{indent} argument so that this string can be embedded - in a nested display of other data.""" - out = [] - NL = '\n' - out.append( indent+_ustr(self.asList()) ) - items = sorted(self.items()) - for k,v in items: - if out: - out.append(NL) - out.append( "%s%s- %s: " % (indent,(' '*depth), k) ) - if isinstance(v,ParseResults): - if v: - if v.haskeys(): - out.append( v.dump(indent,depth+1) ) - elif any(isinstance(vv,ParseResults) for vv in v): - for i,vv in enumerate(v): - if isinstance(vv,ParseResults): - out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth+1)),i,indent,(' '*(depth+2)),vv.dump(indent,depth+2) )) - else: - out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth+1)),i,indent,(' '*(depth+2)),_ustr(vv))) - else: - out.append(_ustr(v)) - else: - out.append(_ustr(v)) - else: - out.append(_ustr(v)) - return "".join(out) - - def pprint(self, *args, **kwargs): - """Pretty-printer for parsed results as a list, using the C{pprint} module. - Accepts additional positional or keyword args as defined for the - C{pprint.pprint} method. 
(U{http://docs.python.org/3/library/pprint.html#pprint.pprint})"""
- pprint.pprint(self.asList(), *args, **kwargs)
-
- # add support for pickle protocol
- def __getstate__(self):
- return ( self.__toklist,
- ( self.__tokdict.copy(),
- self.__parent is not None and self.__parent() or None,
- self.__accumNames,
- self.__name ) )
-
- def __setstate__(self,state):
- self.__toklist = state[0]
- (self.__tokdict,
- par,
- inAccumNames,
- self.__name) = state[1]
- self.__accumNames = {}
- self.__accumNames.update(inAccumNames)
- if par is not None:
- self.__parent = wkref(par)
- else:
- self.__parent = None
-
- def __dir__(self):
- return dir(super(ParseResults,self)) + list(self.keys())
-
-collections.MutableMapping.register(ParseResults)
-
-def col (loc,strg):
- """Returns current column within a string, counting newlines as line separators.
- The first column is number 1.
-
- Note: the default parsing behavior is to expand tabs in the input string
- before starting the parsing process. See L{I{ParserElement.parseString}<ParserElement.parseString>} for more information
- on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
- consistent view of the parsed string, the parse location, and line and column
- positions within the parsed string.
- """
- return (loc<len(strg) and strg[loc] == '\n') and 1 or loc - strg.rfind("\n", 0, loc)
-
-def lineno(loc,strg):
- """Returns current line number within a string, counting newlines as line separators.
- The first line is number 1.
-
- Note: the default parsing behavior is to expand tabs in the input string
- before starting the parsing process. See L{I{ParserElement.parseString}<ParserElement.parseString>} for more information
- on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
- consistent view of the parsed string, the parse location, and line and column
- positions within the parsed string.
- """
- return strg.count("\n",0,loc) + 1
-
-def line( loc, strg ):
- """Returns the line of text containing loc within a string, counting newlines as line separators.
- """ - lastCR = strg.rfind("\n", 0, loc) - nextCR = strg.find("\n", loc) - if nextCR >= 0: - return strg[lastCR+1:nextCR] - else: - return strg[lastCR+1:] - -def _defaultStartDebugAction( instring, loc, expr ): - print (("Match " + _ustr(expr) + " at loc " + _ustr(loc) + "(%d,%d)" % ( lineno(loc,instring), col(loc,instring) ))) - -def _defaultSuccessDebugAction( instring, startloc, endloc, expr, toks ): - print ("Matched " + _ustr(expr) + " -> " + str(toks.asList())) - -def _defaultExceptionDebugAction( instring, loc, expr, exc ): - print ("Exception raised:" + _ustr(exc)) - -def nullDebugAction(*args): - """'Do-nothing' debug action, to suppress debugging output during parsing.""" - pass - -# Only works on Python 3.x - nonlocal is toxic to Python 2 installs -#~ 'decorator to trim function calls to match the arity of the target' -#~ def _trim_arity(func, maxargs=3): - #~ if func in singleArgBuiltins: - #~ return lambda s,l,t: func(t) - #~ limit = 0 - #~ foundArity = False - #~ def wrapper(*args): - #~ nonlocal limit,foundArity - #~ while 1: - #~ try: - #~ ret = func(*args[limit:]) - #~ foundArity = True - #~ return ret - #~ except TypeError: - #~ if limit == maxargs or foundArity: - #~ raise - #~ limit += 1 - #~ continue - #~ return wrapper - -# this version is Python 2.x-3.x cross-compatible -'decorator to trim function calls to match the arity of the target' -def _trim_arity(func, maxargs=2): - if func in singleArgBuiltins: - return lambda s,l,t: func(t) - limit = [0] - foundArity = [False] - def wrapper(*args): - while 1: - try: - ret = func(*args[limit[0]:]) - foundArity[0] = True - return ret - except TypeError: - if limit[0] <= maxargs and not foundArity[0]: - limit[0] += 1 - continue - raise - return wrapper - -class ParserElement(object): - """Abstract base level parser element class.""" - DEFAULT_WHITE_CHARS = " \n\t\r" - verbose_stacktrace = False - - def setDefaultWhitespaceChars( chars ): - """Overrides the default whitespace chars - """ - 
ParserElement.DEFAULT_WHITE_CHARS = chars - setDefaultWhitespaceChars = staticmethod(setDefaultWhitespaceChars) - - def inlineLiteralsUsing(cls): - """ - Set class to be used for inclusion of string literals into a parser. - """ - ParserElement.literalStringClass = cls - inlineLiteralsUsing = staticmethod(inlineLiteralsUsing) - - def __init__( self, savelist=False ): - self.parseAction = list() - self.failAction = None - #~ self.name = "" # don't define self.name, let subclasses try/except upcall - self.strRepr = None - self.resultsName = None - self.saveAsList = savelist - self.skipWhitespace = True - self.whiteChars = ParserElement.DEFAULT_WHITE_CHARS - self.copyDefaultWhiteChars = True - self.mayReturnEmpty = False # used when checking for left-recursion - self.keepTabs = False - self.ignoreExprs = list() - self.debug = False - self.streamlined = False - self.mayIndexError = True # used to optimize exception handling for subclasses that don't advance parse index - self.errmsg = "" - self.modalResults = True # used to mark results names as modal (report only last) or cumulative (list all) - self.debugActions = ( None, None, None ) #custom debug actions - self.re = None - self.callPreparse = True # used to avoid redundant calls to preParse - self.callDuringTry = False - - def copy( self ): - """Make a copy of this C{ParserElement}. 
Useful for defining different parse actions - for the same parsing pattern, using copies of the original parse element.""" - cpy = copy.copy( self ) - cpy.parseAction = self.parseAction[:] - cpy.ignoreExprs = self.ignoreExprs[:] - if self.copyDefaultWhiteChars: - cpy.whiteChars = ParserElement.DEFAULT_WHITE_CHARS - return cpy - - def setName( self, name ): - """Define name for this expression, for use in debugging.""" - self.name = name - self.errmsg = "Expected " + self.name - if hasattr(self,"exception"): - self.exception.msg = self.errmsg - return self - - def setResultsName( self, name, listAllMatches=False ): - """Define name for referencing matching tokens as a nested attribute - of the returned parse results. - NOTE: this returns a *copy* of the original C{ParserElement} object; - this is so that the client can define a basic element, such as an - integer, and reference it in multiple places with different names. - - You can also set results names using the abbreviated syntax, - C{expr("name")} in place of C{expr.setResultsName("name")} - - see L{I{__call__}<__call__>}. - """ - newself = self.copy() - if name.endswith("*"): - name = name[:-1] - listAllMatches=True - newself.resultsName = name - newself.modalResults = not listAllMatches - return newself - - def setBreak(self,breakFlag = True): - """Method to invoke the Python pdb debugger when this element is - about to be parsed. Set C{breakFlag} to True to enable, False to - disable. - """ - if breakFlag: - _parseMethod = self._parse - def breaker(instring, loc, doActions=True, callPreParse=True): - import pdb - pdb.set_trace() - return _parseMethod( instring, loc, doActions, callPreParse ) - breaker._originalParseMethod = _parseMethod - self._parse = breaker - else: - if hasattr(self._parse,"_originalParseMethod"): - self._parse = self._parse._originalParseMethod - return self - - def setParseAction( self, *fns, **kwargs ): - """Define action to perform when successfully matching parse element definition. 
- Parse action fn is a callable method with 0-3 arguments, called as C{fn(s,loc,toks)},
- C{fn(loc,toks)}, C{fn(toks)}, or just C{fn()}, where:
- - s = the original string being parsed (see note below)
- - loc = the location of the matching substring
- - toks = a list of the matched tokens, packaged as a C{L{ParseResults}} object
- If the functions in fns modify the tokens, they can return them as the return
- value from fn, and the modified list of tokens will replace the original.
- Otherwise, fn does not need to return any value.
-
- Note: the default parsing behavior is to expand tabs in the input string
- before starting the parsing process. See L{I{parseString}<parseString>} for more information
- on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
- consistent view of the parsed string, the parse location, and line and column
- positions within the parsed string.
- """
- self.parseAction = list(map(_trim_arity, list(fns)))
- self.callDuringTry = ("callDuringTry" in kwargs and kwargs["callDuringTry"])
- return self
-
- def addParseAction( self, *fns, **kwargs ):
- """Add parse action to expression's list of parse actions. See L{I{setParseAction}<setParseAction>}."""
- self.parseAction += list(map(_trim_arity, list(fns)))
- self.callDuringTry = self.callDuringTry or ("callDuringTry" in kwargs and kwargs["callDuringTry"])
- return self
-
- def setFailAction( self, fn ):
- """Define action to perform if parsing fails at this expression.
- Fail action fn is a callable function that takes the arguments
- C{fn(s,loc,expr,err)} where:
- - s = string being parsed
- - loc = location where expression match was attempted and failed
- - expr = the parse expression that failed
- - err = the exception thrown
- The function returns no value.
It may throw C{L{ParseFatalException}} - if it is desired to stop parsing immediately.""" - self.failAction = fn - return self - - def _skipIgnorables( self, instring, loc ): - exprsFound = True - while exprsFound: - exprsFound = False - for e in self.ignoreExprs: - try: - while 1: - loc,dummy = e._parse( instring, loc ) - exprsFound = True - except ParseException: - pass - return loc - - def preParse( self, instring, loc ): - if self.ignoreExprs: - loc = self._skipIgnorables( instring, loc ) - - if self.skipWhitespace: - wt = self.whiteChars - instrlen = len(instring) - while loc < instrlen and instring[loc] in wt: - loc += 1 - - return loc - - def parseImpl( self, instring, loc, doActions=True ): - return loc, [] - - def postParse( self, instring, loc, tokenlist ): - return tokenlist - - #~ @profile - def _parseNoCache( self, instring, loc, doActions=True, callPreParse=True ): - debugging = ( self.debug ) #and doActions ) - - if debugging or self.failAction: - #~ print ("Match",self,"at loc",loc,"(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )) - if (self.debugActions[0] ): - self.debugActions[0]( instring, loc, self ) - if callPreParse and self.callPreparse: - preloc = self.preParse( instring, loc ) - else: - preloc = loc - tokensStart = preloc - try: - try: - loc,tokens = self.parseImpl( instring, preloc, doActions ) - except IndexError: - raise ParseException( instring, len(instring), self.errmsg, self ) - except ParseBaseException as err: - #~ print ("Exception raised:", err) - if self.debugActions[2]: - self.debugActions[2]( instring, tokensStart, self, err ) - if self.failAction: - self.failAction( instring, tokensStart, self, err ) - raise - else: - if callPreParse and self.callPreparse: - preloc = self.preParse( instring, loc ) - else: - preloc = loc - tokensStart = preloc - if self.mayIndexError or loc >= len(instring): - try: - loc,tokens = self.parseImpl( instring, preloc, doActions ) - except IndexError: - raise ParseException( instring, 
len(instring), self.errmsg, self ) - else: - loc,tokens = self.parseImpl( instring, preloc, doActions ) - - tokens = self.postParse( instring, loc, tokens ) - - retTokens = ParseResults( tokens, self.resultsName, asList=self.saveAsList, modal=self.modalResults ) - if self.parseAction and (doActions or self.callDuringTry): - if debugging: - try: - for fn in self.parseAction: - tokens = fn( instring, tokensStart, retTokens ) - if tokens is not None: - retTokens = ParseResults( tokens, - self.resultsName, - asList=self.saveAsList and isinstance(tokens,(ParseResults,list)), - modal=self.modalResults ) - except ParseBaseException as err: - #~ print "Exception raised in user parse action:", err - if (self.debugActions[2] ): - self.debugActions[2]( instring, tokensStart, self, err ) - raise - else: - for fn in self.parseAction: - tokens = fn( instring, tokensStart, retTokens ) - if tokens is not None: - retTokens = ParseResults( tokens, - self.resultsName, - asList=self.saveAsList and isinstance(tokens,(ParseResults,list)), - modal=self.modalResults ) - - if debugging: - #~ print ("Matched",self,"->",retTokens.asList()) - if (self.debugActions[1] ): - self.debugActions[1]( instring, tokensStart, loc, self, retTokens ) - - return loc, retTokens - - def tryParse( self, instring, loc ): - try: - return self._parse( instring, loc, doActions=False )[0] - except ParseFatalException: - raise ParseException( instring, loc, self.errmsg, self) - - # this method gets repeatedly called during backtracking with the same arguments - - # we can cache these arguments and save ourselves the trouble of re-parsing the contained expression - def _parseCache( self, instring, loc, doActions=True, callPreParse=True ): - lookup = (self,instring,loc,callPreParse,doActions) - if lookup in ParserElement._exprArgCache: - value = ParserElement._exprArgCache[ lookup ] - if isinstance(value, Exception): - raise value - return (value[0],value[1].copy()) - else: - try: - value = self._parseNoCache( 
instring, loc, doActions, callPreParse ) - ParserElement._exprArgCache[ lookup ] = (value[0],value[1].copy()) - return value - except ParseBaseException as pe: - pe.__traceback__ = None - ParserElement._exprArgCache[ lookup ] = pe - raise - - _parse = _parseNoCache - - # argument cache for optimizing repeated calls when backtracking through recursive expressions - _exprArgCache = {} - def resetCache(): - ParserElement._exprArgCache.clear() - resetCache = staticmethod(resetCache) - - _packratEnabled = False - def enablePackrat(): - """Enables "packrat" parsing, which adds memoizing to the parsing logic. - Repeated parse attempts at the same string location (which happens - often in many complex grammars) can immediately return a cached value, - instead of re-executing parsing/validating code. Memoizing is done of - both valid results and parsing exceptions. - - This speedup may break existing programs that use parse actions that - have side-effects. For this reason, packrat parsing is disabled when - you first import pyparsing. To activate the packrat feature, your - program must call the class method C{ParserElement.enablePackrat()}. If - your program uses C{psyco} to "compile as you go", you must call - C{enablePackrat} before calling C{psyco.full()}. If you do not do this, - Python will crash. For best results, call C{enablePackrat()} immediately - after importing pyparsing. - """ - if not ParserElement._packratEnabled: - ParserElement._packratEnabled = True - ParserElement._parse = ParserElement._parseCache - enablePackrat = staticmethod(enablePackrat) - - def parseString( self, instring, parseAll=False ): - """Execute the parse expression with the given string. - This is the main interface to the client code, once the complete - expression has been built. - - If you want the grammar to require that the entire input string be - successfully parsed, then set C{parseAll} to True (equivalent to ending - the grammar with C{L{StringEnd()}}). 
-
- Note: C{parseString} implicitly calls C{expandtabs()} on the input string,
- in order to report proper column numbers in parse actions.
- If the input string contains tabs and
- the grammar uses parse actions that use the C{loc} argument to index into the
- string being parsed, you can ensure you have a consistent view of the input
- string by:
- - calling C{parseWithTabs} on your grammar before calling C{parseString}
- (see L{I{parseWithTabs}<parseWithTabs>})
- - defining your parse action using the full C{(s,loc,toks)} signature, and
- referencing the input string using the parse action's C{s} argument
- - explicitly expanding the tabs in your input string before calling
- C{parseString}
- """
- ParserElement.resetCache()
- if not self.streamlined:
- self.streamline()
- #~ self.saveAsList = True
- for e in self.ignoreExprs:
- e.streamline()
- if not self.keepTabs:
- instring = instring.expandtabs()
- try:
- loc, tokens = self._parse( instring, 0 )
- if parseAll:
- loc = self.preParse( instring, loc )
- se = Empty() + StringEnd()
- se._parse( instring, loc )
- except ParseBaseException as exc:
- if ParserElement.verbose_stacktrace:
- raise
- else:
- # catch and re-raise exception from here, clears out pyparsing internal stack trace
- raise exc
- else:
- return tokens
-
- def scanString( self, instring, maxMatches=_MAX_INT, overlap=False ):
- """Scan the input string for expression matches. Each match will return the
- matching tokens, start location, and end location. May be called with optional
- C{maxMatches} argument, to clip scanning after 'n' matches are found. If
- C{overlap} is specified, then overlapping matches will be reported.
-
- Note that the start and end locations are reported relative to the string
- being parsed.
See L{I{parseString}} for more information on parsing - strings with embedded tabs.""" - if not self.streamlined: - self.streamline() - for e in self.ignoreExprs: - e.streamline() - - if not self.keepTabs: - instring = _ustr(instring).expandtabs() - instrlen = len(instring) - loc = 0 - preparseFn = self.preParse - parseFn = self._parse - ParserElement.resetCache() - matches = 0 - try: - while loc <= instrlen and matches < maxMatches: - try: - preloc = preparseFn( instring, loc ) - nextLoc,tokens = parseFn( instring, preloc, callPreParse=False ) - except ParseException: - loc = preloc+1 - else: - if nextLoc > loc: - matches += 1 - yield tokens, preloc, nextLoc - if overlap: - nextloc = preparseFn( instring, loc ) - if nextloc > loc: - loc = nextLoc - else: - loc += 1 - else: - loc = nextLoc - else: - loc = preloc+1 - except ParseBaseException as exc: - if ParserElement.verbose_stacktrace: - raise - else: - # catch and re-raise exception from here, clears out pyparsing internal stack trace - raise exc - - def transformString( self, instring ): - """Extension to C{L{scanString}}, to modify matching text with modified tokens that may - be returned from a parse action. To use C{transformString}, define a grammar and - attach a parse action to it that modifies the returned token list. - Invoking C{transformString()} on a target string will then scan for matches, - and replace the matched text patterns according to the logic in the parse - action. 
C{transformString()} returns the resulting transformed string.""" - out = [] - lastE = 0 - # force preservation of s, to minimize unwanted transformation of string, and to - # keep string locs straight between transformString and scanString - self.keepTabs = True - try: - for t,s,e in self.scanString( instring ): - out.append( instring[lastE:s] ) - if t: - if isinstance(t,ParseResults): - out += t.asList() - elif isinstance(t,list): - out += t - else: - out.append(t) - lastE = e - out.append(instring[lastE:]) - out = [o for o in out if o] - return "".join(map(_ustr,_flatten(out))) - except ParseBaseException as exc: - if ParserElement.verbose_stacktrace: - raise - else: - # catch and re-raise exception from here, clears out pyparsing internal stack trace - raise exc - - def searchString( self, instring, maxMatches=_MAX_INT ): - """Another extension to C{L{scanString}}, simplifying the access to the tokens found - to match the given parse expression. May be called with optional - C{maxMatches} argument, to clip searching after 'n' matches are found. 
- """ - try: - return ParseResults([ t for t,s,e in self.scanString( instring, maxMatches ) ]) - except ParseBaseException as exc: - if ParserElement.verbose_stacktrace: - raise - else: - # catch and re-raise exception from here, clears out pyparsing internal stack trace - raise exc - - def __add__(self, other ): - """Implementation of + operator - returns C{L{And}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return And( [ self, other ] ) - - def __radd__(self, other ): - """Implementation of + operator when left operand is not a C{L{ParserElement}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return other + self - - def __sub__(self, other): - """Implementation of - operator, returns C{L{And}} with error stop""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return And( [ self, And._ErrorStop(), other ] ) - - def __rsub__(self, other ): - """Implementation of - operator when left operand is not a C{L{ParserElement}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return other - self - - def __mul__(self,other): - """Implementation of * operator, allows use of C{expr * 3} in place of - 
C{expr + expr + expr}. Expressions may also be multiplied by a 2-integer
- tuple, similar to C{{min,max}} multipliers in regular expressions. Tuples
- may also include C{None} as in:
- - C{expr*(n,None)} or C{expr*(n,)} is equivalent
- to C{expr*n + L{ZeroOrMore}(expr)}
- (read as "at least n instances of C{expr}")
- - C{expr*(None,n)} is equivalent to C{expr*(0,n)}
- (read as "0 to n instances of C{expr}")
- - C{expr*(None,None)} is equivalent to C{L{ZeroOrMore}(expr)}
- - C{expr*(1,None)} is equivalent to C{L{OneOrMore}(expr)}
-
- Note that C{expr*(None,n)} does not raise an exception if
- more than n exprs exist in the input stream; that is,
- C{expr*(None,n)} does not enforce a maximum number of expr
- occurrences. If this behavior is desired, then write
- C{expr*(None,n) + ~expr}
-
- """
- if isinstance(other,int):
- minElements, optElements = other,0
- elif isinstance(other,tuple):
- other = (other + (None, None))[:2]
- if other[0] is None:
- other = (0, other[1])
- if isinstance(other[0],int) and other[1] is None:
- if other[0] == 0:
- return ZeroOrMore(self)
- if other[0] == 1:
- return OneOrMore(self)
- else:
- return self*other[0] + ZeroOrMore(self)
- elif isinstance(other[0],int) and isinstance(other[1],int):
- minElements, optElements = other
- optElements -= minElements
- else:
- raise TypeError("cannot multiply 'ParserElement' and ('%s','%s') objects", type(other[0]),type(other[1]))
- else:
- raise TypeError("cannot multiply 'ParserElement' and '%s' objects", type(other))
-
- if minElements < 0:
- raise ValueError("cannot multiply ParserElement by negative value")
- if optElements < 0:
- raise ValueError("second tuple value must be greater or equal to first tuple value")
- if minElements == optElements == 0:
- raise ValueError("cannot multiply ParserElement by 0 or (0,0)")
-
- if (optElements):
- def makeOptionalList(n):
- if n>1:
- return Optional(self + makeOptionalList(n-1))
- else:
- return Optional(self)
- if minElements:
- if minElements == 1:
- 
ret = self + makeOptionalList(optElements) - else: - ret = And([self]*minElements) + makeOptionalList(optElements) - else: - ret = makeOptionalList(optElements) - else: - if minElements == 1: - ret = self - else: - ret = And([self]*minElements) - return ret - - def __rmul__(self, other): - return self.__mul__(other) - - def __or__(self, other ): - """Implementation of | operator - returns C{L{MatchFirst}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return MatchFirst( [ self, other ] ) - - def __ror__(self, other ): - """Implementation of | operator when left operand is not a C{L{ParserElement}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return other | self - - def __xor__(self, other ): - """Implementation of ^ operator - returns C{L{Or}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return Or( [ self, other ] ) - - def __rxor__(self, other ): - """Implementation of ^ operator when left operand is not a C{L{ParserElement}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return other ^ self - - def __and__(self, other ): - """Implementation of & operator - returns C{L{Each}}""" - 
if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return Each( [ self, other ] ) - - def __rand__(self, other ): - """Implementation of & operator when left operand is not a C{L{ParserElement}}""" - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - if not isinstance( other, ParserElement ): - warnings.warn("Cannot combine element of type %s with ParserElement" % type(other), - SyntaxWarning, stacklevel=2) - return None - return other & self - - def __invert__( self ): - """Implementation of ~ operator - returns C{L{NotAny}}""" - return NotAny( self ) - - def __call__(self, name=None): - """Shortcut for C{L{setResultsName}}, with C{listAllMatches=default}:: - userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno") - could be written as:: - userdata = Word(alphas)("name") + Word(nums+"-")("socsecno") - - If C{name} is given with a trailing C{'*'} character, then C{listAllMatches} will be - passed as C{True}. - - If C{name} is omitted, same as calling C{L{copy}}. - """ - if name is not None: - return self.setResultsName(name) - else: - return self.copy() - - def suppress( self ): - """Suppresses the output of this C{ParserElement}; useful to keep punctuation from - cluttering up returned output. - """ - return Suppress( self ) - - def leaveWhitespace( self ): - """Disables the skipping of whitespace before matching the characters in the - C{ParserElement}'s defined pattern. This is normally only used internally by - the pyparsing module, but may be needed in some whitespace-sensitive grammars. 
- """
- self.skipWhitespace = False
- return self
-
- def setWhitespaceChars( self, chars ):
- """Overrides the default whitespace chars
- """
- self.skipWhitespace = True
- self.whiteChars = chars
- self.copyDefaultWhiteChars = False
- return self
-
- def parseWithTabs( self ):
- """Overrides default behavior to expand C{<TAB>}s to spaces before parsing the input string.
- Must be called before C{parseString} when the input grammar contains elements that
- match C{<TAB>} characters."""
- self.keepTabs = True
- return self
-
- def ignore( self, other ):
- """Define expression to be ignored (e.g., comments) while doing pattern
- matching; may be called repeatedly, to define multiple comment or other
- ignorable patterns.
- """
- if isinstance( other, Suppress ):
- if other not in self.ignoreExprs:
- self.ignoreExprs.append( other.copy() )
- else:
- self.ignoreExprs.append( Suppress( other.copy() ) )
- return self
-
- def setDebugActions( self, startAction, successAction, exceptionAction ):
- """Enable display of debugging messages while doing pattern matching."""
- self.debugActions = (startAction or _defaultStartDebugAction,
- successAction or _defaultSuccessDebugAction,
- exceptionAction or _defaultExceptionDebugAction)
- self.debug = True
- return self
-
- def setDebug( self, flag=True ):
- """Enable display of debugging messages while doing pattern matching.
- Set C{flag} to True to enable, False to disable.""" - if flag: - self.setDebugActions( _defaultStartDebugAction, _defaultSuccessDebugAction, _defaultExceptionDebugAction ) - else: - self.debug = False - return self - - def __str__( self ): - return self.name - - def __repr__( self ): - return _ustr(self) - - def streamline( self ): - self.streamlined = True - self.strRepr = None - return self - - def checkRecursion( self, parseElementList ): - pass - - def validate( self, validateTrace=[] ): - """Check defined expressions for valid structure, check for infinite recursive definitions.""" - self.checkRecursion( [] ) - - def parseFile( self, file_or_filename, parseAll=False ): - """Execute the parse expression on the given file or filename. - If a filename is specified (instead of a file object), - the entire file is opened, read, and closed before parsing. - """ - try: - file_contents = file_or_filename.read() - except AttributeError: - f = open(file_or_filename, "r") - file_contents = f.read() - f.close() - try: - return self.parseString(file_contents, parseAll) - except ParseBaseException as exc: - if ParserElement.verbose_stacktrace: - raise - else: - # catch and re-raise exception from here, clears out pyparsing internal stack trace - raise exc - - def __eq__(self,other): - if isinstance(other, ParserElement): - return self is other or self.__dict__ == other.__dict__ - elif isinstance(other, basestring): - try: - self.parseString(_ustr(other), parseAll=True) - return True - except ParseBaseException: - return False - else: - return super(ParserElement,self)==other - - def __ne__(self,other): - return not (self == other) - - def __hash__(self): - return hash(id(self)) - - def __req__(self,other): - return self == other - - def __rne__(self,other): - return not (self == other) - - -class Token(ParserElement): - """Abstract C{ParserElement} subclass, for defining atomic matching patterns.""" - def __init__( self ): - super(Token,self).__init__( savelist=False ) - 
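The `__eq__` override defined above lets a built expression be compared directly against a candidate string: the comparison attempts a full parse of that string (the equivalent of `parseString` with `parseAll=True`) and returns True only if the whole string matches. A minimal sketch, assuming the pyparsing package itself is importable:

```python
# Sketch of ParserElement.__eq__ in use: comparing an expression to a string
# attempts a complete parse of that string, returning True on success and
# False when the parse fails or leaves unconsumed input.
from pyparsing import Word, nums

integer = Word(nums)            # one or more digit characters

assert integer == "123"         # whole string parses -> True
assert not (integer == "12a")   # trailing "a" is not consumed -> False
```

This shortcut is convenient in unit tests, where a grammar fragment can be asserted against sample inputs without building a full parse-result comparison.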
- def setName(self, name): - s = super(Token,self).setName(name) - self.errmsg = "Expected " + self.name - return s - - -class Empty(Token): - """An empty token, will always match.""" - def __init__( self ): - super(Empty,self).__init__() - self.name = "Empty" - self.mayReturnEmpty = True - self.mayIndexError = False - - -class NoMatch(Token): - """A token that will never match.""" - def __init__( self ): - super(NoMatch,self).__init__() - self.name = "NoMatch" - self.mayReturnEmpty = True - self.mayIndexError = False - self.errmsg = "Unmatchable token" - - def parseImpl( self, instring, loc, doActions=True ): - raise ParseException(instring, loc, self.errmsg, self) - - -class Literal(Token): - """Token to exactly match a specified string.""" - def __init__( self, matchString ): - super(Literal,self).__init__() - self.match = matchString - self.matchLen = len(matchString) - try: - self.firstMatchChar = matchString[0] - except IndexError: - warnings.warn("null string passed to Literal; use Empty() instead", - SyntaxWarning, stacklevel=2) - self.__class__ = Empty - self.name = '"%s"' % _ustr(self.match) - self.errmsg = "Expected " + self.name - self.mayReturnEmpty = False - self.mayIndexError = False - - # Performance tuning: this routine gets called a *lot* - # if this is a single character match string and the first character matches, - # short-circuit as quickly as possible, and avoid calling startswith - #~ @profile - def parseImpl( self, instring, loc, doActions=True ): - if (instring[loc] == self.firstMatchChar and - (self.matchLen==1 or instring.startswith(self.match,loc)) ): - return loc+self.matchLen, self.match - raise ParseException(instring, loc, self.errmsg, self) -_L = Literal -ParserElement.literalStringClass = Literal - -class Keyword(Token): - """Token to exactly match a specified string as a keyword, that is, it must be - immediately followed by a non-keyword character. 
Compare with C{L{Literal}}:: - Literal("if") will match the leading C{'if'} in C{'ifAndOnlyIf'}. - Keyword("if") will not; it will only match the leading C{'if'} in C{'if x=1'}, or C{'if(y==2)'} - Accepts two optional constructor arguments in addition to the keyword string: - C{identChars} is a string of characters that would be valid identifier characters, - defaulting to all alphanumerics + "_" and "$"; C{caseless} allows case-insensitive - matching, default is C{False}. - """ - DEFAULT_KEYWORD_CHARS = alphanums+"_$" - - def __init__( self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=False ): - super(Keyword,self).__init__() - self.match = matchString - self.matchLen = len(matchString) - try: - self.firstMatchChar = matchString[0] - except IndexError: - warnings.warn("null string passed to Keyword; use Empty() instead", - SyntaxWarning, stacklevel=2) - self.name = '"%s"' % self.match - self.errmsg = "Expected " + self.name - self.mayReturnEmpty = False - self.mayIndexError = False - self.caseless = caseless - if caseless: - self.caselessmatch = matchString.upper() - identChars = identChars.upper() - self.identChars = set(identChars) - - def parseImpl( self, instring, loc, doActions=True ): - if self.caseless: - if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) and - (loc == 0 or instring[loc-1].upper() not in self.identChars) ): - return loc+self.matchLen, self.match - else: - if (instring[loc] == self.firstMatchChar and - (self.matchLen==1 or instring.startswith(self.match,loc)) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen] not in self.identChars) and - (loc == 0 or instring[loc-1] not in self.identChars) ): - return loc+self.matchLen, self.match - raise ParseException(instring, loc, self.errmsg, self) - - def copy(self): - c = super(Keyword,self).copy() - c.identChars = Keyword.DEFAULT_KEYWORD_CHARS 
- return c - - def setDefaultKeywordChars( chars ): - """Overrides the default Keyword chars - """ - Keyword.DEFAULT_KEYWORD_CHARS = chars - setDefaultKeywordChars = staticmethod(setDefaultKeywordChars) - -class CaselessLiteral(Literal): - """Token to match a specified string, ignoring case of letters. - Note: the matched results will always be in the case of the given - match string, NOT the case of the input text. - """ - def __init__( self, matchString ): - super(CaselessLiteral,self).__init__( matchString.upper() ) - # Preserve the defining literal. - self.returnString = matchString - self.name = "'%s'" % self.returnString - self.errmsg = "Expected " + self.name - - def parseImpl( self, instring, loc, doActions=True ): - if instring[ loc:loc+self.matchLen ].upper() == self.match: - return loc+self.matchLen, self.returnString - raise ParseException(instring, loc, self.errmsg, self) - -class CaselessKeyword(Keyword): - def __init__( self, matchString, identChars=Keyword.DEFAULT_KEYWORD_CHARS ): - super(CaselessKeyword,self).__init__( matchString, identChars, caseless=True ) - - def parseImpl( self, instring, loc, doActions=True ): - if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and - (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) ): - return loc+self.matchLen, self.match - raise ParseException(instring, loc, self.errmsg, self) - -class Word(Token): - """Token for matching words composed of allowed character sets. - Defined with string containing all allowed initial characters, - an optional string containing allowed body characters (if omitted, - defaults to the initial character set), and an optional minimum, - maximum, and/or exact length. The default value for C{min} is 1 (a - minimum value < 1 is not valid); the default values for C{max} and C{exact} - are 0, meaning no maximum or exact length restriction. 
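For reference, the `Literal`/`Keyword`/`CaselessKeyword` distinction documented in the hunk above can be exercised against an installed copy of pyparsing (a minimal sketch, assuming the pyparsing package is available; the sample inputs are illustrative):

```python
from pyparsing import CaselessKeyword, Keyword, Literal, ParseException

# Literal matches wherever the characters line up, even inside a longer identifier.
lit_result = Literal("if").parseString("ifAndOnlyIf")[0]

# Keyword also requires that the next character is not a keyword character.
kw_result = Keyword("if").parseString("if(y==2)")[0]
try:
    Keyword("if").parseString("ifAndOnlyIf")
    kw_rejected = False
except ParseException:
    kw_rejected = True

# CaselessKeyword matches case-insensitively but returns the defining string.
ck_result = CaselessKeyword("select").parseString("SELECT *")[0]

print(lit_result, kw_result, kw_rejected, ck_result)
```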
An optional - C{exclude} parameter can list characters that might be found in - the input C{bodyChars} string; useful to define a word of all printables - except for one or two characters, for instance. - """ - def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False, excludeChars=None ): - super(Word,self).__init__() - if excludeChars: - initChars = ''.join(c for c in initChars if c not in excludeChars) - if bodyChars: - bodyChars = ''.join(c for c in bodyChars if c not in excludeChars) - self.initCharsOrig = initChars - self.initChars = set(initChars) - if bodyChars : - self.bodyCharsOrig = bodyChars - self.bodyChars = set(bodyChars) - else: - self.bodyCharsOrig = initChars - self.bodyChars = set(initChars) - - self.maxSpecified = max > 0 - - if min < 1: - raise ValueError("cannot specify a minimum length < 1; use Optional(Word()) if zero-length word is permitted") - - self.minLen = min - - if max > 0: - self.maxLen = max - else: - self.maxLen = _MAX_INT - - if exact > 0: - self.maxLen = exact - self.minLen = exact - - self.name = _ustr(self) - self.errmsg = "Expected " + self.name - self.mayIndexError = False - self.asKeyword = asKeyword - - if ' ' not in self.initCharsOrig+self.bodyCharsOrig and (min==1 and max==0 and exact==0): - if self.bodyCharsOrig == self.initCharsOrig: - self.reString = "[%s]+" % _escapeRegexRangeChars(self.initCharsOrig) - elif len(self.bodyCharsOrig) == 1: - self.reString = "%s[%s]*" % \ - (re.escape(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) - else: - self.reString = "[%s][%s]*" % \ - (_escapeRegexRangeChars(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) - if self.asKeyword: - self.reString = r"\b"+self.reString+r"\b" - try: - self.re = re.compile( self.reString ) - except: - self.re = None - - def parseImpl( self, instring, loc, doActions=True ): - if self.re: - result = self.re.match(instring,loc) - if not result: - raise ParseException(instring, loc, 
self.errmsg, self) - - loc = result.end() - return loc, result.group() - - if not(instring[ loc ] in self.initChars): - raise ParseException(instring, loc, self.errmsg, self) - - start = loc - loc += 1 - instrlen = len(instring) - bodychars = self.bodyChars - maxloc = start + self.maxLen - maxloc = min( maxloc, instrlen ) - while loc < maxloc and instring[loc] in bodychars: - loc += 1 - - throwException = False - if loc - start < self.minLen: - throwException = True - if self.maxSpecified and loc < instrlen and instring[loc] in bodychars: - throwException = True - if self.asKeyword: - if (start>0 and instring[start-1] in bodychars) or (loc<instrlen and instring[loc] in bodychars): - throwException = True - - if throwException: - raise ParseException(instring, loc, self.errmsg, self) - - return loc, instring[start:loc] - - def __str__( self ): - try: - return super(Word,self).__str__() - except: - pass - - if self.strRepr is None: - - def charsAsStr(s): - if len(s)>4: - return s[:4]+"..." - else: - return s - - if ( self.initCharsOrig != self.bodyCharsOrig ): - self.strRepr = "W:(%s,%s)" % ( charsAsStr(self.initCharsOrig), charsAsStr(self.bodyCharsOrig) ) - else: - self.strRepr = "W:(%s)" % charsAsStr(self.initCharsOrig) - - return self.strRepr - - -class Regex(Token): - """Token for matching strings that match a given regular expression. - Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module. - """ - compiledREtype = type(re.compile("[A-Z]")) - def __init__( self, pattern, flags=0): - """The parameters C{pattern} and C{flags} are passed to the C{re.compile()} function as-is.
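The `Word` constructor arguments described in the hunk above (separate initial/body character sets, plus `min`/`max`/`exact` length restrictions) can be sketched as follows (assumes the pyparsing package is installed; the parser names are illustrative):

```python
from pyparsing import Word, alphas, nums

integer = Word(nums)                                  # initial chars only: digits
identifier = Word(alphas + "_", alphas + nums + "_")  # distinct init and body sets
year = Word(nums, exact=4)                            # exact length restriction

int_tok = integer.parseString("123 abc")[0]
ident_tok = identifier.parseString("foo_1 = 2")[0]
year_tok = year.parseString("20161104")[0]
print(int_tok, ident_tok, year_tok)
```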
See the Python C{re} module for an explanation of the acceptable patterns and flags.""" - super(Regex,self).__init__() - - if isinstance(pattern, basestring): - if len(pattern) == 0: - warnings.warn("null string passed to Regex; use Empty() instead", - SyntaxWarning, stacklevel=2) - - self.pattern = pattern - self.flags = flags - - try: - self.re = re.compile(self.pattern, self.flags) - self.reString = self.pattern - except sre_constants.error: - warnings.warn("invalid pattern (%s) passed to Regex" % pattern, - SyntaxWarning, stacklevel=2) - raise - - elif isinstance(pattern, Regex.compiledREtype): - self.re = pattern - self.pattern = \ - self.reString = str(pattern) - self.flags = flags - - else: - raise ValueError("Regex may only be constructed with a string or a compiled RE object") - - self.name = _ustr(self) - self.errmsg = "Expected " + self.name - self.mayIndexError = False - self.mayReturnEmpty = True - - def parseImpl( self, instring, loc, doActions=True ): - result = self.re.match(instring,loc) - if not result: - raise ParseException(instring, loc, self.errmsg, self) - - loc = result.end() - d = result.groupdict() - ret = ParseResults(result.group()) - if d: - for k in d: - ret[k] = d[k] - return loc,ret - - def __str__( self ): - try: - return super(Regex,self).__str__() - except: - pass - - if self.strRepr is None: - self.strRepr = "Re:(%s)" % repr(self.pattern) - - return self.strRepr - - -class QuotedString(Token): - """Token for matching strings that are delimited by quoting characters. 
- """ - def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unquoteResults=True, endQuoteChar=None): - """ - Defined with the following parameters: - - quoteChar - string of one or more characters defining the quote delimiting string - - escChar - character to escape quotes, typically backslash (default=None) - - escQuote - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None) - - multiline - boolean indicating whether quotes can span multiple lines (default=C{False}) - - unquoteResults - boolean indicating whether the matched text should be unquoted (default=C{True}) - - endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=C{None} => same as quoteChar) - """ - super(QuotedString,self).__init__() - - # remove white space from quote chars - wont work anyway - quoteChar = quoteChar.strip() - if len(quoteChar) == 0: - warnings.warn("quoteChar cannot be the empty string",SyntaxWarning,stacklevel=2) - raise SyntaxError() - - if endQuoteChar is None: - endQuoteChar = quoteChar - else: - endQuoteChar = endQuoteChar.strip() - if len(endQuoteChar) == 0: - warnings.warn("endQuoteChar cannot be the empty string",SyntaxWarning,stacklevel=2) - raise SyntaxError() - - self.quoteChar = quoteChar - self.quoteCharLen = len(quoteChar) - self.firstQuoteChar = quoteChar[0] - self.endQuoteChar = endQuoteChar - self.endQuoteCharLen = len(endQuoteChar) - self.escChar = escChar - self.escQuote = escQuote - self.unquoteResults = unquoteResults - - if multiline: - self.flags = re.MULTILINE | re.DOTALL - self.pattern = r'%s(?:[^%s%s]' % \ - ( re.escape(self.quoteChar), - _escapeRegexRangeChars(self.endQuoteChar[0]), - (escChar is not None and _escapeRegexRangeChars(escChar) or '') ) - else: - self.flags = 0 - self.pattern = r'%s(?:[^%s\n\r%s]' % \ - ( re.escape(self.quoteChar), - _escapeRegexRangeChars(self.endQuoteChar[0]), - (escChar is not None and 
_escapeRegexRangeChars(escChar) or '') ) - if len(self.endQuoteChar) > 1: - self.pattern += ( - '|(?:' + ')|(?:'.join("%s[^%s]" % (re.escape(self.endQuoteChar[:i]), - _escapeRegexRangeChars(self.endQuoteChar[i])) - for i in range(len(self.endQuoteChar)-1,0,-1)) + ')' - ) - if escQuote: - self.pattern += (r'|(?:%s)' % re.escape(escQuote)) - if escChar: - self.pattern += (r'|(?:%s.)' % re.escape(escChar)) - self.escCharReplacePattern = re.escape(self.escChar)+"(.)" - self.pattern += (r')*%s' % re.escape(self.endQuoteChar)) - - try: - self.re = re.compile(self.pattern, self.flags) - self.reString = self.pattern - except sre_constants.error: - warnings.warn("invalid pattern (%s) passed to Regex" % self.pattern, - SyntaxWarning, stacklevel=2) - raise - - self.name = _ustr(self) - self.errmsg = "Expected " + self.name - self.mayIndexError = False - self.mayReturnEmpty = True - - def parseImpl( self, instring, loc, doActions=True ): - result = instring[loc] == self.firstQuoteChar and self.re.match(instring,loc) or None - if not result: - raise ParseException(instring, loc, self.errmsg, self) - - loc = result.end() - ret = result.group() - - if self.unquoteResults: - - # strip off quotes - ret = ret[self.quoteCharLen:-self.endQuoteCharLen] - - if isinstance(ret,basestring): - # replace escaped characters - if self.escChar: - ret = re.sub(self.escCharReplacePattern,"\g<1>",ret) - - # replace escaped quotes - if self.escQuote: - ret = ret.replace(self.escQuote, self.endQuoteChar) - - return loc, ret - - def __str__( self ): - try: - return super(QuotedString,self).__str__() - except: - pass - - if self.strRepr is None: - self.strRepr = "quoted string, starting with %s ending with %s" % (self.quoteChar, self.endQuoteChar) - - return self.strRepr - - -class CharsNotIn(Token): - """Token for matching words composed of characters *not* in a given set. - Defined with string containing all disallowed characters, and an optional - minimum, maximum, and/or exact length. 
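The `escChar` and `escQuote` parameters of `QuotedString` shown above behave differently, which a short sketch makes concrete (assumes the pyparsing package is installed; the inputs are illustrative):

```python
from pyparsing import QuotedString

# escChar: backslash-style escapes inside the quotes; unquoteResults strips the delimiters.
c_string = QuotedString('"', escChar="\\")
c_val = c_string.parseString(r'"a \"quoted\" word"')[0]

# escQuote: a doubled quote stands for one embedded quote (SQL style).
sql_string = QuotedString("'", escQuote="''")
sql_val = sql_string.parseString("'it''s'")[0]
print(c_val, sql_val)
```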
The default value for C{min} is 1 (a - minimum value < 1 is not valid); the default values for C{max} and C{exact} - are 0, meaning no maximum or exact length restriction. - """ - def __init__( self, notChars, min=1, max=0, exact=0 ): - super(CharsNotIn,self).__init__() - self.skipWhitespace = False - self.notChars = notChars - - if min < 1: - raise ValueError("cannot specify a minimum length < 1; use Optional(CharsNotIn()) if zero-length char group is permitted") - - self.minLen = min - - if max > 0: - self.maxLen = max - else: - self.maxLen = _MAX_INT - - if exact > 0: - self.maxLen = exact - self.minLen = exact - - self.name = _ustr(self) - self.errmsg = "Expected " + self.name - self.mayReturnEmpty = ( self.minLen == 0 ) - self.mayIndexError = False - - def parseImpl( self, instring, loc, doActions=True ): - if instring[loc] in self.notChars: - raise ParseException(instring, loc, self.errmsg, self) - - start = loc - loc += 1 - notchars = self.notChars - maxlen = min( start+self.maxLen, len(instring) ) - while loc < maxlen and \ - (instring[loc] not in notchars): - loc += 1 - - if loc - start < self.minLen: - raise ParseException(instring, loc, self.errmsg, self) - - return loc, instring[start:loc] - - def __str__( self ): - try: - return super(CharsNotIn, self).__str__() - except: - pass - - if self.strRepr is None: - if len(self.notChars) > 4: - self.strRepr = "!W:(%s...)" % self.notChars[:4] - else: - self.strRepr = "!W:(%s)" % self.notChars - - return self.strRepr - -class White(Token): - """Special matching class for matching whitespace. Normally, whitespace is ignored - by pyparsing grammars. This class is included when some whitespace structures - are significant. Define with a string containing the whitespace characters to be - matched; default is C{" \\t\\r\\n"}. 
Also takes optional C{min}, C{max}, and C{exact} arguments, - as defined for the C{L{Word}} class.""" - whiteStrs = { - " " : "<SPC>", - "\t": "<TAB>", - "\n": "<LF>", - "\r": "<CR>", - "\f": "<FF>", - } - def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): - super(White,self).__init__() - self.matchWhite = ws - self.setWhitespaceChars( "".join(c for c in self.whiteChars if c not in self.matchWhite) ) - #~ self.leaveWhitespace() - self.name = ("".join(White.whiteStrs[c] for c in self.matchWhite)) - self.mayReturnEmpty = True - self.errmsg = "Expected " + self.name - - self.minLen = min - - if max > 0: - self.maxLen = max - else: - self.maxLen = _MAX_INT - - if exact > 0: - self.maxLen = exact - self.minLen = exact - - def parseImpl( self, instring, loc, doActions=True ): - if not(instring[ loc ] in self.matchWhite): - raise ParseException(instring, loc, self.errmsg, self) - start = loc - loc += 1 - maxloc = start + self.maxLen - maxloc = min( maxloc, len(instring) ) - while loc < maxloc and instring[loc] in self.matchWhite: - loc += 1 - - if loc - start < self.minLen: - raise ParseException(instring, loc, self.errmsg, self) - - return loc, instring[start:loc] - - -class _PositionToken(Token): - def __init__( self ): - super(_PositionToken,self).__init__() - self.name=self.__class__.__name__ - self.mayReturnEmpty = True - self.mayIndexError = False - -class GoToColumn(_PositionToken): - """Token to advance to a specific column of input text; useful for tabular report scraping.""" - def __init__( self, colno ): - super(GoToColumn,self).__init__() - self.col = colno - - def preParse( self, instring, loc ): - if col(loc,instring) != self.col: - instrlen = len(instring) - if self.ignoreExprs: - loc = self._skipIgnorables( instring, loc ) - while loc < instrlen and instring[loc].isspace() and col( loc, instring ) != self.col : - loc += 1 - return loc - - def parseImpl( self, instring, loc, doActions=True ): - thiscol = col( loc, instring ) - if thiscol > self.col: - raise ParseException(
instring, loc, "Text not in expected column", self ) - newloc = loc + self.col - thiscol - ret = instring[ loc: newloc ] - return newloc, ret - -class LineStart(_PositionToken): - """Matches if current position is at the beginning of a line within the parse string""" - def __init__( self ): - super(LineStart,self).__init__() - self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) - self.errmsg = "Expected start of line" - - def preParse( self, instring, loc ): - preloc = super(LineStart,self).preParse(instring,loc) - if instring[preloc] == "\n": - loc += 1 - return loc - - def parseImpl( self, instring, loc, doActions=True ): - if not( loc==0 or - (loc == self.preParse( instring, 0 )) or - (instring[loc-1] == "\n") ): #col(loc, instring) != 1: - raise ParseException(instring, loc, self.errmsg, self) - return loc, [] - -class LineEnd(_PositionToken): - """Matches if current position is at the end of a line within the parse string""" - def __init__( self ): - super(LineEnd,self).__init__() - self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) - self.errmsg = "Expected end of line" - - def parseImpl( self, instring, loc, doActions=True ): - if loc<len(instring): - if instring[loc] == "\n": - return loc+1, "\n" - else: - raise ParseException(instring, loc, self.errmsg, self) - elif loc == len(instring): - return loc+1, [] - else: - raise ParseException(instring, loc, self.errmsg, self) - -class StringStart(_PositionToken): - """Matches if current position is at the beginning of the parse string""" - def __init__( self ): - super(StringStart,self).__init__() - self.errmsg = "Expected start of text" - - def parseImpl( self, instring, loc, doActions=True ): - if loc != 0: - # see if entire string up to here is just whitespace and ignoreables - if loc != self.preParse( instring, 0 ): - raise ParseException(instring, loc, self.errmsg, self) - return loc, [] - -class StringEnd(_PositionToken): - """Matches if current position is at the end of the parse string""" - def __init__( self ): - super(StringEnd,self).__init__() - self.errmsg = "Expected end of text" - - def parseImpl( self, instring, loc, doActions=True ): - if loc < len(instring): - raise ParseException(instring, loc, self.errmsg, self) - elif loc == len(instring): - return loc+1, [] - elif loc > len(instring): - return loc, [] - else: - raise ParseException(instring, loc, self.errmsg, self) - -class WordStart(_PositionToken): - """Matches if the current position is at the beginning of a Word, and - is not preceded by any character in a given set of C{wordChars} - (default=C{printables}). To emulate the C{\b} behavior of regular expressions, - use C{WordStart(alphanums)}. C{WordStart} will also match at the beginning of - the string being parsed, or at the beginning of a line.
- """ - def __init__(self, wordChars = printables): - super(WordStart,self).__init__() - self.wordChars = set(wordChars) - self.errmsg = "Not at the start of a word" - - def parseImpl(self, instring, loc, doActions=True ): - if loc != 0: - if (instring[loc-1] in self.wordChars or - instring[loc] not in self.wordChars): - raise ParseException(instring, loc, self.errmsg, self) - return loc, [] - -class WordEnd(_PositionToken): - """Matches if the current position is at the end of a Word, and - is not followed by any character in a given set of C{wordChars} - (default=C{printables}). To emulate the C{\b} behavior of regular expressions, - use C{WordEnd(alphanums)}. C{WordEnd} will also match at the end of - the string being parsed, or at the end of a line. - """ - def __init__(self, wordChars = printables): - super(WordEnd,self).__init__() - self.wordChars = set(wordChars) - self.skipWhitespace = False - self.errmsg = "Not at the end of a word" - - def parseImpl(self, instring, loc, doActions=True ): - instrlen = len(instring) - if instrlen>0 and loc<instrlen: - if (instring[loc] in self.wordChars or - instring[loc-1] not in self.wordChars): - raise ParseException(instring, loc, self.errmsg, self) - return loc, [] - - -class ParseExpression(ParserElement): - """Abstract subclass of C{ParserElement}, for combining and post-processing parsed tokens.""" - def __init__( self, exprs, savelist = False ): - super(ParseExpression,self).__init__(savelist) - if isinstance( exprs, _generatorType ): - exprs = list(exprs) - - if isinstance( exprs, basestring ): - self.exprs = [ Literal( exprs ) ] - elif isinstance( exprs, collections.Iterable ): - exprs = list(exprs) - # if sequence of strings provided, wrap with Literal - if all(isinstance(expr, basestring) for expr in exprs): - exprs = map(Literal, exprs) - self.exprs = list(exprs) - else: - try: - self.exprs = list( exprs ) - except TypeError: - self.exprs = [ exprs ] - self.callPreparse = False - - def __getitem__( self, i ): - return self.exprs[i] - - def append( self, other ): - self.exprs.append( other ) - self.strRepr = None - return self - - def leaveWhitespace( self ): - """Extends C{leaveWhitespace} defined in base class, and also invokes C{leaveWhitespace} on - all contained expressions.""" - self.skipWhitespace = False - self.exprs = [ e.copy() for e in self.exprs ] - for e in self.exprs: - e.leaveWhitespace() - return self - - def ignore( self, other ): - if isinstance( other, Suppress ): - if other not in self.ignoreExprs: - super( ParseExpression, self).ignore( other ) - for e in self.exprs: - e.ignore( self.ignoreExprs[-1] ) - else: - super( ParseExpression, self).ignore( other ) - for e in self.exprs: - e.ignore( self.ignoreExprs[-1] ) - return self - - def __str__( self ): - try: - return super(ParseExpression,self).__str__() - except: - pass - - if self.strRepr is None: - self.strRepr = "%s:(%s)" % ( self.__class__.__name__, _ustr(self.exprs) ) - return self.strRepr - - def streamline( self ): - super(ParseExpression,self).streamline() - - for e in self.exprs: - e.streamline() - - # collapse nested And's of the form And( And( And( a,b), c), d) to And( a,b,c,d ) - # but only if there are no parse actions or resultsNames on the nested And's - # (likewise for Or's and MatchFirst's) - if ( len(self.exprs) == 2 ): - other = self.exprs[0] - if ( isinstance( other, self.__class__ ) and - not(other.parseAction) and - other.resultsName is None and - not other.debug ): - self.exprs = other.exprs[:] + [ self.exprs[1] ] - self.strRepr = None - self.mayReturnEmpty |= other.mayReturnEmpty - self.mayIndexError |= other.mayIndexError - - other = self.exprs[-1] - if ( isinstance( other, self.__class__ ) and - not(other.parseAction) and - other.resultsName is None and - not other.debug ): - self.exprs = self.exprs[:-1] + other.exprs[:] - self.strRepr = None - self.mayReturnEmpty |= other.mayReturnEmpty - self.mayIndexError |= other.mayIndexError - - self.errmsg = "Expected " + _ustr(self) - - return self - - def setResultsName( self, name, listAllMatches=False ): - ret = super(ParseExpression,self).setResultsName(name,listAllMatches) - return ret - - def validate( self, validateTrace=[] ): - tmp = validateTrace[:]+[self] - for e in self.exprs: - e.validate(tmp) - self.checkRecursion( [] ) - - def copy(self): - ret = super(ParseExpression,self).copy() - ret.exprs = [e.copy() for e in self.exprs] - return ret - -class And(ParseExpression): - """Requires all given C{ParseExpression}s to be found in the given order. - Expressions may be separated by whitespace. - May be constructed using the C{'+'} operator. - """ - - class _ErrorStop(Empty): - def __init__(self, *args, **kwargs): - super(And._ErrorStop,self).__init__(*args, **kwargs) - self.name = '-' - self.leaveWhitespace() - - def __init__( self, exprs, savelist = True ): - super(And,self).__init__(exprs, savelist) - self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) - self.setWhitespaceChars( self.exprs[0].whiteChars ) - self.skipWhitespace = self.exprs[0].skipWhitespace - self.callPreparse = True - - def parseImpl( self, instring, loc, doActions=True ): - # pass False as last arg to _parse for first element, since we already - # pre-parsed the string as part of our And pre-parsing - loc, resultlist = self.exprs[0]._parse( instring, loc, doActions, callPreParse=False ) - errorStop = False - for e in self.exprs[1:]: - if isinstance(e, And._ErrorStop): - errorStop = True - continue - if errorStop: - try: - loc, exprtokens = e._parse( instring, loc, doActions ) - except ParseSyntaxException: - raise - except ParseBaseException as pe: - raise ParseSyntaxException(pe) - except IndexError: - raise ParseSyntaxException( ParseException(instring, len(instring), self.errmsg, self) ) - else: - loc, exprtokens = e._parse( instring, loc, doActions ) - if exprtokens or exprtokens.haskeys(): - resultlist += exprtokens - return loc, resultlist - - def __iadd__(self, other ): - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - return self.append( other ) #And( [ self, other ] ) - - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] - for e in self.exprs: - e.checkRecursion( subRecCheckList ) - if not e.mayReturnEmpty: - break - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "{" + " ".join(_ustr(e) for e in self.exprs) + "}" - - return self.strRepr - - -class Or(ParseExpression): - """Requires that at least one C{ParseExpression} is found. - If two expressions match, the expression that matches the longest string will be used. - May be constructed using the C{'^'} operator. - """ - def __init__( self, exprs, savelist = False ): - super(Or,self).__init__(exprs, savelist) - if self.exprs: - self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs) - else: - self.mayReturnEmpty = True - - def parseImpl( self, instring, loc, doActions=True ): - maxExcLoc = -1 - maxMatchLoc = -1 - maxException = None - for e in self.exprs: - try: - loc2 = e.tryParse( instring, loc ) - except ParseException as err: - if err.loc > maxExcLoc: - maxException = err - maxExcLoc = err.loc - except IndexError: - if len(instring) > maxExcLoc: - maxException = ParseException(instring,len(instring),e.errmsg,self) - maxExcLoc = len(instring) - else: - if loc2 > maxMatchLoc: - maxMatchLoc = loc2 - maxMatchExp = e - - if maxMatchLoc < 0: - if maxException is not None: - raise maxException - else: - raise ParseException(instring, loc, "no defined alternatives to match", self) - - return maxMatchExp._parse( instring, loc, doActions ) - - def __ixor__(self, other ): - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - return self.append( other ) #Or( [ self, other ] ) - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "{" + " ^ ".join(_ustr(e) for e in self.exprs) + "}" - - return self.strRepr - - def checkRecursion( self, parseElementList ): - subRecCheckList
= parseElementList[:] + [ self ] - for e in self.exprs: - e.checkRecursion( subRecCheckList ) - - -class MatchFirst(ParseExpression): - """Requires that at least one C{ParseExpression} is found. - If two expressions match, the first one listed is the one that will match. - May be constructed using the C{'|'} operator. - """ - def __init__( self, exprs, savelist = False ): - super(MatchFirst,self).__init__(exprs, savelist) - if self.exprs: - self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs) - else: - self.mayReturnEmpty = True - - def parseImpl( self, instring, loc, doActions=True ): - maxExcLoc = -1 - maxException = None - for e in self.exprs: - try: - ret = e._parse( instring, loc, doActions ) - return ret - except ParseException as err: - if err.loc > maxExcLoc: - maxException = err - maxExcLoc = err.loc - except IndexError: - if len(instring) > maxExcLoc: - maxException = ParseException(instring,len(instring),e.errmsg,self) - maxExcLoc = len(instring) - - # only got here if no expression matched, raise exception for match that made it the furthest - else: - if maxException is not None: - raise maxException - else: - raise ParseException(instring, loc, "no defined alternatives to match", self) - - def __ior__(self, other ): - if isinstance( other, basestring ): - other = ParserElement.literalStringClass( other ) - return self.append( other ) #MatchFirst( [ self, other ] ) - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "{" + " | ".join(_ustr(e) for e in self.exprs) + "}" - - return self.strRepr - - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] - for e in self.exprs: - e.checkRecursion( subRecCheckList ) - - -class Each(ParseExpression): - """Requires all given C{ParseExpression}s to be found, but in any order. - Expressions may be separated by whitespace. - May be constructed using the C{'&'} operator. 
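The first-match semantics of `MatchFirst` (`'|'`) defined above, as opposed to the longest-match semantics of `Or` (`'^'`), can be demonstrated with a short sketch (assumes the pyparsing package is installed; `integer`/`real` are illustrative names):

```python
from pyparsing import Word, nums

integer = Word(nums)
real = Word(nums) + "." + Word(nums)

first_match = (integer | real).parseString("3.1416").asList()    # MatchFirst: first listed wins
longest_match = (integer ^ real).parseString("3.1416").asList()  # Or: longest match wins
print(first_match, longest_match)
```

Listing `real` before `integer` in the `MatchFirst` would also yield the full number; `Or` makes the ordering irrelevant at the cost of trying every alternative.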
- """ - def __init__( self, exprs, savelist = True ): - super(Each,self).__init__(exprs, savelist) - self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) - self.skipWhitespace = True - self.initExprGroups = True - - def parseImpl( self, instring, loc, doActions=True ): - if self.initExprGroups: - opt1 = [ e.expr for e in self.exprs if isinstance(e,Optional) ] - opt2 = [ e for e in self.exprs if e.mayReturnEmpty and e not in opt1 ] - self.optionals = opt1 + opt2 - self.multioptionals = [ e.expr for e in self.exprs if isinstance(e,ZeroOrMore) ] - self.multirequired = [ e.expr for e in self.exprs if isinstance(e,OneOrMore) ] - self.required = [ e for e in self.exprs if not isinstance(e,(Optional,ZeroOrMore,OneOrMore)) ] - self.required += self.multirequired - self.initExprGroups = False - tmpLoc = loc - tmpReqd = self.required[:] - tmpOpt = self.optionals[:] - matchOrder = [] - - keepMatching = True - while keepMatching: - tmpExprs = tmpReqd + tmpOpt + self.multioptionals + self.multirequired - failed = [] - for e in tmpExprs: - try: - tmpLoc = e.tryParse( instring, tmpLoc ) - except ParseException: - failed.append(e) - else: - matchOrder.append(e) - if e in tmpReqd: - tmpReqd.remove(e) - elif e in tmpOpt: - tmpOpt.remove(e) - if len(failed) == len(tmpExprs): - keepMatching = False - - if tmpReqd: - missing = ", ".join(_ustr(e) for e in tmpReqd) - raise ParseException(instring,loc,"Missing one or more required elements (%s)" % missing ) - - # add any unmatched Optionals, in case they have default values defined - matchOrder += [e for e in self.exprs if isinstance(e,Optional) and e.expr in tmpOpt] - - resultlist = [] - for e in matchOrder: - loc,results = e._parse(instring,loc,doActions) - resultlist.append(results) - - finalResults = ParseResults([]) - for r in resultlist: - dups = {} - for k in r.keys(): - if k in finalResults: - tmp = ParseResults(finalResults[k]) - tmp += ParseResults(r[k]) - dups[k] = tmp - finalResults += ParseResults(r) - for k,v in 
dups.items(): - finalResults[k] = v - return loc, finalResults - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "{" + " & ".join(_ustr(e) for e in self.exprs) + "}" - - return self.strRepr - - def checkRecursion( self, parseElementList ): - subRecCheckList = parseElementList[:] + [ self ] - for e in self.exprs: - e.checkRecursion( subRecCheckList ) - - -class ParseElementEnhance(ParserElement): - """Abstract subclass of C{ParserElement}, for combining and post-processing parsed tokens.""" - def __init__( self, expr, savelist=False ): - super(ParseElementEnhance,self).__init__(savelist) - if isinstance( expr, basestring ): - expr = Literal(expr) - self.expr = expr - self.strRepr = None - if expr is not None: - self.mayIndexError = expr.mayIndexError - self.mayReturnEmpty = expr.mayReturnEmpty - self.setWhitespaceChars( expr.whiteChars ) - self.skipWhitespace = expr.skipWhitespace - self.saveAsList = expr.saveAsList - self.callPreparse = expr.callPreparse - self.ignoreExprs.extend(expr.ignoreExprs) - - def parseImpl( self, instring, loc, doActions=True ): - if self.expr is not None: - return self.expr._parse( instring, loc, doActions, callPreParse=False ) - else: - raise ParseException("",loc,self.errmsg,self) - - def leaveWhitespace( self ): - self.skipWhitespace = False - self.expr = self.expr.copy() - if self.expr is not None: - self.expr.leaveWhitespace() - return self - - def ignore( self, other ): - if isinstance( other, Suppress ): - if other not in self.ignoreExprs: - super( ParseElementEnhance, self).ignore( other ) - if self.expr is not None: - self.expr.ignore( self.ignoreExprs[-1] ) - else: - super( ParseElementEnhance, self).ignore( other ) - if self.expr is not None: - self.expr.ignore( self.ignoreExprs[-1] ) - return self - - def streamline( self ): - super(ParseElementEnhance,self).streamline() - if self.expr is not None: - self.expr.streamline() - return self - - def checkRecursion( 
self, parseElementList ): - if self in parseElementList: - raise RecursiveGrammarException( parseElementList+[self] ) - subRecCheckList = parseElementList[:] + [ self ] - if self.expr is not None: - self.expr.checkRecursion( subRecCheckList ) - - def validate( self, validateTrace=[] ): - tmp = validateTrace[:]+[self] - if self.expr is not None: - self.expr.validate(tmp) - self.checkRecursion( [] ) - - def __str__( self ): - try: - return super(ParseElementEnhance,self).__str__() - except: - pass - - if self.strRepr is None and self.expr is not None: - self.strRepr = "%s:(%s)" % ( self.__class__.__name__, _ustr(self.expr) ) - return self.strRepr - - -class FollowedBy(ParseElementEnhance): - """Lookahead matching of the given parse expression. C{FollowedBy} - does *not* advance the parsing position within the input string, it only - verifies that the specified parse expression matches at the current - position. C{FollowedBy} always returns a null token list.""" - def __init__( self, expr ): - super(FollowedBy,self).__init__(expr) - self.mayReturnEmpty = True - - def parseImpl( self, instring, loc, doActions=True ): - self.expr.tryParse( instring, loc ) - return loc, [] - - -class NotAny(ParseElementEnhance): - """Lookahead to disallow matching with the given parse expression. C{NotAny} - does *not* advance the parsing position within the input string, it only - verifies that the specified parse expression does *not* match at the current - position. Also, C{NotAny} does *not* skip over leading whitespace. C{NotAny} - always returns a null token list. 
May be constructed using the '~' operator.""" - def __init__( self, expr ): - super(NotAny,self).__init__(expr) - #~ self.leaveWhitespace() - self.skipWhitespace = False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs - self.mayReturnEmpty = True - self.errmsg = "Found unwanted token, "+_ustr(self.expr) - - def parseImpl( self, instring, loc, doActions=True ): - try: - self.expr.tryParse( instring, loc ) - except (ParseException,IndexError): - pass - else: - raise ParseException(instring, loc, self.errmsg, self) - return loc, [] - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "~{" + _ustr(self.expr) + "}" - - return self.strRepr - - -class ZeroOrMore(ParseElementEnhance): - """Optional repetition of zero or more of the given expression.""" - def __init__( self, expr ): - super(ZeroOrMore,self).__init__(expr) - self.mayReturnEmpty = True - - def parseImpl( self, instring, loc, doActions=True ): - tokens = [] - try: - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) - hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) - while 1: - if hasIgnoreExprs: - preloc = self._skipIgnorables( instring, loc ) - else: - preloc = loc - loc, tmptokens = self.expr._parse( instring, preloc, doActions ) - if tmptokens or tmptokens.haskeys(): - tokens += tmptokens - except (ParseException,IndexError): - pass - - return loc, tokens - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "[" + _ustr(self.expr) + "]..." 
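The `ZeroOrMore`/`OneOrMore` repetition classes above can be sketched in use (assumes the pyparsing package is installed; the grammar is illustrative):

```python
from pyparsing import OneOrMore, Suppress, Word, ZeroOrMore, alphas

word = Word(alphas)
csv_line = word + ZeroOrMore(Suppress(",") + word)  # one word, then any number of ",word"
words = OneOrMore(word)                             # at least one word

csv_tokens = csv_line.parseString("a,b,c").asList()
word_tokens = words.parseString("to be or").asList()
print(csv_tokens, word_tokens)
```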
- - return self.strRepr - - def setResultsName( self, name, listAllMatches=False ): - ret = super(ZeroOrMore,self).setResultsName(name,listAllMatches) - ret.saveAsList = True - return ret - - -class OneOrMore(ParseElementEnhance): - """Repetition of one or more of the given expression.""" - def parseImpl( self, instring, loc, doActions=True ): - # must be at least one - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) - try: - hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) - while 1: - if hasIgnoreExprs: - preloc = self._skipIgnorables( instring, loc ) - else: - preloc = loc - loc, tmptokens = self.expr._parse( instring, preloc, doActions ) - if tmptokens or tmptokens.haskeys(): - tokens += tmptokens - except (ParseException,IndexError): - pass - - return loc, tokens - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "{" + _ustr(self.expr) + "}..." - - return self.strRepr - - def setResultsName( self, name, listAllMatches=False ): - ret = super(OneOrMore,self).setResultsName(name,listAllMatches) - ret.saveAsList = True - return ret - -class _NullToken(object): - def __bool__(self): - return False - __nonzero__ = __bool__ - def __str__(self): - return "" - -_optionalNotMatched = _NullToken() -class Optional(ParseElementEnhance): - """Optional matching of the given expression. - A default return string can also be specified, if the optional expression - is not found. 
- """ - def __init__( self, expr, default=_optionalNotMatched ): - super(Optional,self).__init__( expr, savelist=False ) - self.defaultValue = default - self.mayReturnEmpty = True - - def parseImpl( self, instring, loc, doActions=True ): - try: - loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) - except (ParseException,IndexError): - if self.defaultValue is not _optionalNotMatched: - if self.expr.resultsName: - tokens = ParseResults([ self.defaultValue ]) - tokens[self.expr.resultsName] = self.defaultValue - else: - tokens = [ self.defaultValue ] - else: - tokens = [] - return loc, tokens - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - if self.strRepr is None: - self.strRepr = "[" + _ustr(self.expr) + "]" - - return self.strRepr - - -class SkipTo(ParseElementEnhance): - """Token for skipping over all undefined text until the matched expression is found. - If C{include} is set to true, the matched expression is also parsed (the skipped text - and matched expression are returned as a 2-element list). The C{ignore} - argument is used to define grammars (typically quoted strings and comments) that - might contain false matches. 
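The `SkipTo` behavior described above (collect all undefined text up to a target expression) can be sketched as follows (assumes the pyparsing package is installed; the statement grammar is illustrative):

```python
from pyparsing import Literal, SkipTo

# SkipTo collects the skipped text; with include=False the delimiter is matched separately.
stmt = SkipTo(Literal(";")) + Literal(";")
tokens = stmt.parseString("x = f(a, b);").asList()
print(tokens)
```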
- """ - def __init__( self, other, include=False, ignore=None, failOn=None ): - super( SkipTo, self ).__init__( other ) - self.ignoreExpr = ignore - self.mayReturnEmpty = True - self.mayIndexError = False - self.includeMatch = include - self.asList = False - if failOn is not None and isinstance(failOn, basestring): - self.failOn = Literal(failOn) - else: - self.failOn = failOn - self.errmsg = "No match found for "+_ustr(self.expr) - - def parseImpl( self, instring, loc, doActions=True ): - startLoc = loc - instrlen = len(instring) - expr = self.expr - failParse = False - while loc <= instrlen: - try: - if self.failOn: - try: - self.failOn.tryParse(instring, loc) - except ParseBaseException: - pass - else: - failParse = True - raise ParseException(instring, loc, "Found expression " + str(self.failOn)) - failParse = False - if self.ignoreExpr is not None: - while 1: - try: - loc = self.ignoreExpr.tryParse(instring,loc) - # print("found ignoreExpr, advance to", loc) - except ParseBaseException: - break - expr._parse( instring, loc, doActions=False, callPreParse=False ) - skipText = instring[startLoc:loc] - if self.includeMatch: - loc,mat = expr._parse(instring,loc,doActions,callPreParse=False) - if mat: - skipRes = ParseResults( skipText ) - skipRes += mat - return loc, [ skipRes ] - else: - return loc, [ skipText ] - else: - return loc, [ skipText ] - except (ParseException,IndexError): - if failParse: - raise - else: - loc += 1 - raise ParseException(instring, loc, self.errmsg, self) - -class Forward(ParseElementEnhance): - """Forward declaration of an expression to be defined later - - used for recursive grammars, such as algebraic infix notation. - When the expression is known, it is assigned to the C{Forward} variable using the '<<' operator. - - Note: take care when assigning to C{Forward} not to overlook precedence of operators. 
- Specifically, '|' has a lower precedence than '<<', so that:: - fwdExpr << a | b | c - will actually be evaluated as:: - (fwdExpr << a) | b | c - thereby leaving b and c out as parseable alternatives. It is recommended that you - explicitly group the values inserted into the C{Forward}:: - fwdExpr << (a | b | c) - Converting to use the '<<=' operator instead will avoid this problem. - """ - def __init__( self, other=None ): - super(Forward,self).__init__( other, savelist=False ) - - def __lshift__( self, other ): - if isinstance( other, basestring ): - other = ParserElement.literalStringClass(other) - self.expr = other - self.mayReturnEmpty = other.mayReturnEmpty - self.strRepr = None - self.mayIndexError = self.expr.mayIndexError - self.mayReturnEmpty = self.expr.mayReturnEmpty - self.setWhitespaceChars( self.expr.whiteChars ) - self.skipWhitespace = self.expr.skipWhitespace - self.saveAsList = self.expr.saveAsList - self.ignoreExprs.extend(self.expr.ignoreExprs) - return self - - def __ilshift__(self, other): - return self << other - - def leaveWhitespace( self ): - self.skipWhitespace = False - return self - - def streamline( self ): - if not self.streamlined: - self.streamlined = True - if self.expr is not None: - self.expr.streamline() - return self - - def validate( self, validateTrace=[] ): - if self not in validateTrace: - tmp = validateTrace[:]+[self] - if self.expr is not None: - self.expr.validate(tmp) - self.checkRecursion([]) - - def __str__( self ): - if hasattr(self,"name"): - return self.name - - self._revertClass = self.__class__ - self.__class__ = _ForwardNoRecurse - try: - if self.expr is not None: - retString = _ustr(self.expr) - else: - retString = "None" - finally: - self.__class__ = self._revertClass - return self.__class__.__name__ + ": " + retString - - def copy(self): - if self.expr is not None: - return super(Forward,self).copy() - else: - ret = Forward() - ret <<= self - return ret - -class _ForwardNoRecurse(Forward): - def __str__( 
self ): - return "..." - -class TokenConverter(ParseElementEnhance): - """Abstract subclass of C{ParseExpression}, for converting parsed results.""" - def __init__( self, expr, savelist=False ): - super(TokenConverter,self).__init__( expr )#, savelist ) - self.saveAsList = False - -class Upcase(TokenConverter): - """Converter to upper case all matching tokens.""" - def __init__(self, *args): - super(Upcase,self).__init__(*args) - warnings.warn("Upcase class is deprecated, use upcaseTokens parse action instead", - DeprecationWarning,stacklevel=2) - - def postParse( self, instring, loc, tokenlist ): - return list(map( str.upper, tokenlist )) - - -class Combine(TokenConverter): - """Converter to concatenate all matching tokens to a single string. - By default, the matching patterns must also be contiguous in the input string; - this can be disabled by specifying C{'adjacent=False'} in the constructor. - """ - def __init__( self, expr, joinString="", adjacent=True ): - super(Combine,self).__init__( expr ) - # suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself - if adjacent: - self.leaveWhitespace() - self.adjacent = adjacent - self.skipWhitespace = True - self.joinString = joinString - self.callPreparse = True - - def ignore( self, other ): - if self.adjacent: - ParserElement.ignore(self, other) - else: - super( Combine, self).ignore( other ) - return self - - def postParse( self, instring, loc, tokenlist ): - retToks = tokenlist.copy() - del retToks[:] - retToks += ParseResults([ "".join(tokenlist._asStringList(self.joinString)) ], modal=self.modalResults) - - if self.resultsName and retToks.haskeys(): - return [ retToks ] - else: - return retToks - -class Group(TokenConverter): - """Converter to return the matched tokens as a list - useful for returning tokens of C{L{ZeroOrMore}} and C{L{OneOrMore}} expressions.""" - def __init__( self, expr ): - super(Group,self).__init__( expr ) - self.saveAsList = True - - def 
postParse( self, instring, loc, tokenlist ): - return [ tokenlist ] - -class Dict(TokenConverter): - """Converter to return a repetitive expression as a list, but also as a dictionary. - Each element can also be referenced using the first token in the expression as its key. - Useful for tabular report scraping when the first column can be used as a item key. - """ - def __init__( self, expr ): - super(Dict,self).__init__( expr ) - self.saveAsList = True - - def postParse( self, instring, loc, tokenlist ): - for i,tok in enumerate(tokenlist): - if len(tok) == 0: - continue - ikey = tok[0] - if isinstance(ikey,int): - ikey = _ustr(tok[0]).strip() - if len(tok)==1: - tokenlist[ikey] = _ParseResultsWithOffset("",i) - elif len(tok)==2 and not isinstance(tok[1],ParseResults): - tokenlist[ikey] = _ParseResultsWithOffset(tok[1],i) - else: - dictvalue = tok.copy() #ParseResults(i) - del dictvalue[0] - if len(dictvalue)!= 1 or (isinstance(dictvalue,ParseResults) and dictvalue.haskeys()): - tokenlist[ikey] = _ParseResultsWithOffset(dictvalue,i) - else: - tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0],i) - - if self.resultsName: - return [ tokenlist ] - else: - return tokenlist - - -class Suppress(TokenConverter): - """Converter for ignoring the results of a parsed expression.""" - def postParse( self, instring, loc, tokenlist ): - return [] - - def suppress( self ): - return self - - -class OnlyOnce(object): - """Wrapper for parse actions, to ensure they are only called once.""" - def __init__(self, methodCall): - self.callable = _trim_arity(methodCall) - self.called = False - def __call__(self,s,l,t): - if not self.called: - results = self.callable(s,l,t) - self.called = True - return results - raise ParseException(s,l,"") - def reset(self): - self.called = False - -def traceParseAction(f): - """Decorator for debugging parse actions.""" - f = _trim_arity(f) - def z(*paArgs): - thisFunc = f.func_name - s,l,t = paArgs[-3:] - if len(paArgs)>3: - thisFunc = 
paArgs[0].__class__.__name__ + '.' + thisFunc - sys.stderr.write( ">>entering %s(line: '%s', %d, %s)\n" % (thisFunc,line(l,s),l,t) ) - try: - ret = f(*paArgs) - except Exception as exc: - sys.stderr.write( "<", "|".join( [ _escapeRegexChars(sym) for sym in symbols] )) - try: - if len(symbols)==len("".join(symbols)): - return Regex( "[%s]" % "".join(_escapeRegexRangeChars(sym) for sym in symbols) ) - else: - return Regex( "|".join(re.escape(sym) for sym in symbols) ) - except: - warnings.warn("Exception creating Regex for oneOf, building MatchFirst", - SyntaxWarning, stacklevel=2) - - - # last resort, just use MatchFirst - return MatchFirst( [ parseElementClass(sym) for sym in symbols ] ) - -def dictOf( key, value ): - """Helper to easily and clearly define a dictionary by specifying the respective patterns - for the key and value. Takes care of defining the C{L{Dict}}, C{L{ZeroOrMore}}, and C{L{Group}} tokens - in the proper order. The key pattern can include delimiting markers or punctuation, - as long as they are suppressed, thereby leaving the significant key text. The value - pattern can include named results, so that the C{Dict} results can include named token - fields. - """ - return Dict( ZeroOrMore( Group ( key + value ) ) ) - -def originalTextFor(expr, asString=True): - """Helper to return the original, untokenized text for a given expression. Useful to - restore the parsed fields of an HTML start tag into the raw tag text itself, or to - revert separate tokens with intervening whitespace back to the original matching - input text. Simpler to use than the parse action C{L{keepOriginalText}}, and does not - require the inspect module to chase up the call stack. By default, returns a - string containing the original parsed text. 
- - If the optional C{asString} argument is passed as C{False}, then the return value is a - C{L{ParseResults}} containing any results names that were originally matched, and a - single token containing the original matched text from the input string. So if - the expression passed to C{L{originalTextFor}} contains expressions with defined - results names, you must set C{asString} to C{False} if you want to preserve those - results name values.""" - locMarker = Empty().setParseAction(lambda s,loc,t: loc) - endlocMarker = locMarker.copy() - endlocMarker.callPreparse = False - matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end") - if asString: - extractText = lambda s,l,t: s[t._original_start:t._original_end] - else: - def extractText(s,l,t): - del t[:] - t.insert(0, s[t._original_start:t._original_end]) - del t["_original_start"] - del t["_original_end"] - matchExpr.setParseAction(extractText) - return matchExpr - -def ungroup(expr): - """Helper to undo pyparsing's default grouping of And expressions, even - if all but one are non-empty.""" - return TokenConverter(expr).setParseAction(lambda t:t[0]) - -def locatedExpr(expr): - """Helper to decorate a returned token with its starting and ending locations in the input string. 
- This helper adds the following results names:
- - locn_start = location where matched expression begins
- - locn_end = location where matched expression ends
- - value = the actual parsed results
-
- Be careful if the input text contains C{<TAB>} characters, you may want to call
- C{L{ParserElement.parseWithTabs}}
- """
- locator = Empty().setParseAction(lambda s,l,t: l)
- return Group(locator("locn_start") + expr("value") + locator.copy().leaveWhitespace()("locn_end"))
-
-
-# convenience constants for positional expressions
-empty = Empty().setName("empty")
-lineStart = LineStart().setName("lineStart")
-lineEnd = LineEnd().setName("lineEnd")
-stringStart = StringStart().setName("stringStart")
-stringEnd = StringEnd().setName("stringEnd")
-
-_escapedPunc = Word( _bslash, r"\[]-*.$+^?()~ ", exact=2 ).setParseAction(lambda s,l,t:t[0][1])
-_escapedHexChar = Regex(r"\\0?[xX][0-9a-fA-F]+").setParseAction(lambda s,l,t:unichr(int(t[0].lstrip(r'\0x'),16)))
-_escapedOctChar = Regex(r"\\0[0-7]+").setParseAction(lambda s,l,t:unichr(int(t[0][1:],8)))
-_singleChar = _escapedPunc | _escapedHexChar | _escapedOctChar | Word(printables, excludeChars=r'\]', exact=1)
-_charRange = Group(_singleChar + Suppress("-") + _singleChar)
-_reBracketExpr = Literal("[") + Optional("^").setResultsName("negate") + Group( OneOrMore( _charRange | _singleChar ) ).setResultsName("body") + "]"
-
-def srange(s):
- r"""Helper to easily define string ranges for use in Word construction. Borrows
- syntax from regexp '[]' string range definitions::
- srange("[0-9]") -> "0123456789"
- srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz"
- srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_"
- The input string must be enclosed in []'s, and the returned string is the expanded
- character set joined into a single string.
- The values enclosed in the []'s may be::
- a single character
- an escaped character with a leading backslash (such as \- or \])
- an escaped hex character with a leading '\x' (\x21, which is a '!'
character) - (\0x## is also supported for backwards compatibility) - an escaped octal character with a leading '\0' (\041, which is a '!' character) - a range of any of the above, separated by a dash ('a-z', etc.) - any combination of the above ('aeiouy', 'a-zA-Z0-9_$', etc.) - """ - _expanded = lambda p: p if not isinstance(p,ParseResults) else ''.join(unichr(c) for c in range(ord(p[0]),ord(p[1])+1)) - try: - return "".join(_expanded(part) for part in _reBracketExpr.parseString(s).body) - except: - return "" - -def matchOnlyAtCol(n): - """Helper method for defining parse actions that require matching at a specific - column in the input text. - """ - def verifyCol(strg,locn,toks): - if col(locn,strg) != n: - raise ParseException(strg,locn,"matched token not at column %d" % n) - return verifyCol - -def replaceWith(replStr): - """Helper method for common parse actions that simply return a literal value. Especially - useful when used with C{L{transformString}()}. - """ - def _replFunc(*args): - return [replStr] - return _replFunc - -def removeQuotes(s,l,t): - """Helper parse action for removing quotation marks from parsed quoted strings. - To use, add this parse action to quoted string using:: - quotedString.setParseAction( removeQuotes ) - """ - return t[0][1:-1] - -def upcaseTokens(s,l,t): - """Helper parse action to convert tokens to upper case.""" - return [ tt.upper() for tt in map(_ustr,t) ] - -def downcaseTokens(s,l,t): - """Helper parse action to convert tokens to lower case.""" - return [ tt.lower() for tt in map(_ustr,t) ] - -def keepOriginalText(s,startLoc,t): - """DEPRECATED - use new helper method C{L{originalTextFor}}. 
- Helper parse action to preserve original parsed text,
- overriding any nested parse actions."""
- try:
- endloc = getTokensEndLoc()
- except ParseException:
- raise ParseFatalException("incorrect usage of keepOriginalText - may only be called as a parse action")
- del t[:]
- t += ParseResults(s[startLoc:endloc])
- return t
-
-def getTokensEndLoc():
- """Method to be called from within a parse action to determine the end
- location of the parsed tokens."""
- import inspect
- fstack = inspect.stack()
- try:
- # search up the stack (through intervening argument normalizers) for correct calling routine
- for f in fstack[2:]:
- if f[3] == "_parseNoCache":
- endloc = f[0].f_locals["loc"]
- return endloc
- else:
- raise ParseFatalException("incorrect usage of getTokensEndLoc - may only be called from within a parse action")
- finally:
- del fstack
-
-def _makeTags(tagStr, xml):
- """Internal helper to construct opening and closing tag expressions, given a tag name"""
- if isinstance(tagStr,basestring):
- resname = tagStr
- tagStr = Keyword(tagStr, caseless=not xml)
- else:
- resname = tagStr.name
-
- tagAttrName = Word(alphas,alphanums+"_-:")
- if (xml):
- tagAttrValue = dblQuotedString.copy().setParseAction( removeQuotes )
- openTag = Suppress("<") + tagStr("tag") + \
- Dict(ZeroOrMore(Group( tagAttrName + Suppress("=") + tagAttrValue ))) + \
- Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">")
- else:
- printablesLessRAbrack = "".join(c for c in printables if c not in ">")
- tagAttrValue = quotedString.copy().setParseAction( removeQuotes ) | Word(printablesLessRAbrack)
- openTag = Suppress("<") + tagStr("tag") + \
- Dict(ZeroOrMore(Group( tagAttrName.setParseAction(downcaseTokens) + \
- Optional( Suppress("=") + tagAttrValue ) ))) + \
- Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">")
- closeTag = Combine(_L("</") + tagStr + ">")
-
- openTag = 
openTag.setResultsName("start"+"".join(resname.replace(":"," ").title().split())).setName("<%s>" % tagStr)
- closeTag = closeTag.setResultsName("end"+"".join(resname.replace(":"," ").title().split())).setName("</%s>" % tagStr)
- openTag.tag = resname
- closeTag.tag = resname
- return openTag, closeTag
-
-def makeHTMLTags(tagStr):
- """Helper to construct opening and closing tag expressions for HTML, given a tag name"""
- return _makeTags( tagStr, False )
-
-def makeXMLTags(tagStr):
- """Helper to construct opening and closing tag expressions for XML, given a tag name"""
- return _makeTags( tagStr, True )
-
-def withAttribute(*args,**attrDict):
- """Helper to create a validating parse action to be used with start tags created
- with C{L{makeXMLTags}} or C{L{makeHTMLTags}}. Use C{withAttribute} to qualify a starting tag
- with a required attribute value, to avoid false matches on common tags such as
- C{<TD>} or C{<DIV
}. - - Call C{withAttribute} with a series of attribute names and values. Specify the list - of filter attributes names and values as: - - keyword arguments, as in C{(align="right")}, or - - as an explicit dict with C{**} operator, when an attribute name is also a Python - reserved word, as in C{**{"class":"Customer", "align":"right"}} - - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") ) - For attribute names with a namespace prefix, you must use the second form. Attribute - names are matched insensitive to upper/lower case. - - To verify that the attribute exists, but without specifying a value, pass - C{withAttribute.ANY_VALUE} as the value. - """ - if args: - attrs = args[:] - else: - attrs = attrDict.items() - attrs = [(k,v) for k,v in attrs] - def pa(s,l,tokens): - for attrName,attrValue in attrs: - if attrName not in tokens: - raise ParseException(s,l,"no matching attribute " + attrName) - if attrValue != withAttribute.ANY_VALUE and tokens[attrName] != attrValue: - raise ParseException(s,l,"attribute '%s' has value '%s', must be '%s'" % - (attrName, tokens[attrName], attrValue)) - return pa -withAttribute.ANY_VALUE = object() - -opAssoc = _Constants() -opAssoc.LEFT = object() -opAssoc.RIGHT = object() - -def infixNotation( baseExpr, opList, lpar=Suppress('('), rpar=Suppress(')') ): - """Helper method for constructing grammars of expressions made up of - operators working in a precedence hierarchy. Operators may be unary or - binary, left- or right-associative. Parse actions can also be attached - to operator expressions. 
- - Parameters: - - baseExpr - expression representing the most basic element for the nested - - opList - list of tuples, one for each operator precedence level in the - expression grammar; each tuple is of the form - (opExpr, numTerms, rightLeftAssoc, parseAction), where: - - opExpr is the pyparsing expression for the operator; - may also be a string, which will be converted to a Literal; - if numTerms is 3, opExpr is a tuple of two expressions, for the - two operators separating the 3 terms - - numTerms is the number of terms for this operator (must - be 1, 2, or 3) - - rightLeftAssoc is the indicator whether the operator is - right or left associative, using the pyparsing-defined - constants C{opAssoc.RIGHT} and C{opAssoc.LEFT}. - - parseAction is the parse action to be associated with - expressions matching this operator expression (the - parse action tuple member may be omitted) - - lpar - expression for matching left-parentheses (default=Suppress('(')) - - rpar - expression for matching right-parentheses (default=Suppress(')')) - """ - ret = Forward() - lastExpr = baseExpr | ( lpar + ret + rpar ) - for i,operDef in enumerate(opList): - opExpr,arity,rightLeftAssoc,pa = (operDef + (None,))[:4] - if arity == 3: - if opExpr is None or len(opExpr) != 2: - raise ValueError("if numterms=3, opExpr must be a tuple or list of two expressions") - opExpr1, opExpr2 = opExpr - thisExpr = Forward()#.setName("expr%d" % i) - if rightLeftAssoc == opAssoc.LEFT: - if arity == 1: - matchExpr = FollowedBy(lastExpr + opExpr) + Group( lastExpr + OneOrMore( opExpr ) ) - elif arity == 2: - if opExpr is not None: - matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( lastExpr + OneOrMore( opExpr + lastExpr ) ) - else: - matchExpr = FollowedBy(lastExpr+lastExpr) + Group( lastExpr + OneOrMore(lastExpr) ) - elif arity == 3: - matchExpr = FollowedBy(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) + \ - Group( lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr ) - else: - raise 
ValueError("operator must be unary (1), binary (2), or ternary (3)") - elif rightLeftAssoc == opAssoc.RIGHT: - if arity == 1: - # try to avoid LR with this extra test - if not isinstance(opExpr, Optional): - opExpr = Optional(opExpr) - matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( opExpr + thisExpr ) - elif arity == 2: - if opExpr is not None: - matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( lastExpr + OneOrMore( opExpr + thisExpr ) ) - else: - matchExpr = FollowedBy(lastExpr + thisExpr) + Group( lastExpr + OneOrMore( thisExpr ) ) - elif arity == 3: - matchExpr = FollowedBy(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) + \ - Group( lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr ) - else: - raise ValueError("operator must be unary (1), binary (2), or ternary (3)") - else: - raise ValueError("operator must indicate right or left associativity") - if pa: - matchExpr.setParseAction( pa ) - thisExpr <<= ( matchExpr | lastExpr ) - lastExpr = thisExpr - ret <<= lastExpr - return ret -operatorPrecedence = infixNotation - -dblQuotedString = Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*"').setName("string enclosed in double quotes") -sglQuotedString = Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*'").setName("string enclosed in single quotes") -quotedString = Regex(r'''(?:"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*")|(?:'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*')''').setName("quotedString using single or double quotes") -unicodeString = Combine(_L('u') + quotedString.copy()) - -def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.copy()): - """Helper method for defining nested lists enclosed in opening and closing - delimiters ("(" and ")" are the default). 
- - Parameters: - - opener - opening character for a nested list (default="("); can also be a pyparsing expression - - closer - closing character for a nested list (default=")"); can also be a pyparsing expression - - content - expression for items within the nested lists (default=None) - - ignoreExpr - expression for ignoring opening and closing delimiters (default=quotedString) - - If an expression is not provided for the content argument, the nested - expression will capture all whitespace-delimited content between delimiters - as a list of separate values. - - Use the C{ignoreExpr} argument to define expressions that may contain - opening or closing characters that should not be treated as opening - or closing characters for nesting, such as quotedString or a comment - expression. Specify multiple expressions using an C{L{Or}} or C{L{MatchFirst}}. - The default is L{quotedString}, but if no expressions are to be ignored, - then pass C{None} for this argument. - """ - if opener == closer: - raise ValueError("opening and closing strings cannot be the same") - if content is None: - if isinstance(opener,basestring) and isinstance(closer,basestring): - if len(opener) == 1 and len(closer)==1: - if ignoreExpr is not None: - content = (Combine(OneOrMore(~ignoreExpr + - CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) - else: - content = (empty.copy()+CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS - ).setParseAction(lambda t:t[0].strip())) - else: - if ignoreExpr is not None: - content = (Combine(OneOrMore(~ignoreExpr + - ~Literal(opener) + ~Literal(closer) + - CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) - else: - content = (Combine(OneOrMore(~Literal(opener) + ~Literal(closer) + - CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) - ).setParseAction(lambda t:t[0].strip())) - else: - raise ValueError("opening and closing arguments must be 
strings if no content expression is given") - ret = Forward() - if ignoreExpr is not None: - ret <<= Group( Suppress(opener) + ZeroOrMore( ignoreExpr | ret | content ) + Suppress(closer) ) - else: - ret <<= Group( Suppress(opener) + ZeroOrMore( ret | content ) + Suppress(closer) ) - return ret - -def indentedBlock(blockStatementExpr, indentStack, indent=True): - """Helper method for defining space-delimited indentation blocks, such as - those used to define block statements in Python source code. - - Parameters: - - blockStatementExpr - expression defining syntax of statement that - is repeated within the indented block - - indentStack - list created by caller to manage indentation stack - (multiple statementWithIndentedBlock expressions within a single grammar - should share a common indentStack) - - indent - boolean indicating whether block must be indented beyond the - the current level; set to False for block of left-most statements - (default=True) - - A valid block must contain at least one C{blockStatement}. 
- """ - def checkPeerIndent(s,l,t): - if l >= len(s): return - curCol = col(l,s) - if curCol != indentStack[-1]: - if curCol > indentStack[-1]: - raise ParseFatalException(s,l,"illegal nesting") - raise ParseException(s,l,"not a peer entry") - - def checkSubIndent(s,l,t): - curCol = col(l,s) - if curCol > indentStack[-1]: - indentStack.append( curCol ) - else: - raise ParseException(s,l,"not a subentry") - - def checkUnindent(s,l,t): - if l >= len(s): return - curCol = col(l,s) - if not(indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]): - raise ParseException(s,l,"not an unindent") - indentStack.pop() - - NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress()) - INDENT = Empty() + Empty().setParseAction(checkSubIndent) - PEER = Empty().setParseAction(checkPeerIndent) - UNDENT = Empty().setParseAction(checkUnindent) - if indent: - smExpr = Group( Optional(NL) + - #~ FollowedBy(blockStatementExpr) + - INDENT + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) + UNDENT) - else: - smExpr = Group( Optional(NL) + - (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) ) - blockStatementExpr.ignore(_bslash + LineEnd()) - return smExpr - -alphas8bit = srange(r"[\0xc0-\0xd6\0xd8-\0xf6\0xf8-\0xff]") -punc8bit = srange(r"[\0xa1-\0xbf\0xd7\0xf7]") - -anyOpenTag,anyCloseTag = makeHTMLTags(Word(alphas,alphanums+"_:")) -commonHTMLEntity = Combine(_L("&") + oneOf("gt lt amp nbsp quot").setResultsName("entity") +";").streamline() -_htmlEntityMap = dict(zip("gt lt amp nbsp quot".split(),'><& "')) -replaceHTMLEntity = lambda t : t.entity in _htmlEntityMap and _htmlEntityMap[t.entity] or None - -# it's easy to get these comment structures wrong - they're very common, so may as well make them available -cStyleComment = Regex(r"/\*(?:[^*]*\*+)+?/").setName("C style comment") - -htmlComment = Regex(r"") -restOfLine = Regex(r".*").leaveWhitespace() -dblSlashComment = Regex(r"\/\/(\\\n|.)*").setName("// comment") -cppStyleComment = 
Regex(r"/(?:\*(?:[^*]*\*+)+?/|/[^\n]*(?:\n[^\n]*)*?(?:(?" + str(tokenlist)) - print ("tokens = " + str(tokens)) - print ("tokens.columns = " + str(tokens.columns)) - print ("tokens.tables = " + str(tokens.tables)) - print (tokens.asXML("SQL",True)) - except ParseBaseException as err: - print (teststring + "->") - print (err.line) - print (" "*(err.column-1) + "^") - print (err) - print() - - selectToken = CaselessLiteral( "select" ) - fromToken = CaselessLiteral( "from" ) - - ident = Word( alphas, alphanums + "_$" ) - columnName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens ) - columnNameList = Group( delimitedList( columnName ) )#.setName("columns") - tableName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens ) - tableNameList = Group( delimitedList( tableName ) )#.setName("tables") - simpleSQL = ( selectToken + \ - ( '*' | columnNameList ).setResultsName( "columns" ) + \ - fromToken + \ - tableNameList.setResultsName( "tables" ) ) - - test( "SELECT * from XYZZY, ABC" ) - test( "select * from SYS.XYZZY" ) - test( "Select A from Sys.dual" ) - test( "Select AA,BB,CC from Sys.dual" ) - test( "Select A, B, C from Sys.dual" ) - test( "Select A, B, C from Sys.dual" ) - test( "Xelect A, B, C from Sys.dual" ) - test( "Select A, B, C frox Sys.dual" ) - test( "Select" ) - test( "Select ^^^ frox Sys.dual" ) - test( "Select A, B, C from Sys.dual, Table2 " ) diff -Nru remnux-oletools-0.51a/oletools/thirdparty/pyparsing/README remnux-oletools-0.51a/oletools/thirdparty/pyparsing/README --- remnux-oletools-0.51a/oletools/thirdparty/pyparsing/README 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/pyparsing/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,72 +0,0 @@ -==================================== -PyParsing -- A Python Parsing Module -==================================== - -Introduction -============ - -The pyparsing module is an alternative approach to creating and executing -simple 
grammars, vs. the traditional lex/yacc approach, or the use of
-regular expressions. The pyparsing module provides a library of classes
-that client code uses to construct the grammar directly in Python code.
-
-Here is a program to parse "Hello, World!" (or any greeting of the form
-"<salutation>, <addressee>!"):
-
- from pyparsing import Word, alphas
- greet = Word( alphas ) + "," + Word( alphas ) + "!"
- hello = "Hello, World!"
- print hello, "->", greet.parseString( hello )
-
-The program outputs the following:
-
- Hello, World! -> ['Hello', ',', 'World', '!']
-
-The Python representation of the grammar is quite readable, owing to the
-self-explanatory class names, and the use of '+', '|' and '^' operator
-definitions.
-
-The parsed results returned from parseString() can be accessed as a
-nested list, a dictionary, or an object with named attributes.
-
-The pyparsing module handles some of the problems that are typically
-vexing when writing text parsers:
-- extra or missing whitespace (the above program will also handle
- "Hello,World!", "Hello , World !", etc.)
-- quoted strings
-- embedded comments
-
-The .zip file includes examples of a simple SQL parser, simple CORBA IDL
-parser, a config file parser, a chemical formula parser, and a four-
-function algebraic notation parser. It also includes a simple how-to
-document, and a UML class diagram of the library's classes.
-
-
-
-Installation
-============
-
-Do the usual:
-
- python setup.py install
-
-(pyparsing requires Python 2.3.2 or later.)
-
-
-Documentation
-=============
-
-See:
-
- HowToUsePyparsing.html
-
-
-License
-=======
-
- MIT License. See header of pyparsing.py
-
-History
-=======
-
- See CHANGES file.
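The greeting example in the removed README is written for Python 2 (`print` statement). As a quick sanity check of the same grammar on Python 3, here is a minimal sketch; it assumes the `pyparsing` package is installed (it is the library this diff removes from the bundled thirdparty folder, but it remains available from PyPI):

```python
# Sketch of the README's hello-world grammar, updated for Python 3.
# Assumes pyparsing is installed (pip install pyparsing).
from pyparsing import Word, alphas

# <word> "," <word> "!"  -- matches any greeting of that shape,
# with optional whitespace between tokens
greet = Word(alphas) + "," + Word(alphas) + "!"

hello = "Hello, World!"
tokens = greet.parseString(hello)
print(hello, "->", tokens.asList())
# Hello, World! -> ['Hello', ',', 'World', '!']
```

As the README notes, whitespace handling comes for free: `"Hello,World!"` and `"Hello , World !"` parse to the same token list.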
diff -Nru remnux-oletools-0.51a/oletools/thirdparty/tablestream/tablestream.py remnux-oletools-0.51a/oletools/thirdparty/tablestream/tablestream.py --- remnux-oletools-0.51a/oletools/thirdparty/tablestream/tablestream.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/tablestream/tablestream.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,396 +0,0 @@ -#!/usr/bin/env python -""" -tablestream - -tablestream can format table data for pretty printing as text, -to be displayed on the console or written to any file-like object. -The table data can be provided as rows, each row is an iterable of -cells. The text in each cell is wrapped to fit into a maximum width -set for each column. -Contrary to many table pretty printing libraries, TableStream writes -each row to the output as soon as it is provided, and the whole table -does not need to be built in memory before printing. -It is therefore suitable for large tables, or tables that take time to -be processed row by row. - -Author: Philippe Lagadec - http://www.decalage.info -License: BSD, see source code or documentation -""" - -#=== LICENSE ================================================================== - -# tablestream is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from __future__ import print_function - -#------------------------------------------------------------------------------ -# CHANGELOG: -# 2015-11-01 v0.01 PL: - first version -# 2016-01-01 v0.02 PL: - added styles, color support -# 2016-04-19 v0.03 PL: - enable colorclass on Windows, fixed issue #39 -# 2016-05-25 v0.04 PL: - updated for colorclass 2.2.0 (now a package) -# 2016-07-29 v0.05 PL: - fixed oletools issue #57, bug when importing colorclass -# 2016-07-31 v0.06 PL: - handle newline characters properly in each cell -# 2016-08-28 v0.07 PL: - support for both Python 2.6+ and 3.x -# - all cells are converted to unicode - -__version__ = '0.07' - -#------------------------------------------------------------------------------ -# TODO: -# - several styles -# - colorized rows or cells -# - automatic width for the last column, based on max total width -# - automatic width for selected columns, based on N first lines -# - determine the console width - -# === IMPORTS ================================================================= - -import textwrap -import sys, os - -# add the thirdparty subfolder to sys.path (absolute+normalized path): -_thismodule_dir = 
os.path.normpath(os.path.abspath(os.path.dirname(__file__))) -# print('_thismodule_dir = %r' % _thismodule_dir) -# assumption: this module is in a subfolder of thirdparty: -_thirdparty_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) -# print('_thirdparty_dir = %r' % _thirdparty_dir) -if not _thirdparty_dir in sys.path: - sys.path.insert(0, _thirdparty_dir) - -import colorclass - -# On Windows, colorclass needs to be enabled: -if os.name == 'nt': - colorclass.Windows.enable(auto_colors=True) - - -# === PYTHON 2+3 SUPPORT ====================================================== - -if sys.version_info[0] >= 3: - # Python 3 specific adaptations - # py3 range = py2 xrange - xrange = range - ustr = str - # byte strings for to_ustr (with py3, bytearray supports encoding): - byte_strings = (bytes, bytearray) -else: - # Python 2 specific adaptations - ustr = unicode - # byte strings for to_ustr (with py2, bytearray does not support encoding): - byte_strings = bytes - - -# === FUNCTIONS ============================================================== - -def to_ustr(obj, encoding='utf8', errors='replace'): - """ - convert an object to unicode, using the appropriate method - :param obj: any object, str, bytes or unicode - :return: unicode string (ustr) - """ - # if the object is already unicode, return it unchanged: - if isinstance(obj, ustr): - return obj - # if it is a bytes string, decode it using the provided encoding - elif isinstance(obj, byte_strings): - return ustr(obj, encoding=encoding, errors=errors) - # else just convert it to unicode: - # (an exception is raised if we specify encoding in this case) - else: - return ustr(obj) - - - -# === CLASSES ================================================================= - - -class TableStyle(object): - """ - Style for a TableStream. - This base class can be derived to create new styles. 
- Default style: - +------+---+ - |Header| + - +------+---+ - | | | - +------+---+ - """ - # Header rows: - header_top = True - header_top_left = u'+' - header_top_horiz = u'-' - header_top_middle = u'+' - header_top_right = u'+' - - header_vertical_left = u'|' - header_vertical_middle = u'|' - header_vertical_right = u'|' - - # Separator line between header and normal rows: - header_sep = True - header_sep_left = u'+' - header_sep_horiz = u'-' - header_sep_middle = u'+' - header_sep_right = u'+' - - # Top row if there is no header: - noheader_top = True - noheader_top_left = u'+' - noheader_top_horiz = u'-' - noheader_top_middle = u'+' - noheader_top_right = u'+' - - # Normal rows - vertical_left = u'|' - vertical_middle = u'|' - vertical_right = u'|' - - # Separator line between rows: - sep = False - sep_left = u'+' - sep_horiz = u'-' - sep_middle = u'+' - sep_right = u'+' - - # Bottom line - bottom = True - bottom_left = u'+' - bottom_horiz = u'-' - bottom_middle = u'+' - bottom_right = u'+' - - -class TableStyleSlim(object): - """ - Style for a TableStream. 
- Example: - ------+--- - Header| - ------+--- - | - ------+--- - """ - # Header rows: - header_top = True - header_top_left = u'' - header_top_horiz = u'-' - header_top_middle = u'+' - header_top_right = u'' - - header_vertical_left = u'' - header_vertical_middle = u'|' - header_vertical_right = u'' - - # Separator line between header and normal rows: - header_sep = True - header_sep_left = u'' - header_sep_horiz = u'-' - header_sep_middle = u'+' - header_sep_right = u'' - - # Top row if there is no header: - noheader_top = True - noheader_top_left = u'' - noheader_top_horiz = u'-' - noheader_top_middle = u'+' - noheader_top_right = u'' - - # Normal rows - vertical_left = u'' - vertical_middle = u'|' - vertical_right = u'' - - # Separator line between rows: - sep = False - sep_left = u'' - sep_horiz = u'-' - sep_middle = u'+' - sep_right = u'' - - # Bottom line - bottom = True - bottom_left = u'' - bottom_horiz = u'-' - bottom_middle = u'+' - bottom_right = u'' - - - -class TableStream(object): - """ - a TableStream object can format table data for pretty printing as text, - to be displayed on the console or written to any file-like object. - The table data can be provided as rows, each row is an iterable of - cells. The text in each cell is wrapped to fit into a maximum width - set for each column. - Contrary to many table pretty printing libraries, TableStream writes - each row to the output as soon as it is provided, and the whole table - does not need to be built in memory before printing. - It is therefore suitable for large tables, or tables that take time to - be processed row by row. 
- """ - - def __init__(self, column_width, header_row=None, style=TableStyle, - outfile=sys.stdout, encoding_in='utf8', encoding_out='utf8'): - ''' - Constructor for class TableStream - :param column_width: tuple or list containing the width of each column - :param header_row: tuple or list containing the header row text - :param style: style for the table, a TableStyle object - :param outfile: output file (sys.stdout by default to print on the console) - :param encoding_in: encoding used when the input text is bytes (UTF-8 by default) - :param encoding_out: encoding used for the output (UTF-8 by default) - ''' - self.column_width = column_width - self.num_columns = len(column_width) - self.header_row = header_row - self.encoding_in = encoding_in - self.encoding_out = encoding_out - assert (header_row is None) or len(header_row) == self.num_columns - self.style = style - self.outfile = outfile - if header_row is not None: - self.write_header() - elif self.style.noheader_top: - self.write_noheader_top() - - - def write(self, s): - """ - shortcut for self.outfile.write() - """ - self.outfile.write(s) - - def write_row(self, row, last=False, colors=None): - assert len(row) == self.num_columns - columns = [] - max_lines = 0 - for i in xrange(self.num_columns): - cell = row[i] - # Convert to string: - cell = to_ustr(cell, encoding=self.encoding_in) - # Wrap cell text according to the column width - # TODO: use a TextWrapper object for each column instead - # split the string if it contains newline characters, otherwise - # textwrap replaces them with spaces: - column = [] - for line in cell.splitlines(): - column.extend(textwrap.wrap(line, width=self.column_width[i])) - # apply colors to each line of the cell if needed: - if colors is not None and self.outfile.isatty(): - color = colors[i] - if color: - for j in xrange(len(column)): - # print '%r: %s' % (column[j], type(column[j])) - column[j] = colorclass.Color(u'{auto%s}%s{/%s}' % (color, column[j], color)) - 
columns.append(column) - # determine which column has the highest number of lines - max_lines = max(len(columns[i]), max_lines) - # transpose: write output line by line - for j in xrange(max_lines): - self.write(self.style.vertical_left) - for i in xrange(self.num_columns): - column = columns[i] - if j file_name is not a glob - --> file?name is a glob - --> file* is a glob - --> file[-._]name is a glob - --> file[?]name is not a glob (matches literal "file?name") - --> file[*]name is not a glob (matches literal "file*name") - --> file[-]name is not a glob (matches literal "file-name") - --> file-name is not a glob - - Also, obviously incorrect globs are treated as non-globs - --> file[name is not a glob (matches literal "file[name") - --> file]-[name is treated as a glob - (it is not a valid glob but detecting errors like this requires - sophisticated regular expression matching) - - Python's glob also works with globs in directory-part of path - --> dir-part of path is analyzed just like filename-part - --> thirdparty/*/xglob.py is a (valid) glob - - TODO: create a correct regexp to test for validity of ranges - """ - - # remove escaped special chars - cleaned = filespec.replace('[*]', '').replace('[?]', '') \ - .replace('[[]', '').replace('[]]', '').replace('[-]', '') - - # check if special chars remain - return '*' in cleaned or '?' in cleaned or \ - ('[' in cleaned and ']' in cleaned) diff -Nru remnux-oletools-0.51a/oletools/thirdparty/xxxswf/LICENSE.txt remnux-oletools-0.51a/oletools/thirdparty/xxxswf/LICENSE.txt --- remnux-oletools-0.51a/oletools/thirdparty/xxxswf/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/xxxswf/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ -xxxswf.py is published by Alexander Hanel on -http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html -without explicit license. 
\ No newline at end of file diff -Nru remnux-oletools-0.51a/oletools/thirdparty/xxxswf/xxxswf.py remnux-oletools-0.51a/oletools/thirdparty/xxxswf/xxxswf.py --- remnux-oletools-0.51a/oletools/thirdparty/xxxswf/xxxswf.py 2016-11-04 21:28:21.000000000 +0000 +++ remnux-oletools-0.51a/oletools/thirdparty/xxxswf/xxxswf.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,373 +0,0 @@ -# xxxswf.py was created by alexander dot hanel at gmail dot com -# version 0.1 -# Date - 12-07-2011 -# To do list -# - Tag Parser -# - ActionScript Decompiler - -# 2016-11-01 PL: - A few changes for Python 2+3 compatibility - -import fnmatch -import hashlib -import imp -import math -import os -import re -import struct -import sys -import time -from io import BytesIO -from optparse import OptionParser -import zlib - -def checkMD5(md5): -# checks if MD5 has been seen in MD5 Dictionary -# MD5Dict contains the MD5 and the CVE -# For { 'MD5':'CVE', 'MD5-1':'CVE-1', 'MD5-2':'CVE-2'} - MD5Dict = {'c46299a5015c6d31ad5766cb49e4ab4b':'CVE-XXXX-XXXX'} - if MD5Dict.get(md5): - print('\t[BAD] MD5 Match on', MD5Dict.get(md5)) - return - -def bad(f): - for idx, x in enumerate(findSWF(f)): - tmp = verifySWF(f,x) - if tmp != None: - yaraScan(tmp) - checkMD5(hashBuff(tmp)) - return - -def yaraScan(d): -# d = buffer of the read file -# Scans SWF using Yara - # test if yara module is installed - # if not Yara can be downloaded from http://code.google.com/p/yara-project/ - try: - imp.find_module('yara') - import yara - except ImportError: - print('\t[ERROR] Yara module not installed - aborting scan') - return - # test for yara compile errors - try: - r = yara.compile(r'rules.yar') - except: - pass - print('\t[ERROR] Yara compile error - aborting scan') - return - # get matches - m = r.match(data=d) - # print matches - for X in m: - print('\t[BAD] Yara Signature Hit: %s' % X) - return - -def findSWF(d): -# d = buffer of the read file -# Search for SWF Header Sigs in files - return [tmp.start() for tmp in 
re.finditer(b'CWS|FWS', d.read())] - -def hashBuff(d): -# d = buffer of the read file -# This function hashes the buffer -# source: http://stackoverflow.com/q/5853830 - if type(d) is str: - d = BytesIO(d) - md5 = hashlib.md5() - while True: - data = d.read(128) - if not data: - break - md5.update(data) - return md5.hexdigest() - -def verifySWF(f,addr): - # Start of SWF - f.seek(addr) - # Read Header - header = f.read(3) - # Read Version - ver = struct.unpack(' 20: - print(' - [ERROR] Invalid SWF Version') - return None - - if b'CWS' in header: - try: - f.read(3) - tmp = b'FWS' + f.read(5) + zlib.decompress(f.read()) - print(' - CWS Header') - return tmp - - except: - pass - print('- [ERROR]: Zlib decompression error. Invalid CWS SWF') - return None - - elif b'FWS' in header: - try: - tmp = f.read(size) - print(' - FWS Header') - return tmp - - except: - pass - print(' - [ERROR] Invalid SWF Size') - return None - - else: - print(' - [Error] Logic Error Blame Programmer') - return None - -def headerInfo(f): -# f is the already opended file handle -# Yes, the format is is a rip off SWFDump. Can you blame me? Their tool is awesome. - # SWFDump FORMAT - # [HEADER] File version: 8 - # [HEADER] File is zlib compressed. Ratio: 52% - # [HEADER] File size: 37536 - # [HEADER] Frame rate: 18.000000 - # [HEADER] Frame count: 323 - # [HEADER] Movie width: 217.00 - # [HEADER] Movie height: 85.00 - if type(f) is str: - f = BytesIO(f) - sig = f.read(3) - print('\t[HEADER] File header: %s' % sig) - if b'C' in sig: - print('\t[HEADER] File is zlib compressed.') - version = struct.unpack('> 3 - print('\t[HEADER] Rect Nbit: %d' % nbit) - # Curretely the nbit is static at 15. This could be modified in the - # future. If larger than 9 this will break the struct unpack. Will have - # to revist must be a more effective way to deal with bits. Tried to keep - # the algo but damn this is ugly... 
- f.seek(ta) - rect = struct.unpack('>Q', f.read(int(math.ceil((nbit*4)/8.0))))[0] - tmp = struct.unpack('>7)[2:].zfill(1) - # bin requires Python 2.6 or higher - # skips string '0b' and the nbit - rect = bin(rect)[7:] - xmin = int(rect[0:nbit-1],2) - print('\t[HEADER] Rect Xmin: %d' % xmin) - xmax = int(rect[nbit:(nbit*2)-1],2) - print('\t[HEADER] Rect Xmax: %d' % xmax) - ymin = int(rect[nbit*2:(nbit*3)-1],2) - print('\t[HEADER] Rect Ymin: %d' % ymin) - # one bit needs to be added, my math might be off here - ymax = int(rect[nbit*3:(nbit*4)-1] + str(tmp) ,2) - print('\t[HEADER] Rect Ymax: %d' % ymax) - framerate = struct.unpack(' + + + + + + + + + +

How to Suggest Improvements, Report Issues or Contribute

+

This is a personal open-source project, developed in my spare time. Any contribution, suggestion, feedback or bug report is welcome.

+

To suggest improvements, report a bug or any issue, please use the issue reporting page, and provide all the information and files to reproduce the problem.

+

You may also contact the author directly to send feedback.

+

The code is available in a repository on GitHub. You may use it to submit enhancements using forks and pull requests.

+
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/Contribute.md remnux-oletools-0.51a/remnux-oletools/doc/Contribute.md --- remnux-oletools-0.51a/remnux-oletools/doc/Contribute.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/Contribute.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,37 @@ +How to Suggest Improvements, Report Issues or Contribute +======================================================== + +This is a personal open-source project, developed on my spare time. +Any contribution, suggestion, feedback or bug report is welcome. + +To **suggest improvements, report a bug or any issue**, +please use the [issue reporting page](https://github.com/decalage2/oletools/issues), +and provide all the information and files to reproduce the problem. + +You may also [contact the author](http://decalage.info/contact) directly +to **send feedback**. + +The code is available in [a repository on GitHub](https://github.com/decalage2/oletools). +You may use it to **submit enhancements** using forks and pull requests. + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/Home.html remnux-oletools-0.51a/remnux-oletools/doc/Home.html --- remnux-oletools-0.51a/remnux-oletools/doc/Home.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/Home.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,54 @@ + + + + + + + + + + +

python-oletools v0.50 documentation

+

This is the home page of the documentation for python-oletools. The latest version can be found online, otherwise a copy is provided in the doc subfolder of the package.

+

python-oletools is a package of python tools to analyze Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office documents or Outlook messages, mainly for malware analysis, forensics and debugging. It is based on the olefile parser. See http://www.decalage.info/python/oletools for more info.

+

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the Author - Repository - Updates on Twitter

+

Note: python-oletools is not related to OLETools published by BeCubed Software.

+

Tools in python-oletools:

+
    +
  • olebrowse: A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to view and extract individual data streams.
  • +
  • oleid: to analyze OLE files to detect specific characteristics usually found in malicious files.
  • +
  • olemeta: to extract all standard properties (metadata) from OLE files.
  • +
  • oletimes: to extract creation and modification timestamps of all streams and storages.
  • +
  • oledir: to display all the directory entries of an OLE file, including free and orphaned entries.
  • +
  • olemap: to display a map of all the sectors in an OLE file.
  • +
  • olevba: to extract and analyze VBA Macro source code from MS Office documents (OLE and OpenXML).
  • +
  • mraptor: to detect malicious VBA Macros.
  • +
  • pyxswf: to detect, extract and analyze Flash objects (SWF) that may be embedded in files such as MS Office documents (e.g. Word, Excel) and RTF, which is especially useful for malware analysis.
  • +
  • oleobj: to extract embedded objects from OLE files.
  • +
  • rtfobj: to extract embedded objects from RTF files.
  • +
  • and a few others (coming soon)
  • +
+
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/Home.md remnux-oletools-0.51a/remnux-oletools/doc/Home.md --- remnux-oletools-0.51a/remnux-oletools/doc/Home.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/Home.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,64 @@ +python-oletools v0.50 documentation +=================================== + +This is the home page of the documentation for python-oletools. The latest version can be found +[online](https://github.com/decalage2/oletools/wiki), otherwise a copy is provided in the doc subfolder of the package. + +[python-oletools](http://www.decalage.info/python/oletools) is a package of python tools to analyze +[Microsoft OLE2 files](http://en.wikipedia.org/wiki/Compound_File_Binary_Format) +(also called Structured Storage, Compound File Binary Format or Compound Document File Format), +such as Microsoft Office documents or Outlook messages, mainly for malware analysis, forensics and debugging. +It is based on the [olefile](http://www.decalage.info/olefile) parser. +See [http://www.decalage.info/python/oletools](http://www.decalage.info/python/oletools) for more info. + +**Quick links:** +[Home page](http://www.decalage.info/python/oletools) - +[Download/Install](https://github.com/decalage2/oletools/wiki/Install) - +[Documentation](https://github.com/decalage2/oletools/wiki) - +[Report Issues/Suggestions/Questions](https://github.com/decalage2/oletools/issues) - +[Contact the Author](http://decalage.info/contact) - +[Repository](https://github.com/decalage2/oletools) - +[Updates on Twitter](https://twitter.com/decalage2) + +Note: python-oletools is not related to OLETools published by BeCubed Software. + +Tools in python-oletools: +------------------------- + +- **[[olebrowse]]**: A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to + view and extract individual data streams. 
+- **[[oleid]]**: to analyze OLE files to detect specific characteristics usually found in malicious files. +- **[[olemeta]]**: to extract all standard properties (metadata) from OLE files. +- **[[oletimes]]**: to extract creation and modification timestamps of all streams and storages. +- **[[oledir]]**: to display all the directory entries of an OLE file, including free and orphaned entries. +- **[[olemap]]**: to display a map of all the sectors in an OLE file. +- **[[olevba]]**: to extract and analyze VBA Macro source code from MS Office documents (OLE and OpenXML). +- **[[mraptor]]**: to detect malicious VBA Macros +- **[[pyxswf]]**: to detect, extract and analyze Flash objects (SWF) that may + be embedded in files such as MS Office documents (e.g. Word, Excel) and RTF, + which is especially useful for malware analysis. +- **[[oleobj]]**: to extract embedded objects from OLE files. +- **[[rtfobj]]**: to extract embedded objects from RTF files. +- and a few others (coming soon) + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/Install.html remnux-oletools-0.51a/remnux-oletools/doc/Install.html --- remnux-oletools-0.51a/remnux-oletools/doc/Install.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/Install.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,69 @@ + + + + + + + + + + +

How to Download and Install python-oletools

+

Pre-requisites

+

The recommended Python version to run oletools is Python 2.7. Python 2.6 is also supported, but as it is not tested as often as 2.7, some features might not work as expected.

+

Since oletools v0.50, thanks to contributions by [@Sebdraven](https://twitter.com/Sebdraven), most tools can also run with Python 3.x. As this is quite new, please report any issue you may encounter.

+ +

Pip is included with Python since version 2.7.9 and 3.4. If it is not installed on your system, either upgrade Python or see https://pip.pypa.io/en/stable/installing/

+

Linux, Mac OSX, Unix

+

To download and install/update the latest release version of oletools, run the following command in a shell:

+
sudo -H pip install -U oletools
+

Important: Since version 0.50, pip will automatically create convenient command-line scripts in /usr/local/bin to run all the oletools from any directory.

+

Windows

+

To download and install/update the latest release version of oletools, run the following command in a cmd window:

+
pip install -U oletools
+

Important: Since version 0.50, pip will automatically create convenient command-line scripts to run all the oletools from any directory: olevba, mraptor, oleid, rtfobj, etc.

+

How to install the latest development version

+

If you want to benefit from the latest improvements in the development version, you may also use pip:

+

Linux, Mac OSX, Unix

+
sudo -H pip install -U https://github.com/decalage2/oletools/archive/master.zip
+

Windows

+
pip install -U https://github.com/decalage2/oletools/archive/master.zip
+

How to install offline - Computer without Internet access

+

First, download the oletools archive on a computer with Internet access:
* Latest stable version: from https://github.com/decalage2/oletools/releases
* Development version: https://github.com/decalage2/oletools/archive/master.zip

+

Copy the archive file to the target computer.

+

On Linux, Mac OSX, Unix, run the following command using the filename of the archive that you downloaded:

+
sudo -H pip install -U oletools.zip
+

On Windows:

+
pip install -U oletools.zip
+

Old school install using setup.py

+

If you cannot use pip, it is still possible to run the setup.py script directly. However, this method will not create the command-line scripts automatically.

+

First, download the oletools archive:
* Latest stable version: from https://github.com/decalage2/oletools/releases
* Development version: https://github.com/decalage2/oletools/archive/master.zip

+

Then extract the archive, open a shell and go to the oletools directory.

+

Linux, Mac OSX, Unix

+
sudo -H python setup.py install
+

Windows:

+
python setup.py install
+
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/Install.md remnux-oletools-0.51a/remnux-oletools/doc/Install.md --- remnux-oletools-0.51a/remnux-oletools/doc/Install.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/Install.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,135 @@ +How to Download and Install python-oletools +=========================================== + +Pre-requisites +-------------- + +The recommended Python version to run oletools is **Python 2.7**. +Python 2.6 is also supported, but as it is not tested as often as 2.7, some features +might not work as expected. + +Since oletools v0.50, thanks to contributions by [@Sebdraven](https://twitter.com/Sebdraven), +most tools can also run with **Python 3.x**. As this is quite new, please +[report any issue]((https://github.com/decalage2/oletools/issues)) you may encounter. + + + +Recommended way to Download+Install/Update oletools: pip +-------------------------------------------------------- + +Pip is included with Python since version 2.7.9 and 3.4. If it is not installed on your +system, either upgrade Python or see https://pip.pypa.io/en/stable/installing/ + +### Linux, Mac OSX, Unix + +To download and install/update the latest release version of oletools, +run the following command in a shell: + +```text +sudo -H pip install -U oletools +``` + +**Important**: Since version 0.50, pip will automatically create convenient command-line scripts +in /usr/local/bin to run all the oletools from any directory. + +### Windows + +To download and install/update the latest release version of oletools, +run the following command in a cmd window: + +```text +pip install -U oletools +``` + +**Important**: Since version 0.50, pip will automatically create convenient command-line scripts +to run all the oletools from any directory: olevba, mraptor, oleid, rtfobj, etc. 
+ + +How to install the latest development version +--------------------------------------------- + +If you want to benefit from the latest improvements in the development version, +you may also use pip: + +### Linux, Mac OSX, Unix + +```text +sudo -H pip install -U https://github.com/decalage2/oletools/archive/master.zip +``` + +### Windows + +```text +pip install -U https://github.com/decalage2/oletools/archive/master.zip +``` + +How to install offline - Computer without Internet access +--------------------------------------------------------- + +First, download the oletools archive on a computer with Internet access: +* Latest stable version: from https://github.com/decalage2/oletools/releases +* Development version: https://github.com/decalage2/oletools/archive/master.zip + +Copy the archive file to the target computer. + +On Linux, Mac OSX, Unix, run the following command using the filename of the +archive that you downloaded: + +```text +sudo -H pip install -U oletools.zip +``` + +On Windows: + +```text +pip install -U oletools.zip +``` + + +Old school install using setup.py +--------------------------------- + +If you cannot use pip, it is still possible to run the setup.py script +directly. However, this method will not create the command-line scripts +automatically. + +First, download the oletools archive: +* Latest stable version: from https://github.com/decalage2/oletools/releases +* Development version: https://github.com/decalage2/oletools/archive/master.zip + +Then extract the archive, open a shell and go to the oletools directory. 
+ +### Linux, Mac OSX, Unix + +```text +sudo -H python setup.py install +``` + +### Windows: + +```text +python setup.py install +``` + + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/License.html remnux-oletools-0.51a/remnux-oletools/doc/License.html --- remnux-oletools-0.51a/remnux-oletools/doc/License.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/License.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,56 @@ + + + + + + + + + + +

License for python-oletools

+

This license applies to the python-oletools package, apart from the thirdparty folder which contains third-party files published with their own license.

+

The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec (http://www.decalage.info)

+

All rights reserved.

+

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

+
    +
  • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  • +
  • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  • +
+

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ + + + + + +
License for officeparser
+

olevba contains modified source code from the officeparser project, published under the following MIT License (MIT):

+

officeparser is copyright (c) 2014 John William Davison

+

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

+

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

+

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/License.md remnux-oletools-0.51a/remnux-oletools/doc/License.md --- remnux-oletools-0.51a/remnux-oletools/doc/License.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/License.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,79 @@ +License for python-oletools +=========================== + +This license applies to the [python-oletools](http://www.decalage.info/python/oletools) package, apart from the +thirdparty folder which contains third-party files published with their own license. + +The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec ([http://www.decalage.info](http://www.decalage.info)) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +---------- +License for officeparser +------------------------ + +olevba contains modified source code from the [officeparser](https://github.com/unixfreak0037/officeparser) project, published +under the following MIT License (MIT): + +officeparser is copyright (c) 2014 John William Davison + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/mraptor1.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/mraptor1.png differ diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/mraptor.html remnux-oletools-0.51a/remnux-oletools/doc/mraptor.html --- remnux-oletools-0.51a/remnux-oletools/doc/mraptor.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/mraptor.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,81 @@ + + + + + + + + + + +

mraptor (MacroRaptor)

+

mraptor is a tool designed to detect most malicious VBA Macros using generic heuristics. Unlike antivirus engines, it does not rely on signatures.

+

In a nutshell, mraptor detects keywords corresponding to the three following types of behaviour that are present in clear text in almost any macro malware:
  • A: Auto-execution trigger
  • W: Write to the file system or memory
  • X: Execute a file or any payload outside the VBA context

+

mraptor considers that a macro is suspicious when A and (W or X) is true.

+

For more information about mraptor's detection algorithm, see the article How to detect most malicious macros without an antivirus.

+

mraptor can be used either as a command-line tool, or as a python module from your own applications.

+

It is part of the python-oletools package.

+

Usage

+
Usage: mraptor.py [options] <filename> [filename2 ...]
+
+Options:
+  -h, --help            show this help message and exit
+  -r                    find files recursively in subdirectories.
+  -z ZIP_PASSWORD, --zip=ZIP_PASSWORD
+                        if the file is a zip archive, open all files from it,
+                        using the provided password (requires Python 2.6+)
+  -f ZIP_FNAME, --zipfname=ZIP_FNAME
+                        if the file is a zip archive, file(s) to be opened
+                        within the zip. Wildcards * and ? are supported.
+                        (default:*)
+  -l LOGLEVEL, --loglevel=LOGLEVEL
+                        logging level debug/info/warning/error/critical
+                        (default=warning)
+  -m, --matches         Show matched strings.
+
+An exit code is returned based on the analysis result:
+ - 0: No Macro
+ - 1: Not MS Office
+ - 2: Macro OK
+ - 10: ERROR
+ - 20: SUSPICIOUS
+

Examples

+

Scan a single file:

+
mraptor.py file.doc
+

Scan a single file, stored in a Zip archive with password "infected":

+
mraptor.py malicious_file.xls.zip -z infected
+

Scan a collection of files stored in a folder:

+
mraptor.py "MalwareZoo/VBA/*"
+

Important: on Linux/MacOSX, always add double quotes around a file name when you use wildcards such as * and ?. Otherwise, the shell may replace the argument with the actual list of files matching the wildcards before starting the script.

+
+ +
+

Python 3 support - mraptor3

+

As of v0.50, mraptor has been ported to Python 3 thanks to @sebdraven. However, the differences between Python 2 and 3 are significant and for now there is a separate version of mraptor named mraptor3 to be used with Python 3.

+
+

How to use mraptor in Python applications

+

TODO

+
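In the meantime, the A and (W or X) rule described above can be sketched in a few lines. The keyword patterns below are a simplified illustration, not mraptor's real lists (the actual, much longer patterns live in mraptor.py):

```python
import re

# Illustrative keyword patterns for each behaviour type (simplified;
# mraptor's real lists are longer and more precise):
AUTOEXEC = re.compile(r'\b(AutoOpen|AutoExec|AutoClose|Document_Open|Workbook_Open)\b', re.I)  # A
WRITE = re.compile(r'\b(FileCopy|CreateTextFile|SaveToFile|Output)\b', re.I)                   # W
EXECUTE = re.compile(r'\b(Shell|CreateObject|ShellExecute)\b', re.I)                           # X

def is_suspicious(vba_code):
    """Apply mraptor's rule: suspicious when A and (W or X)."""
    a = bool(AUTOEXEC.search(vba_code))
    w = bool(WRITE.search(vba_code))
    x = bool(EXECUTE.search(vba_code))
    return a and (w or x)
```

With these patterns, a macro containing both `AutoOpen` and `Shell` is flagged, while `Shell` alone (no auto-execution trigger) is not.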
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/mraptor.md remnux-oletools-0.51a/remnux-oletools/doc/mraptor.md --- remnux-oletools-0.51a/remnux-oletools/doc/mraptor.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/mraptor.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,113 @@ +mraptor (MacroRaptor) +===================== + +mraptor is a tool designed to detect most malicious VBA Macros using +generic heuristics. Unlike antivirus engines, it does not rely on signatures. + +In a nutshell, mraptor detects keywords corresponding to the three +following types of behaviour that are present in clear text in almost +any macro malware: +- A: Auto-execution trigger +- W: Write to the file system or memory +- X: Execute a file or any payload outside the VBA context + +mraptor considers that a macro is suspicious when A and (W or X) is true. + +For more information about mraptor's detection algorithm, see the article +[How to detect most malicious macros without an antivirus](http://www.decalage.info/mraptor). + +mraptor can be used either as a command-line tool, or as a python module +from your own applications. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Usage + +```text +Usage: mraptor.py [options] [filename2 ...] + +Options: + -h, --help show this help message and exit + -r find files recursively in subdirectories. + -z ZIP_PASSWORD, --zip=ZIP_PASSWORD + if the file is a zip archive, open all files from it, + using the provided password (requires Python 2.6+) + -f ZIP_FNAME, --zipfname=ZIP_FNAME + if the file is a zip archive, file(s) to be opened + within the zip. Wildcards * and ? are supported. + (default:*) + -l LOGLEVEL, --loglevel=LOGLEVEL + logging level debug/info/warning/error/critical + (default=warning) + -m, --matches Show matched strings. 
+ +An exit code is returned based on the analysis result: + - 0: No Macro + - 1: Not MS Office + - 2: Macro OK + - 10: ERROR + - 20: SUSPICIOUS +``` + +### Examples + +Scan a single file: + +```text +mraptor.py file.doc +``` + +Scan a single file, stored in a Zip archive with password "infected": + +```text +mraptor.py malicious_file.xls.zip -z infected +``` + +Scan a collection of files stored in a folder: + +```text +mraptor.py "MalwareZoo/VBA/*" +``` + +**Important**: on Linux/MacOSX, always add double quotes around a file name when you use +wildcards such as `*` and `?`. Otherwise, the shell may replace the argument with the actual +list of files matching the wildcards before starting the script. + +![](mraptor1.png) + +## Python 3 support - mraptor3 + +As of v0.50, mraptor has been ported to Python 3 thanks to @sebdraven. +However, the differences between Python 2 and 3 are significant and for now +there is a separate version of mraptor named mraptor3 to be used with +Python 3. + + +-------------------------------------------------------------------------- + +## How to use mraptor in Python applications + +TODO + + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/olebrowse1_menu.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/olebrowse1_menu.png differ Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/olebrowse2_stream.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/olebrowse2_stream.png differ Binary files 
/tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/olebrowse3_hexview.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/olebrowse3_hexview.png differ diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.html remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.html --- remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,59 @@ + + + + + + + + + + +

olebrowse

+

olebrowse is a simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to view and extract individual data streams.

+

It is part of the python-oletools package.

+

Dependencies

+

olebrowse requires Tkinter. On Windows and MacOSX, it should be installed with Python, and olebrowse should work out of the box.

+

However, on Linux it might be necessary to install the tkinter package for Python separately. For example, on Ubuntu this is done with the following command:

+
sudo apt-get install python-tk
+

And for Python 3:

+
sudo apt-get install python3-tk
+

Usage

+
olebrowse.py [file]
+

If you provide a file, it will be opened; otherwise a dialog will let you browse folders to open one. If it is a valid OLE file, the list of data streams is displayed. You can then select a stream and either view its content in a built-in hexadecimal viewer, or save it to a file for further analysis.

+

Screenshots

+

Main menu, showing all streams in the OLE file:

+
+ +
+

Menu with actions for a stream:

+
+ +
+

Hex view for a stream:

+
+ +
+
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.md remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.md --- remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olebrowse.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,76 @@ +olebrowse +========= + +olebrowse is a simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to +view and extract individual data streams. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +Dependencies +------------ + +olebrowse requires [Tkinter](https://en.wikipedia.org/wiki/Tkinter). +On Windows and MacOSX, it should be installed with Python, and +olebrowse should work out of the box. + +However, on Linux it might be necessary to install the tkinter +package for Python separately. For example, on Ubuntu this is done with the +following command: + +``` +sudo apt-get install python-tk +``` + +And for Python 3: + +``` +sudo apt-get install python3-tk +``` + + +Usage +----- + + olebrowse.py [file] + +If you provide a file it will be opened, else a dialog will allow you to browse +folders to open a file. Then if it is a valid OLE file, the list of data streams +will be displayed. You can select a stream, and then either view its content +in a builtin hexadecimal viewer, or save it to a file for further analysis. 
+ +Screenshots +----------- + +Main menu, showing all streams in the OLE file: + +![](olebrowse1_menu.png) + +Menu with actions for a stream: + +![](olebrowse2_stream.png) + +Hex view for a stream: + +![](olebrowse3_hexview.png) + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oledir.html remnux-oletools-0.51a/remnux-oletools/doc/oledir.html --- remnux-oletools-0.51a/remnux-oletools/doc/oledir.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oledir.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,49 @@ + + + + + + + + + + +

oledir

+

oledir is a script to display all the directory entries of an OLE file, including free and orphaned entries.

+

It can be used either as a command-line tool, or as a python module from your own applications.

+

It is part of the python-oletools package.

+

Usage

+
Usage: oledir.py <filename>
+

Examples

+

Scan a single file:

+
oledir.py file.doc
+
+ +
+
+

How to use oledir in Python applications

+

TODO

+
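Until the API is documented, here is a self-contained sketch of the data oledir works with: each OLE directory entry is a 128-byte record whose name, name length and object type sit at fixed offsets (per the [MS-CFB] format). The helper below is an illustration, not oledir's actual code:

```python
import struct

# Object types from the [MS-CFB] directory entry format:
ENTRY_TYPES = {0: 'Empty', 1: 'Storage', 2: 'Stream', 5: 'Root'}

def parse_dir_entry(entry):
    """Decode name and type from one 128-byte OLE directory entry.

    Layout: bytes 0-63 = UTF-16LE name, 64-65 = name length in bytes
    (including the null terminator), 66 = object type.
    """
    name_len, = struct.unpack_from('<H', entry, 64)
    name = entry[:max(name_len - 2, 0)].decode('utf-16-le')
    entry_type = ENTRY_TYPES.get(entry[66], 'Unknown')
    return name, entry_type
```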
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oledir.md remnux-oletools-0.51a/remnux-oletools/doc/oledir.md --- remnux-oletools-0.51a/remnux-oletools/doc/oledir.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oledir.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,54 @@ +oledir +====== + +oledir is a script to display all the directory entries of an OLE file, +including free and orphaned entries. + +It can be used either as a command-line tool, or as a python module from your own applications. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Usage + +```text +Usage: oledir.py +``` + +### Examples + +Scan a single file: + +```text +oledir.py file.doc +``` + +![](oledir.png) + + +-------------------------------------------------------------------------- + +## How to use oledir in Python applications + +TODO + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/oledir.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/oledir.png differ diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oleid.html remnux-oletools-0.51a/remnux-oletools/doc/oleid.html --- remnux-oletools-0.51a/remnux-oletools/doc/oleid.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oleid.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,122 @@ + + + + + + + + + + + +

oleid

+

oleid is a script to analyze OLE files such as MS Office documents (e.g. Word, Excel), to detect specific characteristics usually found in malicious files (e.g. malware). For example it can detect VBA macros and embedded Flash objects.

+

It is part of the python-oletools package.

+

Main Features

+
  • Detect OLE file type from its internal structure (e.g. MS Word, Excel, PowerPoint, ...)
  • Detect VBA Macros
  • Detect embedded Flash objects
  • Detect embedded OLE objects
  • Detect MS Office encryption
  • Can be used as a command-line tool
  • Python API to integrate it in your applications

Planned improvements:

+
  • Extract the most important metadata fields
  • Support for OpenXML files and embedded OLE files
  • Generic VBA macros detection
  • Detect auto-executable VBA macros
  • Extended OLE file types detection
  • Detect unusual OLE structures (fragmentation, unused sectors, etc)
  • Options to scan multiple files
  • Options to scan files from encrypted zip archives
  • CSV output

Usage

+
oleid.py <file>
+

Example

+

Analyzing a Word document containing a Flash object and VBA macros:

+
C:\oletools>oleid.py word_flash_vba.doc
+
+Filename: word_flash_vba.doc
++-------------------------------+-----------------------+
+| Indicator                     | Value                 |
++-------------------------------+-----------------------+
+| OLE format                    | True                  |
+| Has SummaryInformation stream | True                  |
+| Application name              | Microsoft Office Word |
+| Encrypted                     | False                 |
+| Word Document                 | True                  |
+| VBA Macros                    | True                  |
+| Excel Workbook                | False                 |
+| PowerPoint Presentation       | False                 |
+| Visio Drawing                 | False                 |
+| ObjectPool                    | True                  |
+| Flash objects                 | 1                     |
++-------------------------------+-----------------------+
+

How to use oleid in your Python applications

+

First, import oletools.oleid, and create an OleID object to scan a file:

+
import oletools.oleid
+
+oid = oletools.oleid.OleID(filename)
+

Note: filename can be a filename, a file-like object, or a bytes string containing the file to be analyzed.

+

Second, call the check() method. It returns a list of Indicator objects.

+

Each Indicator object has the following attributes:

+
  • id: str, identifier for the indicator
  • name: str, name to display the indicator
  • description: str, long description of the indicator
  • type: class of the indicator (e.g. bool, str, int)
  • value: value of the indicator

For example, the following code displays all the indicators:

+
indicators = oid.check()
+for i in indicators:
+    print 'Indicator id=%s name="%s" type=%s value=%s' % (i.id, i.name, i.type, repr(i.value))
+    print 'description:', i.description
+    print ''
+

See the source code of oleid.py for more details.

+
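As a complement, the "OLE format" indicator boils down to checking the standard OLE2/Compound File signature in the first 8 bytes of the file. A minimal sketch (an illustration, not oleid's actual implementation):

```python
# Standard OLE2/Compound File magic (first 8 bytes of any OLE file):
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def looks_like_ole(data):
    """Return True if the byte string starts with the OLE2 signature."""
    return data[:8] == OLE_MAGIC
```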
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oleid.md remnux-oletools-0.51a/remnux-oletools/doc/oleid.md --- remnux-oletools-0.51a/remnux-oletools/doc/oleid.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oleid.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,117 @@ +oleid +===== + +oleid is a script to analyze OLE files such as MS Office documents (e.g. Word, +Excel), to detect specific characteristics usually found in malicious files (e.g. malware). +For example it can detect VBA macros and embedded Flash objects. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Main Features + +- Detect OLE file type from its internal structure (e.g. MS Word, Excel, PowerPoint, ...) +- Detect VBA Macros +- Detect embedded Flash objects +- Detect embedded OLE objects +- Detect MS Office encryption +- Can be used as a command-line tool +- Python API to integrate it in your applications + +Planned improvements: + +- Extract the most important metadata fields +- Support for OpenXML files and embedded OLE files +- Generic VBA macros detection +- Detect auto-executable VBA macros +- Extended OLE file types detection +- Detect unusual OLE structures (fragmentation, unused sectors, etc) +- Options to scan multiple files +- Options to scan files from encrypted zip archives +- CSV output + +## Usage + +```text +oleid.py +``` + +### Example + +Analyzing a Word document containing a Flash object and VBA macros: + +```text +C:\oletools>oleid.py word_flash_vba.doc + +Filename: word_flash_vba.doc ++-------------------------------+-----------------------+ +| Indicator | Value | ++-------------------------------+-----------------------+ +| OLE format | True | +| Has SummaryInformation stream | True | +| Application name | Microsoft Office Word | +| Encrypted | False | +| Word Document | True | +| VBA Macros | True | +| Excel Workbook | False | +| PowerPoint Presentation | False | +| Visio Drawing | 
False | +| ObjectPool | True | +| Flash objects | 1 | ++-------------------------------+-----------------------+ +``` + +## How to use oleid in your Python applications + +First, import oletools.oleid, and create an **OleID** object to scan a file: + +```python +import oletools.oleid + +oid = oletools.oleid.OleID(filename) +``` + +Note: filename can be a filename, a file-like object, or a bytes string containing the file to be analyzed. + +Second, call the **check()** method. It returns a list of **Indicator** objects. + +Each Indicator object has the following attributes: + +- **id**: str, identifier for the indicator +- **name**: str, name to display the indicator +- **description**: str, long description of the indicator +- **type**: class of the indicator (e.g. bool, str, int) +- **value**: value of the indicator + +For example, the following code displays all the indicators: + +```python +indicators = oid.check() +for i in indicators: + print 'Indicator id=%s name="%s" type=%s value=%s' % (i.id, i.name, i.type, repr(i.value)) + print 'description:', i.description + print '' +``` + +See the source code of oleid.py for more details. 
+ +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/olemap1.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/olemap1.png differ Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/olemap2.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/olemap2.png differ diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olemap.html remnux-oletools-0.51a/remnux-oletools/doc/olemap.html --- remnux-oletools-0.51a/remnux-oletools/doc/olemap.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olemap.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,52 @@ + + + + + + + + + + +

olemap

+

olemap is a script to display a map of all the sectors in an OLE file.

+

It can be used either as a command-line tool, or as a python module from your own applications.

+

It is part of the python-oletools package.

+

Usage

+
Usage: olemap.py <filename>
+

Examples

+

Scan a single file:

+
olemap.py file.doc
+
+ +
+
+ +
+
+

How to use olemap in Python applications

+

TODO

+
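Meanwhile, the key value olemap derives from the OLE header before mapping sectors is the sector size, stored as a power-of-two "sector shift" at offset 30 of the 512-byte header. A hedged sketch (illustrative, not olemap's actual code):

```python
import struct

OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def sector_size(header):
    """Read the sector size from an OLE header: 2**shift, where the
    16-bit shift is at offset 30 (512 for v3 files, 4096 for v4)."""
    if header[:8] != OLE_MAGIC:
        raise ValueError('not an OLE file')
    shift, = struct.unpack_from('<H', header, 30)
    return 2 ** shift
```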
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olemap.md remnux-oletools-0.51a/remnux-oletools/doc/olemap.md --- remnux-oletools-0.51a/remnux-oletools/doc/olemap.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olemap.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,54 @@ +olemap +====== + +olemap is a script to display a map of all the sectors in an OLE file. + +It can be used either as a command-line tool, or as a python module from your own applications. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Usage + +```text +Usage: olemap.py +``` + +### Examples + +Scan a single file: + +```text +olemap.py file.doc +``` + +![](olemap1.png) + +![](olemap2.png) + +-------------------------------------------------------------------------- + +## How to use olemap in Python applications + +TODO + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/doc/olemeta1.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/doc/olemeta1.png differ diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olemeta.html remnux-oletools-0.51a/remnux-oletools/doc/olemeta.html --- remnux-oletools-0.51a/remnux-oletools/doc/olemeta.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olemeta.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,45 @@ + + + + + + + + + + +

olemeta

+

olemeta is a script to parse OLE files such as MS Office documents (e.g. Word, Excel), to extract all standard properties present in the OLE file.

+

It is part of the python-oletools package.

+

Usage

+
olemeta.py <file>
+

Example

+
+ +
+

How to use olemeta in Python applications

+

TODO

+
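In the meantime, one relevant detail: the standard properties olemeta extracts live in the SummaryInformation stream, where each property is identified by a numeric ID defined in [MS-OLEPS]. The mapping below lists common IDs for illustration; it is not olemeta's actual table:

```python
# Common property IDs of the SummaryInformation stream ([MS-OLEPS]):
SUMMARY_PIDS = {
    2: 'title', 3: 'subject', 4: 'author', 5: 'keywords',
    6: 'comments', 7: 'template', 8: 'last_saved_by',
    9: 'revision_number', 18: 'creating_application',
}

def property_name(pid):
    """Return a display name for a SummaryInformation property ID."""
    return SUMMARY_PIDS.get(pid, 'unknown (%d)' % pid)
```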
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olemeta.md remnux-oletools-0.51a/remnux-oletools/doc/olemeta.md --- remnux-oletools-0.51a/remnux-oletools/doc/olemeta.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olemeta.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,43 @@ +olemeta +======= + +olemeta is a script to parse OLE files such as MS Office documents (e.g. Word, +Excel), to extract all standard properties present in the OLE file. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Usage + +```text +olemeta.py +``` + +### Example + +![](olemeta1.png) + +## How to use olemeta in Python applications + +TODO + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oleobj.html remnux-oletools-0.51a/remnux-oletools/doc/oleobj.html --- remnux-oletools-0.51a/remnux-oletools/doc/oleobj.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oleobj.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,44 @@ + + + + + + + + + + +

oleobj

+

oleobj is a script to extract embedded objects from OLE files.

+

It can be used either as a command-line tool, or as a python module from your own applications.

+

It is part of the python-oletools package.

+

Usage

+
TODO
+
+

How to use oleobj in Python applications

+

See rtfobj.py source code.

+

TODO

+
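Pending proper documentation, a common first step when carving embedded objects is simply locating OLE2 signatures inside a byte buffer. A minimal sketch (an illustration, not oleobj's actual algorithm):

```python
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def find_ole_offsets(data):
    """Return every offset where an OLE2 signature occurs in the buffer:
    candidate locations of embedded objects to carve out."""
    offsets, pos = [], data.find(OLE_MAGIC)
    while pos != -1:
        offsets.append(pos)
        pos = data.find(OLE_MAGIC, pos + 1)
    return offsets
```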
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oleobj.md remnux-oletools-0.51a/remnux-oletools/doc/oleobj.md --- remnux-oletools-0.51a/remnux-oletools/doc/oleobj.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oleobj.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,44 @@ +oleobj +====== + +oleobj is a script to extract embedded objects from OLE files. + +It can be used either as a command-line tool, or as a python module from your own applications. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Usage + +```text +TODO +``` + +-------------------------------------------------------------------------- + +## How to use oleobj in Python applications + +See rtfobj.py source code. + +TODO + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oletimes.html remnux-oletools-0.51a/remnux-oletools/doc/oletimes.html --- remnux-oletools-0.51a/remnux-oletools/doc/oletimes.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oletimes.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,68 @@ + + + + + + + + + + +

oletimes

+

oletimes is a script to parse OLE files such as MS Office documents (e.g. Word, Excel), to extract creation and modification times of all streams and storages in the OLE file.

+

It is part of the python-oletools package.

+

Usage

+
oletimes.py <file>
+

Example

+

Checking the malware sample DIAN_caso-5415.doc:

+
>oletimes.py DIAN_caso-5415.doc
+
++----------------------------+---------------------+---------------------+
+| Stream/Storage name        | Modification Time   | Creation Time       |
++----------------------------+---------------------+---------------------+
+| Root                       | 2014-05-14 12:45:24 | None                |
+| '\x01CompObj'              | None                | None                |
+| '\x05DocumentSummaryInform | None                | None                |
+| ation'                     |                     |                     |
+| '\x05SummaryInformation'   | None                | None                |
+| '1Table'                   | None                | None                |
+| 'Data'                     | None                | None                |
+| 'Macros'                   | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 |
+| 'Macros/PROJECT'           | None                | None                |
+| 'Macros/PROJECTwm'         | None                | None                |
+| 'Macros/VBA'               | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 |
+| 'Macros/VBA/ThisDocument'  | None                | None                |
+| 'Macros/VBA/_VBA_PROJECT'  | None                | None                |
+| 'Macros/VBA/__SRP_0'       | None                | None                |
+| 'Macros/VBA/__SRP_1'       | None                | None                |
+| 'Macros/VBA/__SRP_2'       | None                | None                |
+| 'Macros/VBA/__SRP_3'       | None                | None                |
+| 'Macros/VBA/dir'           | None                | None                |
+| 'WordDocument'             | None                | None                |
++----------------------------+---------------------+---------------------+
+

How to use oletimes in Python applications

+

TODO

+
+
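
Upstream still marks this section as TODO. As background for the table above (and its many "None" entries), OLE directory entries store timestamps as 64-bit FILETIME values; a minimal stdlib sketch of the conversion (illustrative only, not part of the oletimes API):

```python
# Background sketch (not part of the oletimes API): OLE directory entries
# store creation/modification times as 64-bit FILETIME values, i.e. counts
# of 100-nanosecond intervals since 1601-01-01 UTC. A zero value means the
# timestamp was never set, which oletimes displays as "None".
from datetime import datetime, timedelta

def filetime_to_datetime(ft):
    if ft == 0:
        return None
    return datetime(1601, 1, 1) + timedelta(microseconds=ft // 10)

# 10,000,000 ticks of 100 ns = one second past the FILETIME epoch
print(filetime_to_datetime(10000000))  # 1601-01-01 00:00:01
```

This also explains why most streams show "None": many writers only set timestamps on storages such as Root and Macros.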

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/oletimes.md remnux-oletools-0.51a/remnux-oletools/doc/oletimes.md --- remnux-oletools-0.51a/remnux-oletools/doc/oletimes.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/oletimes.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,72 @@ +oletimes +======== + +oletimes is a script to parse OLE files such as MS Office documents (e.g. Word, +Excel), to extract creation and modification times of all streams and storages +in the OLE file. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Usage + +```text +oletimes.py +``` + +### Example + +Checking the malware sample [DIAN_caso-5415.doc](https://malwr.com/analysis/M2I4YWRhM2IwY2QwNDljN2E3ZWFjYTg3ODk4NmZhYmE/): + +```text +>oletimes.py DIAN_caso-5415.doc + ++----------------------------+---------------------+---------------------+ +| Stream/Storage name | Modification Time | Creation Time | ++----------------------------+---------------------+---------------------+ +| Root | 2014-05-14 12:45:24 | None | +| '\x01CompObj' | None | None | +| '\x05DocumentSummaryInform | None | None | +| ation' | | | +| '\x05SummaryInformation' | None | None | +| '1Table' | None | None | +| 'Data' | None | None | +| 'Macros' | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 | +| 'Macros/PROJECT' | None | None | +| 'Macros/PROJECTwm' | None | None | +| 'Macros/VBA' | 2014-05-14 12:45:24 | 2014-05-14 12:45:24 | +| 'Macros/VBA/ThisDocument' | None | None | +| 'Macros/VBA/_VBA_PROJECT' | None | None | +| 'Macros/VBA/__SRP_0' | None | None | +| 'Macros/VBA/__SRP_1' | None | None | +| 'Macros/VBA/__SRP_2' | None | None | +| 'Macros/VBA/__SRP_3' | None | None | +| 'Macros/VBA/dir' | None | None | +| 'WordDocument' | None | None | ++----------------------------+---------------------+---------------------+ +``` + +## How to use oletimes in Python applications + +TODO + 
+-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olevba.html remnux-oletools-0.51a/remnux-oletools/doc/olevba.html --- remnux-oletools-0.51a/remnux-oletools/doc/olevba.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olevba.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,371 @@ + + + + + + + + + + + +

olevba

+

olevba is a script to parse OLE and OpenXML files such as MS Office documents (e.g. Word, Excel), to detect VBA Macros, extract their source code in clear text, and detect security-related patterns such as auto-executable macros, suspicious VBA keywords used by malware, anti-sandboxing and anti-virtualization techniques, and potential IOCs (IP addresses, URLs, executable filenames, etc.). It also detects and decodes several common obfuscation methods, including Hex encoding, StrReverse, Base64, Dridex and VBA expressions, and extracts IOCs from decoded strings.

+

It can be used either as a command-line tool or as a Python module from your own applications.

+

It is part of the python-oletools package.

+

olevba is based on source code from officeparser by John William Davison, with significant modifications.

+

Supported formats

+
  • Word 97-2003 (.doc, .dot)
  • Word 2007+ (.docm, .dotm)
  • Word 2003 XML (.xml)
  • Word/Excel MHTML, aka Single File Web Page (.mht)
  • Excel 97-2003 (.xls)
  • Excel 2007+ (.xlsm, .xlsb)
  • PowerPoint 2007+ (.pptm, .ppsm)
  • Text file containing VBA or VBScript source code
  • Password-protected Zip archive containing any of the above
+

Main Features

+
  • Detect VBA macros in MS Office 97-2003 and 2007+ files, XML, MHT
  • Extract VBA macro source code
  • Detect auto-executable macros
  • Detect suspicious VBA keywords often used by malware
  • Detect anti-sandboxing and anti-virtualization techniques
  • Detect and decode strings obfuscated with Hex/Base64/StrReverse/Dridex
  • Deobfuscate VBA expressions with any combination of Chr, Asc, Val, StrReverse, Environ, +, &, using a VBA parser built with pyparsing, including custom Hex and Base64 encodings
  • Extract IOCs/patterns of interest such as IP addresses, URLs, e-mail addresses and executable file names
  • Scan multiple files and sample collections (wildcards, recursive)
  • Triage mode for a summary view of multiple files
  • Scan malware samples in password-protected Zip archives
  • Python API to use olevba from your applications
+
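
The Hex/Base64/StrReverse decoding listed above can be illustrated with a small stdlib sketch; this shows the schemes themselves, not olevba's actual implementation:

```python
# Illustrative decoders for three obfuscation schemes olevba recognizes;
# a simplified sketch, not olevba's actual implementation.
import base64
import binascii

def decode_hex(s):
    # e.g. "68747470..." as produced by hex-encoding an ASCII string
    return binascii.unhexlify(s).decode('ascii')

def decode_base64(s):
    return base64.b64decode(s).decode('ascii')

def decode_strreverse(s):
    # VBA's StrReverse simply reverses the string
    return s[::-1]

print(decode_hex('687474703a2f2f'))    # http://
print(decode_base64('aHR0cDovLw=='))   # http://
print(decode_strreverse('//:ptth'))    # http://
```

Dridex-encoded strings use a custom scheme on top of these, which is why olevba ships its own decoder for them.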

MS Office files encrypted with a password are also supported, because VBA macro code is never encrypted, only the content of the document.

+

About VBA Macros

+

See this article for more information and technical details about VBA Macros and how they are stored in MS Office documents.

+

How it works

+
  1. olevba checks the file type: if it is an OLE file (i.e. MS Office 97-2003), it is parsed right away.
  2. If it is a zip file (i.e. MS Office 2007+), XML or MHTML, olevba looks for all OLE files stored in it (e.g. vbaProject.bin, editdata.mso), and opens them.
  3. olevba identifies all the VBA projects stored in the OLE structure.
  4. Each VBA project is parsed to find the corresponding OLE streams containing macro code.
  5. In each of these OLE streams, the VBA macro source code is extracted and decompressed (RLE compression).
  6. olevba looks for specific strings obfuscated with various algorithms (Hex, Base64, StrReverse, Dridex, VBA expressions).
  7. olevba scans the macro source code and the deobfuscated strings to find suspicious keywords, auto-executable macros and potential IOCs (URLs, IP addresses, e-mail addresses, executable filenames, etc.).
+
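
The container check in step 1 can be sketched with file magic bytes; this is a simplification (olevba's real file-type detection is more thorough):

```python
# Simplified sketch of the step-1 container check via magic bytes;
# olevba's actual file-type detection is more thorough.
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'  # OLE2/CFB: MS Office 97-2003
ZIP_MAGIC = b'PK\x03\x04'                        # zip container: MS Office 2007+

def guess_container(header_bytes):
    if header_bytes.startswith(OLE_MAGIC):
        return 'OLE'
    if header_bytes.startswith(ZIP_MAGIC):
        return 'Zip/OpenXML'
    return 'other (XML, MHTML or unsupported)'

print(guess_container(b'PK\x03\x04' + b'\x00' * 26))  # Zip/OpenXML
```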

Usage

+
Usage: olevba.py [options] <filename> [filename2 ...]
+    
+Options:
+  -h, --help            show this help message and exit
+  -r                    find files recursively in subdirectories.
+  -z ZIP_PASSWORD, --zip=ZIP_PASSWORD
+                        if the file is a zip archive, open all files from it,
+                        using the provided password (requires Python 2.6+)
+  -f ZIP_FNAME, --zipfname=ZIP_FNAME
+                        if the file is a zip archive, file(s) to be opened
+                        within the zip. Wildcards * and ? are supported.
+                        (default:*)
+  -t, --triage          triage mode, display results as a summary table
+                        (default for multiple files)
+  -d, --detailed        detailed mode, display full results (default for
+                        single file)
+  -a, --analysis        display only analysis results, not the macro source
+                        code
+  -c, --code            display only VBA source code, do not analyze it
+  -i INPUT, --input=INPUT
+                        input file containing VBA source code to be analyzed
+                        (no parsing)
+  --decode              display all the obfuscated strings with their decoded
+                        content (Hex, Base64, StrReverse, Dridex, VBA).
+  --attr                display the attribute lines at the beginning of VBA
+                        source code
+  --reveal              display the macro source code after replacing all the
+                        obfuscated strings by their decoded content.
+

Examples

+

Scan a single file:

+
olevba.py file.doc
+

Scan a single file, stored in a Zip archive with password "infected":

+
olevba.py malicious_file.xls.zip -z infected
+

Scan a single file, showing all obfuscated strings decoded:

+
olevba.py file.doc --decode
+

Scan a single file, showing the macro source code with VBA strings deobfuscated:

+
olevba.py file.doc --reveal
+

Scan VBA source code extracted into a text file:

+
olevba.py source_code.vba
+

Scan a collection of files stored in a folder:

+
olevba.py "MalwareZoo/VBA/*"
+

NOTE: On Linux, Mac OS X and other Unix variants, wildcards must be enclosed in double quotes; otherwise they are expanded by the shell before olevba sees them.

+

Scan all .doc and .xls files, recursively in all subfolders:

+
olevba.py "MalwareZoo/VBA/*.doc" "MalwareZoo/VBA/*.xls" -r
+

Scan all .doc files within all .zip files with password, recursively:

+
olevba.py "MalwareZoo/VBA/*.zip" -r -z infected -f "*.doc"
+

Detailed analysis mode (default for single file)

+

When a single file is scanned, or when using the option -d, all details of the analysis are displayed.

+

For example, checking the malware sample DIAN_caso-5415.doc:

+
>olevba.py c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip -z infected
+===============================================================================
+FILE: DIAN_caso-5415.doc.malware in c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip
+Type: OLE
+-------------------------------------------------------------------------------
+VBA MACRO ThisDocument.cls
+in file: DIAN_caso-5415.doc.malware - OLE stream: Macros/VBA/ThisDocument
+- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Option Explicit
+Private Declare Function URLDownloadToFileA Lib "urlmon" (ByVal FVQGKS As Long,_
+ByVal WSGSGY As String, ByVal IFRRFV As String, ByVal NCVOLV As Long, _
+ByVal HQTLDG As Long) As Long
+Sub AutoOpen()
+    Auto_Open
+End Sub
+Sub Auto_Open()
+SNVJYQ
+End Sub
+Public Sub SNVJYQ()
+    [Malicious Code...]
+End Sub
+Function OGEXYR(XSTAHU As String, PHHWIV As String) As Boolean
+    [Malicious Code...]
+    Application.DisplayAlerts = False
+    Application.Quit
+End Function
+Sub Workbook_Open()
+    Auto_Open
+End Sub
+
+- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ANALYSIS:
++------------+----------------------+-----------------------------------------+
+| Type       | Keyword              | Description                             |
++------------+----------------------+-----------------------------------------+
+| AutoExec   | AutoOpen             | Runs when the Word document is opened   |
+| AutoExec   | Auto_Open            | Runs when the Excel Workbook is opened  |
+| AutoExec   | Workbook_Open        | Runs when the Excel Workbook is opened  |
+| Suspicious | Lib                  | May run code from a DLL                 |
+| Suspicious | Shell                | May run an executable file or a system  |
+|            |                      | command                                 |
+| Suspicious | Environ              | May read system environment variables   |
+| Suspicious | URLDownloadToFileA   | May download files from the Internet    |
+| IOC        | http://germanya.com. | URL                                     |
+|            | ec/logs/test.exe"    |                                         |
+| IOC        | http://germanya.com. | URL                                     |
+|            | ec/logs/counter.php" |                                         |
+| IOC        | germanya.com         | Executable file name                    |
+| IOC        | test.exe             | Executable file name                    |
+| IOC        | sfjozjero.exe        | Executable file name                    |
++------------+----------------------+-----------------------------------------+
+

Triage mode (default for multiple files)

+

When several files are scanned, or when using the option -t, a summary of the analysis for each file is displayed. This is more convenient for quick triage of a collection of suspicious files.

+

The following flags show the results of the analysis:

+
  • OLE: the file type is OLE, for example MS Office 97-2003
  • OpX: the file type is OpenXML, for example MS Office 2007+
  • XML: the file type is Word 2003 XML
  • MHT: the file type is Word MHTML, aka Single File Web Page (.mht)
  • ?: the file type is not supported
  • M: contains VBA Macros
  • A: auto-executable macros
  • S: suspicious VBA keywords
  • I: potential IOCs
  • H: hex-encoded strings (potential obfuscation)
  • B: Base64-encoded strings (potential obfuscation)
  • D: Dridex-encoded strings (potential obfuscation)
  • V: VBA string expressions (potential obfuscation)
+

Here is an example:

+
c:\>olevba.py \MalwareZoo\VBA\samples\*
+Flags       Filename
+----------- -----------------------------------------------------------------
+OLE:MASI--- \MalwareZoo\VBA\samples\DIAN_caso-5415.doc.malware
+OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_1.doc.malware
+OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_2.doc.malware
+OLE:MASI--- \MalwareZoo\VBA\samples\DRIDEX_3.doc.malware
+OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_4.doc.malware
+OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_5.doc.malware
+OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_6.doc.malware
+OLE:MAS---- \MalwareZoo\VBA\samples\DRIDEX_7.doc.malware
+OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_8.doc.malware
+OLE:MASIHBD \MalwareZoo\VBA\samples\DRIDEX_9.xls.malware
+OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_A.doc.malware
+OLE:------- \MalwareZoo\VBA\samples\Normal_Document.doc
+OLE:M------ \MalwareZoo\VBA\samples\Normal_Document_Macro.doc
+OpX:MASI--- \MalwareZoo\VBA\samples\RottenKitten.xlsb.malware
+OLE:MASI-B- \MalwareZoo\VBA\samples\ROVNIX.doc.malware
+OLE:MA----- \MalwareZoo\VBA\samples\Word within Word macro auto.doc
+
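
When post-processing triage output in a script, the flags field can be split back into its parts; a hypothetical helper (parse_triage_flags and FLAG_NAMES are not part of olevba, the letter meanings come from the legend above):

```python
# Hypothetical post-processing helper (not an olevba API): split a triage
# flags field such as "OLE:MASIH--" into the file type and the flags set.
# Letter meanings are taken from the triage-mode legend.
FLAG_NAMES = {
    'M': 'VBA macros',
    'A': 'auto-executable macros',
    'S': 'suspicious keywords',
    'I': 'potential IOCs',
    'H': 'hex-encoded strings',
    'B': 'Base64-encoded strings',
    'D': 'Dridex-encoded strings',
    'V': 'VBA string expressions',
}

def parse_triage_flags(flags_field):
    filetype, _, letters = flags_field.partition(':')
    found = [FLAG_NAMES[c] for c in letters if c != '-']
    return filetype, found

print(parse_triage_flags('OLE:MASIH--'))
```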

Python 3 support - olevba3

+

As of v0.50, olevba has been ported to Python 3 thanks to @sebdraven. However, the differences between Python 2 and 3 are significant, so for now there is a separate version of olevba, named olevba3, to be used with Python 3.

+
+

How to use olevba in Python applications

+

olevba may be used from your own Python applications to open an MS Office file, detect whether it contains VBA macros, and extract and analyze the VBA source code.

+

IMPORTANT: olevba is currently under active development, therefore this API is likely to change.

+

Import olevba

+

First, import the oletools.olevba package, using at least the VBA_Parser and VBA_Scanner classes:

+
from oletools.olevba import VBA_Parser, TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML, TYPE_MHTML
+

Parse a MS Office file - VBA_Parser

+

To parse a file on disk, create an instance of the VBA_Parser class, providing the name of the file to open as parameter. For example:

+
vbaparser = VBA_Parser('my_file_with_macros.doc')
+

The file may also be provided as a bytes string containing its data. In that case, the actual filename must still be provided for reference, and the file content is passed with the data parameter. For example:

+
myfile = 'my_file_with_macros.doc'
+filedata = open(myfile, 'rb').read()
+vbaparser = VBA_Parser(myfile, data=filedata)
+

VBA_Parser will raise an exception if the file is not in one of the supported formats: OLE (MS Office 97-2003), OpenXML (MS Office 2007+), MHTML or Word 2003 XML.

+

After parsing the file, the attribute VBA_Parser.type is a string indicating the file type. It can be either TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML or TYPE_MHTML (constants defined in the olevba module).

+

Detect VBA macros

+

The method detect_vba_macros of a VBA_Parser object returns True if VBA macros have been found in the file, False otherwise.

+
if vbaparser.detect_vba_macros():
+    print 'VBA Macros found'
+else:
+    print 'No VBA Macros found'
+

Note: The detection algorithm looks for streams and storages with specific names in the OLE structure, which works fine for all the supported formats listed above. However, for some formats such as PowerPoint 97-2003, this method always returns False, because VBA Macros are stored in a different way which is not yet supported by olevba.

+

Moreover, if the file contains an embedded document (e.g. an Excel workbook inserted into a Word document), this method may return True if the embedded document contains VBA Macros, even if the main document does not.

+

Extract VBA Macro Source Code

+

The method extract_macros extracts and decompresses source code for each VBA macro found in the file (possibly including embedded files). It is a generator yielding a tuple (filename, stream_path, vba_filename, vba_code) for each VBA macro found.

+
  • filename: If the file is OLE (MS Office 97-2003), filename is the path of the file. If the file is OpenXML (MS Office 2007+), filename is the path of the OLE subfile containing VBA macros within the zip archive, e.g. word/vbaProject.bin.
  • stream_path: path of the OLE stream containing the VBA macro source code
  • vba_filename: corresponding VBA filename
  • vba_code: string containing the VBA source code in clear text
+

Example:

+
for (filename, stream_path, vba_filename, vba_code) in vbaparser.extract_macros():
+    print '-'*79
+    print 'Filename    :', filename
+    print 'OLE stream  :', stream_path
+    print 'VBA filename:', vba_filename
+    print '- '*39
+    print vba_code
+

Alternatively, the VBA_Parser method extract_all_macros returns the same results as a list of tuples.

+

Analyze VBA Source Code

+

Since version 0.40, the VBA_Parser class provides simpler methods than VBA_Scanner to analyze all macros contained in a file:

+

The method analyze_macros from the class VBA_Parser can be used to scan the source code of all VBA modules to find obfuscated strings, suspicious keywords, IOCs, auto-executable macros, etc.

+

analyze_macros() takes an optional argument show_decoded_strings: if set to True, the results will contain all the encoded strings found in the code (Hex, Base64, Dridex) with their decoded value. By default, it will only include the strings which contain printable characters.

+

VBA_Parser.analyze_macros() returns a list of tuples (type, keyword, description), one for each item in the results.

+
  • type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String', 'Dridex String' or 'VBA obfuscated Strings'.
  • keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is the decoded value of the string.
  • description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string.
+

Example:

+
results = vbaparser.analyze_macros()
+for kw_type, keyword, description in results:
+    print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description)
+

After calling analyze_macros, the following VBA_Parser attributes also provide the number of items found for each category:

+
print 'AutoExec keywords: %d' % vbaparser.nb_autoexec
+print 'Suspicious keywords: %d' % vbaparser.nb_suspicious
+print 'IOCs: %d' % vbaparser.nb_iocs
+print 'Hex obfuscated strings: %d' % vbaparser.nb_hexstrings
+print 'Base64 obfuscated strings: %d' % vbaparser.nb_base64strings
+print 'Dridex obfuscated strings: %d' % vbaparser.nb_dridexstrings
+print 'VBA obfuscated strings: %d' % vbaparser.nb_vbastrings
+

Deobfuscate VBA Macro Source Code

+

The method reveal attempts to deobfuscate the macro source code by replacing all the obfuscated strings with their decoded content. It returns a single string.

+

Example:

+
print vbaparser.reveal()
+

Close the VBA_Parser

+

After use, it is best to call the close method of the VBA_Parser object to make sure the file is closed, especially if your application parses many files.

+
vbaparser.close()
+
+

Deprecated API

+

The following methods and functions are still functional, but their usage is not recommended since they have been replaced by better solutions.

+

VBA_Scanner (deprecated)

+

The class VBA_Scanner can be used to scan the source code of a VBA module to find obfuscated strings, suspicious keywords, IOCs, auto-executable macros, etc.

+

First, create a VBA_Scanner object with a string containing the VBA source code (for example returned by the extract_macros method). Then call the methods scan or scan_summary to get the results of the analysis.

+

scan() takes an optional argument include_decoded_strings: if set to True, the results will contain all the encoded strings found in the code (Hex, Base64, Dridex) with their decoded value.

+

scan returns a list of tuples (type, keyword, description), one for each item in the results.

+
  • type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String'.
  • keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is the decoded value of the string.
  • description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string.
+

Example:

+
vba_scanner = VBA_Scanner(vba_code)
+results = vba_scanner.scan(include_decoded_strings=True)
+for kw_type, keyword, description in results:
+    print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description)
+

The function scan_vba is a shortcut for VBA_Scanner(vba_code).scan():

+
results = scan_vba(vba_code, include_decoded_strings=True)
+for kw_type, keyword, description in results:
+    print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description)
+

scan_summary returns a tuple with the number of items found for each category: (autoexec, suspicious, IOCs, hex, base64, dridex).

+

Detect auto-executable macros (deprecated)

+

Deprecated: It is preferable to use either scan_vba or VBA_Scanner to get all results at once.

+

The function detect_autoexec checks if VBA macro code contains specific macro names that will be triggered when the document/workbook is opened, closed, changed, etc.

+

It returns a list of tuples, each containing two strings: the detected keyword and the description of the trigger. (See the malware example above.)

+

Sample usage:

+
from oletools.olevba import detect_autoexec
+autoexec_keywords = detect_autoexec(vba_code)
+if autoexec_keywords:
+    print 'Auto-executable macro keywords found:'
+    for keyword, description in autoexec_keywords:
+        print '%s: %s' % (keyword, description)
+else:
+    print 'Auto-executable macro keywords: None found'
+

Detect suspicious VBA keywords (deprecated)

+

Deprecated: It is preferable to use either scan_vba or VBA_Scanner to get all results at once.

+

The function detect_suspicious checks if VBA macro code contains specific keywords often used by malware to act on the system (create files, run commands or applications, write to the registry, etc).

+

It returns a list of tuples, each containing two strings: the detected keyword and the description of the corresponding malicious behaviour. (See the malware example above.)

+

Sample usage:

+
from oletools.olevba import detect_suspicious
+suspicious_keywords = detect_suspicious(vba_code)
+if suspicious_keywords:
+    print 'Suspicious VBA keywords found:'
+    for keyword, description in suspicious_keywords:
+        print '%s: %s' % (keyword, description)
+else:
+    print 'Suspicious VBA keywords: None found'
+

Extract potential IOCs (deprecated)

+

Deprecated: It is preferable to use either scan_vba or VBA_Scanner to get all results at once.

+

The function detect_patterns checks if VBA macro code contains specific patterns of interest that may be useful for malware analysis and detection (potential Indicators of Compromise): IP addresses, e-mail addresses, URLs, executable file names.

+

It returns a list of tuples, each containing two strings: the pattern type and the extracted value. (See the malware example above.)

+

Sample usage:

+
from oletools.olevba import detect_patterns
+patterns = detect_patterns(vba_code)
+if patterns:
+    print 'Patterns found:'
+    for pattern_type, value in patterns:
+        print '%s: %s' % (pattern_type, value)
+else:
+    print 'Patterns: None found'
+
+

python-oletools documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/olevba.md remnux-oletools-0.51a/remnux-oletools/doc/olevba.md --- remnux-oletools-0.51a/remnux-oletools/doc/olevba.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/olevba.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,544 @@ +olevba +====== + +olevba is a script to parse OLE and OpenXML files such as MS Office documents +(e.g. Word, Excel), to **detect VBA Macros**, extract their **source code** in clear text, +and detect security-related patterns such as **auto-executable macros**, **suspicious +VBA keywords** used by malware, anti-sandboxing and anti-virtualization techniques, +and potential **IOCs** (IP addresses, URLs, executable filenames, etc). +It also detects and decodes several common **obfuscation methods including Hex encoding, +StrReverse, Base64, Dridex, VBA expressions**, and extracts IOCs from decoded strings. + +It can be used either as a command-line tool, or as a python module from your own applications. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +olevba is based on source code from [officeparser](https://github.com/unixfreak0037/officeparser) +by John William Davison, with significant modifications. 
+ +## Supported formats + +- Word 97-2003 (.doc, .dot) +- Word 2007+ (.docm, .dotm) +- Word 2003 XML (.xml) +- Word/Excel MHTML, aka Single File Web Page (.mht) +- Excel 97-2003 (.xls) +- Excel 2007+ (.xlsm, .xlsb) +- PowerPoint 2007+ (.pptm, .ppsm) +- Text file containing VBA or VBScript source code +- Password-protected Zip archive containing any of the above + +## Main Features + +- Detect VBA macros in MS Office 97-2003 and 2007+ files, XML, MHT +- Extract VBA macro source code +- Detect auto-executable macros +- Detect suspicious VBA keywords often used by malware +- Detect anti-sandboxing and anti-virtualization techniques +- Detect and decodes strings obfuscated with Hex/Base64/StrReverse/Dridex +- Deobfuscates VBA expressions with any combination of Chr, Asc, Val, StrReverse, Environ, +, &, using a VBA parser built with +[pyparsing](http://pyparsing.wikispaces.com), including custom Hex and Base64 encodings +- Extract IOCs/patterns of interest such as IP addresses, URLs, e-mail addresses and executable file names +- Scan multiple files and sample collections (wildcards, recursive) +- Triage mode for a summary view of multiple files +- Scan malware samples in password-protected Zip archives +- Python API to use olevba from your applications + +MS Office files encrypted with a password are also supported, because VBA macro code is never +encrypted, only the content of the document. + +## About VBA Macros + +See [this article](http://www.decalage.info/en/vba_tools) for more information and technical details about VBA Macros +and how they are stored in MS Office documents. + +## How it works + +1. olevba checks the file type: If it is an OLE file (i.e MS Office 97-2003), it is parsed right away. +1. If it is a zip file (i.e. MS Office 2007+), XML or MHTML, olevba looks for all OLE files stored in it (e.g. vbaProject.bin, editdata.mso), and opens them. +1. olevba identifies all the VBA projects stored in the OLE structure. +1. 
Each VBA project is parsed to find the corresponding OLE streams containing macro code. +1. In each of these OLE streams, the VBA macro source code is extracted and decompressed (RLE compression). +1. olevba looks for specific strings obfuscated with various algorithms (Hex, Base64, StrReverse, Dridex, VBA expressions). +1. olevba scans the macro source code and the deobfuscated strings to find suspicious keywords, auto-executable macros +and potential IOCs (URLs, IP addresses, e-mail addresses, executable filenames, etc). + + +## Usage + +```text +Usage: olevba.py [options] [filename2 ...] + +Options: + -h, --help show this help message and exit + -r find files recursively in subdirectories. + -z ZIP_PASSWORD, --zip=ZIP_PASSWORD + if the file is a zip archive, open all files from it, + using the provided password (requires Python 2.6+) + -f ZIP_FNAME, --zipfname=ZIP_FNAME + if the file is a zip archive, file(s) to be opened + within the zip. Wildcards * and ? are supported. + (default:*) + -t, --triage triage mode, display results as a summary table + (default for multiple files) + -d, --detailed detailed mode, display full results (default for + single file) + -a, --analysis display only analysis results, not the macro source + code + -c, --code display only VBA source code, do not analyze it + -i INPUT, --input=INPUT + input file containing VBA source code to be analyzed + (no parsing) + --decode display all the obfuscated strings with their decoded + content (Hex, Base64, StrReverse, Dridex, VBA). + --attr display the attribute lines at the beginning of VBA + source code + --reveal display the macro source code after replacing all the + obfuscated strings by their decoded content. 
+``` + +### Examples + +Scan a single file: + +```text +olevba.py file.doc +``` + +Scan a single file, stored in a Zip archive with password "infected": + +```text +olevba.py malicious_file.xls.zip -z infected +``` + +Scan a single file, showing all obfuscated strings decoded: + +```text +olevba.py file.doc --decode +``` + +Scan a single file, showing the macro source code with VBA strings deobfuscated: + +```text +olevba.py file.doc --reveal +``` + +Scan VBA source code extracted into a text file: + +```text +olevba.py source_code.vba +``` + +Scan a collection of files stored in a folder: + +```text +olevba.py "MalwareZoo/VBA/*" +``` +NOTE: On Linux, MacOSX and other Unix variants, it is required to add double quotes around wildcards. Otherwise, they will be expanded by the shell instead of olevba. + +Scan all .doc and .xls files, recursively in all subfolders: + +```text +olevba.py "MalwareZoo/VBA/*.doc" "MalwareZoo/VBA/*.xls" -r +``` + +Scan all .doc files within all .zip files with password, recursively: + +```text +olevba.py "MalwareZoo/VBA/*.zip" -r -z infected -f "*.doc" +``` + + +### Detailed analysis mode (default for single file) + +When a single file is scanned, or when using the option -d, all details of the analysis are displayed. 
+ +For example, checking the malware sample [DIAN_caso-5415.doc](https://malwr.com/analysis/M2I4YWRhM2IwY2QwNDljN2E3ZWFjYTg3ODk4NmZhYmE/): + +```text +>olevba.py c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip -z infected +=============================================================================== +FILE: DIAN_caso-5415.doc.malware in c:\MalwareZoo\VBA\DIAN_caso-5415.doc.zip +Type: OLE +------------------------------------------------------------------------------- +VBA MACRO ThisDocument.cls +in file: DIAN_caso-5415.doc.malware - OLE stream: Macros/VBA/ThisDocument +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Option Explicit +Private Declare Function URLDownloadToFileA Lib "urlmon" (ByVal FVQGKS As Long,_ +ByVal WSGSGY As String, ByVal IFRRFV As String, ByVal NCVOLV As Long, _ +ByVal HQTLDG As Long) As Long +Sub AutoOpen() + Auto_Open +End Sub +Sub Auto_Open() +SNVJYQ +End Sub +Public Sub SNVJYQ() + [Malicious Code...] +End Sub +Function OGEXYR(XSTAHU As String, PHHWIV As String) As Boolean + [Malicious Code...] + Application.DisplayAlerts = False + Application.Quit +End Function +Sub Workbook_Open() + Auto_Open +End Sub + +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +ANALYSIS: ++------------+----------------------+-----------------------------------------+ +| Type | Keyword | Description | ++------------+----------------------+-----------------------------------------+ +| AutoExec | AutoOpen | Runs when the Word document is opened | +| AutoExec | Auto_Open | Runs when the Excel Workbook is opened | +| AutoExec | Workbook_Open | Runs when the Excel Workbook is opened | +| Suspicious | Lib | May run code from a DLL | +| Suspicious | Shell | May run an executable file or a system | +| | | command | +| Suspicious | Environ | May read system environment variables | +| Suspicious | URLDownloadToFileA | May download files from the Internet | +| IOC | http://germanya.com. 
| URL | +| | ec/logs/test.exe" | | +| IOC | http://germanya.com. | URL | +| | ec/logs/counter.php" | | +| IOC | germanya.com | Executable file name | +| IOC | test.exe | Executable file name | +| IOC | sfjozjero.exe | Executable file name | ++------------+----------------------+-----------------------------------------+ +``` + +### Triage mode (default for multiple files) + +When several files are scanned, or when using the option -t, a summary of the analysis for each file is displayed. +This is more convenient for quick triage of a collection of suspicious files. + +The following flags show the results of the analysis: + +- **OLE**: the file type is OLE, for example MS Office 97-2003 +- **OpX**: the file type is OpenXML, for example MS Office 2007+ +- **XML**: the file type is Word 2003 XML +- **MHT**: the file type is Word MHTML, aka Single File Web Page (.mht) +- **?**: the file type is not supported +- **M**: contains VBA Macros +- **A**: auto-executable macros +- **S**: suspicious VBA keywords +- **I**: potential IOCs +- **H**: hex-encoded strings (potential obfuscation) +- **B**: Base64-encoded strings (potential obfuscation) +- **D**: Dridex-encoded strings (potential obfuscation) +- **V**: VBA string expressions (potential obfuscation) + +Here is an example: + +```text +c:\>olevba.py \MalwareZoo\VBA\samples\* +Flags Filename +----------- ----------------------------------------------------------------- +OLE:MASI--- \MalwareZoo\VBA\samples\DIAN_caso-5415.doc.malware +OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_1.doc.malware +OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_2.doc.malware +OLE:MASI--- \MalwareZoo\VBA\samples\DRIDEX_3.doc.malware +OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_4.doc.malware +OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_5.doc.malware +OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_6.doc.malware +OLE:MAS---- \MalwareZoo\VBA\samples\DRIDEX_7.doc.malware +OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_8.doc.malware +OLE:MASIHBD 
\MalwareZoo\VBA\samples\DRIDEX_9.xls.malware +OLE:MASIH-- \MalwareZoo\VBA\samples\DRIDEX_A.doc.malware +OLE:------- \MalwareZoo\VBA\samples\Normal_Document.doc +OLE:M------ \MalwareZoo\VBA\samples\Normal_Document_Macro.doc +OpX:MASI--- \MalwareZoo\VBA\samples\RottenKitten.xlsb.malware +OLE:MASI-B- \MalwareZoo\VBA\samples\ROVNIX.doc.malware +OLE:MA----- \MalwareZoo\VBA\samples\Word within Word macro auto.doc +``` + +## Python 3 support - olevba3 + +As of v0.50, olevba has been ported to Python 3 thanks to @sebdraven. +However, the differences between Python 2 and 3 are significant and for now +there is a separate version of olevba named olevba3 to be used with +Python 3. + +-------------------------------------------------------------------------- + +## How to use olevba in Python applications + +olevba may be used from your own Python applications to open an MS Office file, detect whether it contains VBA macros, +and extract and analyze the VBA source code. + +IMPORTANT: olevba is currently under active development, therefore this API is likely to change. + +### Import olevba + +First, import the **oletools.olevba** package, using at least the VBA_Parser class and the file type constants: + +```python +from oletools.olevba import VBA_Parser, TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML, TYPE_MHTML +``` + +### Parse an MS Office file - VBA_Parser + +To parse a file on disk, create an instance of the **VBA_Parser** class, providing the name of the file to open as a parameter. +For example: + +```python +vbaparser = VBA_Parser('my_file_with_macros.doc') +``` + +The file may also be provided as a bytes string containing its data. In that case, the actual +filename must be provided for reference, and the file content passed via the data parameter. 
For example: + +```python +myfile = 'my_file_with_macros.doc' +filedata = open(myfile, 'rb').read() +vbaparser = VBA_Parser(myfile, data=filedata) +``` +VBA_Parser will raise an exception if the file is not in one of the supported formats: OLE (MS Office 97-2003), OpenXML +(MS Office 2007+), Word 2003 XML or MHTML. + +After parsing the file, the attribute **VBA_Parser.type** is a string indicating the file type. +It can be either TYPE_OLE, TYPE_OpenXML, TYPE_Word2003_XML or TYPE_MHTML (constants defined in the olevba module). + +### Detect VBA macros + +The method **detect_vba_macros** of a VBA_Parser object returns True if VBA macros have been found in the file, +False otherwise. + +```python +if vbaparser.detect_vba_macros(): + print 'VBA Macros found' +else: + print 'No VBA Macros found' +``` +Note: The detection algorithm looks for streams and storages with specific names in the OLE structure, which works fine +for all the supported formats listed above. However, for some formats such as PowerPoint 97-2003, this method will +always return False, because VBA macros are stored in a different way which is not yet supported by olevba. + +Moreover, if the file contains an embedded document (e.g. an Excel workbook inserted into a Word document), this method +may return True if the embedded document contains VBA macros, even if the main document does not. + +### Extract VBA Macro Source Code + +The method **extract_macros** extracts and decompresses the source code of each VBA macro found in the file (possibly +including embedded files). It is a generator yielding a tuple (filename, stream_path, vba_filename, vba_code) +for each VBA macro found. + +- filename: If the file is OLE (MS Office 97-2003), filename is the path of the file. + If the file is OpenXML (MS Office 2007+), filename is the path of the OLE subfile containing VBA macros within the zip archive, + e.g. word/vbaProject.bin. 
+- stream_path: path of the OLE stream containing the VBA macro source code +- vba_filename: corresponding VBA filename +- vba_code: string containing the VBA source code in clear text + +Example: + +```python +for (filename, stream_path, vba_filename, vba_code) in vbaparser.extract_macros(): + print '-'*79 + print 'Filename :', filename + print 'OLE stream :', stream_path + print 'VBA filename:', vba_filename + print '- '*39 + print vba_code +``` +Alternatively, the VBA_Parser method **extract_all_macros** returns the same results as a list of tuples. + +### Analyze VBA Source Code + +Since version 0.40, the VBA_Parser class provides simpler methods than VBA_Scanner to analyze all macros contained +in a file: + +The method **analyze_macros** from the class **VBA_Parser** can be used to scan the source code of all +VBA modules to find obfuscated strings, suspicious keywords, IOCs, auto-executable macros, etc. + +analyze_macros() takes an optional argument show_decoded_strings: if set to True, the results will contain all the encoded +strings found in the code (Hex, Base64, Dridex) with their decoded value. +By default, it will only include the strings which contain printable characters. + +**VBA_Parser.analyze_macros()** returns a list of tuples (type, keyword, description), one for each item in the results. + +- type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String', 'Dridex String' or + 'VBA obfuscated Strings'. +- keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is + the decoded value of the string. +- description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string. 
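Each result is a plain (type, keyword, description) tuple, so the returned list can be post-processed with ordinary Python. A minimal sketch, using hypothetical sample values rather than a real analyze_macros() call:

```python
# Hypothetical results, shaped like the VBA_Parser.analyze_macros() output
# described above: a list of (type, keyword, description) tuples.
results = [
    ('AutoExec', 'AutoOpen', 'Runs when the Word document is opened'),
    ('Suspicious', 'Shell', 'May run an executable file or a system command'),
    ('IOC', 'http://example.com/test.exe', 'URL'),
]

# Keep only the potential IOCs, as classified by olevba:
iocs = [keyword for kw_type, keyword, description in results if kw_type == 'IOC']
print(iocs)  # → ['http://example.com/test.exe']
```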
+ +Example: + +```python +results = vbaparser.analyze_macros() +for kw_type, keyword, description in results: + print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description) +``` +After calling analyze_macros, the following VBA_Parser attributes also provide the number +of items found for each category: + +```python +print 'AutoExec keywords: %d' % vbaparser.nb_autoexec +print 'Suspicious keywords: %d' % vbaparser.nb_suspicious +print 'IOCs: %d' % vbaparser.nb_iocs +print 'Hex obfuscated strings: %d' % vbaparser.nb_hexstrings +print 'Base64 obfuscated strings: %d' % vbaparser.nb_base64strings +print 'Dridex obfuscated strings: %d' % vbaparser.nb_dridexstrings +print 'VBA obfuscated strings: %d' % vbaparser.nb_vbastrings +``` + +### Deobfuscate VBA Macro Source Code + +The method **reveal** attempts to deobfuscate the macro source code by replacing all +the obfuscated strings by their decoded content. Returns a single string. + +Example: + +```python +print vbaparser.reveal() +``` + +### Close the VBA_Parser + +After usage, it is better to call the **close** method of the VBA_Parser object, to make sure the file is closed, +especially if your application is parsing many files. + +```python +vbaparser.close() +``` + +-------------------------------------------------------------------------- + +## Deprecated API + +The following methods and functions are still functional, but their usage is not recommended +since they have been replaced by better solutions. + +### VBA_Scanner (deprecated) + +The class **VBA_Scanner** can be used to scan the source code of a VBA module to find obfuscated strings, +suspicious keywords, IOCs, auto-executable macros, etc. + +First, create a VBA_Scanner object with a string containing the VBA source code (for example returned by the +extract_macros method). Then call the methods **scan** or **scan_summary** to get the results of the analysis. 
+ +scan() takes an optional argument include_decoded_strings: if set to True, the results will contain all the encoded +strings found in the code (Hex, Base64, Dridex) with their decoded value. + +**scan** returns a list of tuples (type, keyword, description), one for each item in the results. + +- type may be either 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String'. +- keyword is the string found for auto-executable macros, suspicious keywords or IOCs. For obfuscated strings, it is + the decoded value of the string. +- description provides a description of the keyword. For obfuscated strings, it is the encoded value of the string. + +Example: + +```python +vba_scanner = VBA_Scanner(vba_code) +results = vba_scanner.scan(include_decoded_strings=True) +for kw_type, keyword, description in results: + print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description) +``` +The function **scan_vba** is a shortcut for VBA_Scanner(vba_code).scan(): + +```python +results = scan_vba(vba_code, include_decoded_strings=True) +for kw_type, keyword, description in results: + print 'type=%s - keyword=%s - description=%s' % (kw_type, keyword, description) +``` +**scan_summary** returns a tuple with the number of items found for each category: +(autoexec, suspicious, IOCs, hex, base64, dridex). + + +### Detect auto-executable macros (deprecated) + +**Deprecated**: It is preferable to use either scan_vba or VBA_Scanner to get all results at once. + +The function **detect_autoexec** checks if VBA macro code contains specific macro names +that will be triggered when the document/workbook is opened, closed, changed, etc. + +It returns a list of tuples containing two strings, the detected keyword, and the +description of the trigger. 
(See the malware example above) + +Sample usage: + +```python +from oletools.olevba import detect_autoexec +autoexec_keywords = detect_autoexec(vba_code) +if autoexec_keywords: + print 'Auto-executable macro keywords found:' + for keyword, description in autoexec_keywords: + print '%s: %s' % (keyword, description) +else: + print 'Auto-executable macro keywords: None found' +``` + +### Detect suspicious VBA keywords (deprecated) + +**Deprecated**: It is preferable to use either scan_vba or VBA_Scanner to get all results at once. + +The function **detect_suspicious** checks if VBA macro code contains specific +keywords often used by malware to act on the system (create files, run +commands or applications, write to the registry, etc). + +It returns a list of tuples containing two strings, the detected keyword, and the +description of the corresponding malicious behaviour. (See the malware example above) + +Sample usage: + +```python +from oletools.olevba import detect_suspicious +suspicious_keywords = detect_suspicious(vba_code) +if suspicious_keywords: + print 'Suspicious VBA keywords found:' + for keyword, description in suspicious_keywords: + print '%s: %s' % (keyword, description) +else: + print 'Suspicious VBA keywords: None found' +``` + +### Extract potential IOCs (deprecated) + +**Deprecated**: It is preferable to use either scan_vba or VBA_Scanner to get all results at once. + +The function **detect_patterns** checks if VBA macro code contains specific +patterns of interest, that may be useful for malware analysis and detection +(potential Indicators of Compromise): IP addresses, e-mail addresses, +URLs, executable file names. + +It returns a list of tuples containing two strings, the pattern type, and the +extracted value. 
(See the malware example above) + +Sample usage: + +```python +from oletools.olevba import detect_patterns +patterns = detect_patterns(vba_code) +if patterns: + print 'Patterns found:' + for pattern_type, value in patterns: + print '%s: %s' % (pattern_type, value) +else: + print 'Patterns: None found' +``` + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.html remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.html --- remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,83 @@ + + + + + + + + + + +


+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.md remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.md --- remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/pyxswf.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,97 @@ +pyxswf +====== + +pyxswf is a script to detect, extract and analyze Flash objects (SWF files) that may +be embedded in files such as MS Office documents (e.g. Word, Excel), +which is especially useful for malware analysis. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +pyxswf is an extension to [xxxswf.py](http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html) published by Alexander Hanel. + +Compared to xxxswf, it can extract streams from MS Office documents by parsing +their OLE structure properly, which is necessary when streams are fragmented. +Stream fragmentation is a known obfuscation technique, as explained on +[http://www.breakingpointsystems.com/resources/blog/evasion-with-ole2-fragmentation/](http://web.archive.org/web/20121118021207/http://www.breakingpointsystems.com/resources/blog/evasion-with-ole2-fragmentation/) + +It can also extract Flash objects from RTF documents, by parsing embedded objects encoded in hexadecimal format (-f option). + +For this, simply add the -o option to work on OLE streams rather than raw files, or the -f option to work on RTF files. + +## Usage + +```text +Usage: pyxswf.py [options] + +Options: + -o, --ole Parse an OLE file (e.g. Word, Excel) to look for SWF + in each stream + -f, --rtf Parse an RTF file to look for SWF in each embedded + object + -x, --extract Extracts the embedded SWF(s), names it MD5HASH.swf & + saves it in the working dir. No addition args needed + -h, --help show this help message and exit + -y, --yara Scans the SWF(s) with yara. If the SWF(s) is + compressed it will be deflated. 
No addition args + needed + -s, --md5scan Scans the SWF(s) for MD5 signatures. Please see func + checkMD5 to define hashes. No addition args needed + -H, --header Displays the SWFs file header. No addition args needed + -d, --decompress Deflates compressed SWFS(s) + -r PATH, --recdir=PATH + Will recursively scan a directory for files that + contain SWFs. Must provide path in quotes + -c, --compress Compresses the SWF using Zlib +``` + +### Example 1 - detecting and extracting a SWF file from a Word document on Windows: + +```text +C:\oletools>pyxswf.py -o word_flash.doc +OLE stream: 'Contents' +[SUMMARY] 1 SWF(s) in MD5:993664cc86f60d52d671b6610813cfd1:Contents + [ADDR] SWF 1 at 0x8 - FWS Header + +C:\oletools>pyxswf.py -xo word_flash.doc +OLE stream: 'Contents' +[SUMMARY] 1 SWF(s) in MD5:993664cc86f60d52d671b6610813cfd1:Contents + [ADDR] SWF 1 at 0x8 - FWS Header + [FILE] Carved SWF MD5: 2498e9c0701dc0e461ab4358f9102bc5.swf +``` + +### Example 2 - detecting and extracting a SWF file from a RTF document on Windows: + +```text +C:\oletools>pyxswf.py -xf "rtf_flash.rtf" +RTF embedded object size 1498557 at index 000036DD +[SUMMARY] 1 SWF(s) in MD5:46a110548007e04f4043785ac4184558:RTF_embedded_object_0 +00036DD + [ADDR] SWF 1 at 0xc40 - FWS Header + [FILE] Carved SWF MD5: 2498e9c0701dc0e461ab4358f9102bc5.swf +``` + +## How to use pyxswf in Python applications + +TODO + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.html remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.html --- remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.html 1970-01-01 
00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,96 @@ + + + + + + + + + + + +


+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.md remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.md --- remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/doc/rtfobj.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,102 @@ +rtfobj +====== + +rtfobj is a Python module to detect and extract embedded objects stored +in RTF files, such as OLE objects. It can also detect OLE Package objects, +and extract the embedded files. + +Since v0.50, rtfobj contains a custom RTF parser that has been designed to +match MS Word's behaviour, in order to handle obfuscated RTF files. See my +article ["Anti-Analysis Tricks in Weaponized RTF"](http://decalage.info/rtf_tricks) +for some concrete examples. + +rtfobj can be used as a Python library or a command-line tool. + +It is part of the [python-oletools](http://www.decalage.info/python/oletools) package. + +## Usage + +```text +rtfobj [options] [filename2 ...] + +Options: + -h, --help show this help message and exit + -r find files recursively in subdirectories. + -z ZIP_PASSWORD, --zip=ZIP_PASSWORD + if the file is a zip archive, open first file from it, + using the provided password (requires Python 2.6+) + -f ZIP_FNAME, --zipfname=ZIP_FNAME + if the file is a zip archive, file(s) to be opened + within the zip. Wildcards * and ? are supported. + (default:*) + -l LOGLEVEL, --loglevel=LOGLEVEL + logging level debug/info/warning/error/critical + (default=warning) + -s SAVE_OBJECT, --save=SAVE_OBJECT + Save the object corresponding to the provided number + to a file, for example "-s 2". Use "-s all" to save + all objects at once. + -d OUTPUT_DIR use specified directory to save output files. +``` + +rtfobj displays a list of the OLE and Package objects that have been detected, +with their attributes such as class and filename. + +When an OLE Package object contains an executable file or script, it is +highlighted as such. 
For example: + +![](rtfobj1.png) + +To extract an object or file, use the option -s followed by the object number +as shown in the table. + +Example: + +```text +rtfobj -s 0 +``` + +It extracts and decodes the corresponding object, and saves it as a file +named "object_xxxx.bin", xxxx being the location of the object in the RTF file. + + +## How to use rtfobj in Python applications + +As of v0.50, the API has changed significantly and it is not final yet. +For now, see the class RtfObjectParser in the code. + +### Deprecated API (still functional): + +rtf_iter_objects(filename) is an iterator which yields a tuple +(index, orig_len, object) providing the index of each hexadecimal stream +in the RTF file, and the corresponding decoded object. + +Example: + +```python +from oletools import rtfobj +for index, orig_len, data in rtfobj.rtf_iter_objects("myfile.rtf"): + print('found object size %d at index %08X' % (len(data), index)) +``` + +-------------------------------------------------------------------------- + +python-oletools documentation +----------------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- Tools: + - [[olebrowse]] + - [[oleid]] + - [[olemeta]] + - [[oletimes]] + - [[oledir]] + - [[olemap]] + - [[olevba]] + - [[mraptor]] + - [[pyxswf]] + - [[oleobj]] + - [[rtfobj]] diff -Nru remnux-oletools-0.51a/remnux-oletools/ezhexviewer.py remnux-oletools-0.51a/remnux-oletools/ezhexviewer.py --- remnux-oletools-0.51a/remnux-oletools/ezhexviewer.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/ezhexviewer.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,157 @@ +#!/usr/bin/env python +""" +ezhexviewer.py + +A simple hexadecimal viewer based on easygui. It should work on any platform +with Python 2.x or 3.x. 
+ +Usage: ezhexviewer.py [file] + +Usage in a python application: + + import ezhexviewer + ezhexviewer.hexview_file(filename) + ezhexviewer.hexview_data(data) + + +ezhexviewer project website: http://www.decalage.info/python/ezhexviewer + +ezhexviewer is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2012-09-17 v0.01 PL: - first version +# 2012-10-04 v0.02 PL: - added license +# 2016-09-06 v0.50 PL: - added main function for entry points in setup.py +# 2016-10-26 PL: - fixed to run on Python 2+3 + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: +# + options to set title and msg + + +from thirdparty.easygui import easygui +import sys + +# === PYTHON 2+3 SUPPORT ====================================================== + +if sys.version_info[0] >= 3: + # Python 3 specific adaptations + # py3 range = py2 xrange + xrange = range + PYTHON3 = True +else: + PYTHON3 = False + +def xord(char): + ''' + workaround for ord() to work on characters from a bytes string with + Python 2 and 3. If s is a bytes string, s[i] is a bytes string of + length 1 on Python 2, but it is an integer on Python 3... + xord(c) returns ord(c) if c is a bytes string, or c if it is already + an integer. + :param char: int or bytes of length 1 + :return: ord(c) if bytes, c if int + ''' + if isinstance(char, int): + return char + else: + return ord(char) + +def bchr(x): + ''' + workaround for chr() to return a bytes string of length 1 with + Python 2 and 3. On Python 3, chr returns a unicode string, but + on Python 2 it is a bytes string. + bchr() always returns a bytes string on Python 2+3. + :param x: int + :return: chr(x) as a bytes string + ''' + if PYTHON3: + # According to the Python 3 documentation, bytes() can be + # initialized with an iterable: + return bytes([x]) + else: + return chr(x) + +#------------------------------------------------------------------------------ +# The following code (hexdump3 only) is a modified version of the hex dumper +# recipe published on ASPN by Sebastien Keim and Raymond Hattinger under the +# PSF license. I added the startindex parameter. 
+# see http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/142812 +# PSF license: http://docs.python.org/license.html +# Copyright (c) 2001-2012 Python Software Foundation; All Rights Reserved + +FILTER = b''.join([(len(repr(bchr(x)))<=4 and x != 0x0A) and bchr(x) or b'.' for x in range(256)]) + +def hexdump3(src, length=8, startindex=0): + """ + Returns a hexadecimal dump of a binary string. + length: number of bytes per row. + startindex: index of 1st byte. + """ + result=[] + for i in xrange(0, len(src), length): + s = src[i:i+length] + hexa = ' '.join(["%02X" % xord(x) for x in s]) + printable = s.translate(FILTER) + if PYTHON3: + # On Python 3, need to convert printable from bytes to str: + printable = printable.decode('latin1') + result.append("%08X %-*s %s\n" % (i+startindex, length*3, hexa, printable)) + return ''.join(result) + +# end of PSF-licensed code. +#------------------------------------------------------------------------------ + + +def hexview_data (data, msg='', title='ezhexviewer', length=16, startindex=0): + hex = hexdump3(data, length=length, startindex=startindex) + easygui.codebox(msg=msg, title=title, text=hex) + + +def hexview_file (filename, msg='', title='ezhexviewer', length=16, startindex=0): + data = open(filename, 'rb').read() + hexview_data(data, msg=msg, title=title, length=length, startindex=startindex) + + +# === MAIN =================================================================== + +def main(): + try: + filename = sys.argv[1] + except: + filename = easygui.fileopenbox() + if filename: + try: + hexview_file(filename, msg='File: %s' % filename) + except: + easygui.exceptionbox(msg='Error:', title='ezhexviewer') + + +if __name__ == '__main__': + main() \ No newline at end of file diff -Nru remnux-oletools-0.51a/remnux-oletools/LICENSE.txt remnux-oletools-0.51a/remnux-oletools/LICENSE.txt --- remnux-oletools-0.51a/remnux-oletools/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 +++ 
remnux-oletools-0.51a/remnux-oletools/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,54 @@ +LICENSE for the python-oletools package: + +This license applies to the python-oletools package, apart from the thirdparty +folder which contains third-party files published with their own license. + +The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec (http://www.decalage.info) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +---------- + +olevba contains modified source code from the officeparser project, published +under the following MIT License (MIT): + +officeparser is copyright (c) 2014 John William Davison + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff -Nru remnux-oletools-0.51a/remnux-oletools/mraptor3.py remnux-oletools-0.51a/remnux-oletools/mraptor3.py --- remnux-oletools-0.51a/remnux-oletools/mraptor3.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/mraptor3.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,335 @@ +#!/usr/bin/env python +""" +mraptor.py - MacroRaptor + +MacroRaptor is a script to parse OLE and OpenXML files such as MS Office +documents (e.g. Word, Excel), to detect malicious macros. 
+ +Supported formats: +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) +- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) +- Word 2003 XML (.xml) +- Word/Excel Single File Web Page / MHTML (.mht) + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +MacroRaptor is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +# === LICENSE ================================================================== + +# MacroRaptor is copyright (c) 2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
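[Editor's aside, not part of the patch: the detection heuristic implemented later in this file boils down to "suspicious = autoexec AND (write OR execute)", reported as A/W/X flags. The sketch below uses deliberately simplified stand-in regexes — the real `re_autoexec`/`re_write`/`re_execute` patterns below cover many more keywords.]

```python
import re

# Simplified stand-ins for the module's detection patterns
# (illustrative only; the real patterns are far more complete):
re_autoexec = re.compile(r'(?i)\b(?:AutoOpen|Document_Open|Workbook_Open)\b')
re_write    = re.compile(r'(?i)\b(?:FileCopy|CreateTextFile|SaveToFile)\b')
re_execute  = re.compile(r'(?i)\b(?:Shell|CreateObject|ShellExecute)\b')

def classify(vba_code):
    autoexec = re_autoexec.search(vba_code) is not None
    write = re_write.search(vba_code) is not None
    execute = re_execute.search(vba_code) is not None
    # A macro is flagged only when it both runs automatically AND
    # writes to the filesystem or executes something:
    suspicious = autoexec and (write or execute)
    flags = (('A' if autoexec else '-')
             + ('W' if write else '-')
             + ('X' if execute else '-'))
    return flags, suspicious

print(classify('Sub AutoOpen()\n  Shell "cmd.exe"\nEnd Sub'))  # ('A-X', True)
print(classify('Sub Helper()\n  x = 1\nEnd Sub'))              # ('---', False)
```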
+ +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2016-02-23 v0.01 PL: - first version +# 2016-02-29 v0.02 PL: - added Workbook_Activate, FileSaveAs +# 2016-03-04 v0.03 PL: - returns an exit code based on the overall result +# 2016-03-08 v0.04 PL: - collapse long lines before analysis +# 2016-07-19 v0.50 SL: - converted to Python 3 +# 2016-08-26 PL: - changed imports for Python 3 + +__version__ = '0.50py3' + +#------------------------------------------------------------------------------ +# TODO: + + +#--- IMPORTS ------------------------------------------------------------------ + +import sys, logging, optparse, re + +from thirdparty.xglob import xglob + +# import the python 3 version of tablestream: +from thirdparty.tablestream import tablestream + +# import the python 3 version of olevba +import olevba3 as olevba + +# === LOGGING ================================================================= + +# a global logger object used for debugging: +log = olevba.get_logger('mraptor') + + +#--- CONSTANTS ---------------------------------------------------------------- + +# URL and message to report issues: +# TODO: make it a common variable for all oletools +URL_ISSUES = 'https://github.com/decalage2/oletools/issues' +MSG_ISSUES = 'Please report this issue on %s' % URL_ISSUES + +# 'AutoExec', 'AutoOpen', 'Auto_Open', 'AutoClose', 'Auto_Close', 'AutoNew', 'AutoExit', +# 'Document_Open', 'DocumentOpen', +# 'Document_Close', 'DocumentBeforeClose', +# 'DocumentChange','Document_New', +# 'NewDocument' +# 'Workbook_Open', 'Workbook_Close', + +# TODO: check if line also contains Sub or Function +re_autoexec = re.compile(r'(?i)\b(?:Auto(?:Exec|_?Open|_?Close|Exit|New)' + + r'|Document(?:_?Open|_Close|BeforeClose|Change|_New)' + + r'|NewDocument|Workbook(?:_Open|_Activate|_Close))\b') + +# MS-VBAL 5.4.5.1 Open Statement: +RE_OPEN_WRITE = r'(?:\bOpen\b[^\n]+\b(?:Write|Append|Binary|Output|Random)\b)' + +re_write = 
re.compile(r'(?i)\b(?:FileCopy|CopyFile|Kill|CreateTextFile|' + + r'VirtualAlloc|RtlMoveMemory|URLDownloadToFileA?|AltStartupPath|' + + r'ADODB\.Stream|WriteText|SaveToFile|SaveAs|SaveAsRTF|FileSaveAs|MkDir|RmDir|SaveSetting|SetAttr)\b|' + RE_OPEN_WRITE) + +# MS-VBAL 5.2.3.5 External Procedure Declaration +RE_DECLARE_LIB = r'(?:\bDeclare\b[^\n]+\bLib\b)' + +re_execute = re.compile(r'(?i)\b(?:Shell|CreateObject|GetObject|SendKeys|' + + r'MacScript|FollowHyperlink|CreateThread|ShellExecute)\b|' + RE_DECLARE_LIB) + +# short tag to display file types in triage mode: +TYPE2TAG = { + olevba.TYPE_OLE: 'OLE', + olevba.TYPE_OpenXML: 'OpX', + olevba.TYPE_Word2003_XML: 'XML', + olevba.TYPE_MHTML: 'MHT', + olevba.TYPE_TEXT: 'TXT', +} + + +# === CLASSES ================================================================= + +class Result_NoMacro(object): + exit_code = 0 + color = 'green' + name = 'No Macro' + + +class Result_NotMSOffice(object): + exit_code = 1 + color = 'green' + name = 'Not MS Office' + + +class Result_MacroOK(object): + exit_code = 2 + color = 'cyan' + name = 'Macro OK' + + +class Result_Error(object): + exit_code = 10 + color = 'yellow' + name = 'ERROR' + + +class Result_Suspicious(object): + exit_code = 20 + color = 'red' + name = 'SUSPICIOUS' + + +class MacroRaptor(object): + """ + class to scan VBA macro code to detect if it is malicious + """ + def __init__(self, vba_code): + """ + MacroRaptor constructor + :param vba_code: string containing the VBA macro code + """ + # collapse long lines first + self.vba_code = olevba.vba_collapse_long_lines(vba_code) + self.autoexec = False + self.write = False + self.execute = False + self.flags = '' + self.suspicious = False + self.autoexec_match = None + self.write_match = None + self.execute_match = None + self.matches = [] + + def scan(self): + """ + Scan the VBA macro code to detect if it is malicious + :return: + """ + m = re_autoexec.search(self.vba_code) + if m is not None: + self.autoexec = True + 
self.autoexec_match = m.group() + self.matches.append(m.group()) + m = re_write.search(self.vba_code) + if m is not None: + self.write = True + self.write_match = m.group() + self.matches.append(m.group()) + m = re_execute.search(self.vba_code) + if m is not None: + self.execute = True + self.execute_match = m.group() + self.matches.append(m.group()) + if self.autoexec and (self.execute or self.write): + self.suspicious = True + + def get_flags(self): + flags = '' + flags += 'A' if self.autoexec else '-' + flags += 'W' if self.write else '-' + flags += 'X' if self.execute else '-' + return flags + + +# === MAIN ==================================================================== + +def main(): + """ + Main function, called when olevba is run from the command line + """ + global log + DEFAULT_LOG_LEVEL = "warning" # Default log level + LOG_LEVELS = { + 'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL + } + + usage = 'usage: %prog [options] [filename2 ...]' + parser = optparse.OptionParser(usage=usage) + parser.add_option("-r", action="store_true", dest="recursive", + help='find files recursively in subdirectories.') + parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, + help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') + parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', + help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. 
(default:*)') + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical (default=%default)") + parser.add_option("-m", '--matches', action="store_true", dest="show_matches", + help='Show matched strings.') + + # TODO: add logfile option + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + print(__doc__) + parser.print_help() + print('\nAn exit code is returned based on the analysis result:') + for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious): + print(' - %d: %s' % (result.exit_code, result.name)) + sys.exit() + + # print banner with version + print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) + print('This is work in progress, please report issues at %s' % URL_ISSUES) + + logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') + # enable logging in the modules: + log.setLevel(logging.NOTSET) + + t = tablestream.TableStream(style=tablestream.TableStyleSlim, + header_row=['Result', 'Flags', 'Type', 'File'], + column_width=[10, 5, 4, 56]) + + exitcode = -1 + global_result = None + # TODO: handle errors in xglob, to continue processing the next files + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, + zip_password=options.zip_password, zip_fname=options.zip_fname): + # ignore directory names stored in zip files: + if container and filename.endswith('/'): + continue + full_name = '%s in %s' % (filename, container) if container else filename + # try: + # # Open the file + # if data is None: + # data = open(filename, 'rb').read() + # except: + # log.exception('Error when opening file %r' % full_name) + # continue + if isinstance(data, Exception): + result = Result_Error + t.write_row([result.name, '', '', full_name], + colors=[result.color, None, None, None]) + 
t.write_row(['', '', '', str(data)], + colors=[None, None, None, result.color]) + else: + filetype = '???' + try: + vba_parser = olevba.VBA_Parser(filename=filename, data=data, container=container) + filetype = TYPE2TAG[vba_parser.type] + except Exception as e: + # log.error('Error when parsing VBA macros from file %r' % full_name) + # TODO: distinguish actual errors from non-MSOffice files + result = Result_Error + t.write_row([result.name, '', filetype, full_name], + colors=[result.color, None, None, None]) + t.write_row(['', '', '', str(e)], + colors=[None, None, None, result.color]) + continue + if vba_parser.detect_vba_macros(): + vba_code_all_modules = '' + try: + for (subfilename, stream_path, vba_filename, vba_code) in vba_parser.extract_all_macros(): + vba_code_all_modules += vba_code.decode('utf-8','replace') + '\n' + except Exception as e: + # log.error('Error when parsing VBA macros from file %r' % full_name) + result = Result_Error + t.write_row([result.name, '', TYPE2TAG[vba_parser.type], full_name], + colors=[result.color, None, None, None]) + t.write_row(['', '', '', str(e)], + colors=[None, None, None, result.color]) + continue + mraptor = MacroRaptor(vba_code_all_modules) + mraptor.scan() + if mraptor.suspicious: + result = Result_Suspicious + else: + result = Result_MacroOK + t.write_row([result.name, mraptor.get_flags(), filetype, full_name], + colors=[result.color, None, None, None]) + if mraptor.matches and options.show_matches: + t.write_row(['', '', '', 'Matches: %r' % mraptor.matches]) + else: + result = Result_NoMacro + t.write_row([result.name, '', filetype, full_name], + colors=[result.color, None, None, None]) + if result.exit_code > exitcode: + global_result = result + exitcode = result.exit_code + + print('') + print('Flags: A=AutoExec, W=Write, X=Execute') + print('Exit code: %d - %s' % (exitcode, global_result.name)) + sys.exit(exitcode) + +if __name__ == '__main__': + main() + +# Soundtrack: "Dark Child" by Marlon Williams diff 
-Nru remnux-oletools-0.51a/remnux-oletools/mraptor_milter.py remnux-oletools-0.51a/remnux-oletools/mraptor_milter.py --- remnux-oletools-0.51a/remnux-oletools/mraptor_milter.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/mraptor_milter.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,399 @@ +#!/usr/bin/env python +""" +mraptor_milter + +mraptor_milter is a milter script for the Sendmail and Postfix e-mail +servers. It parses MS Office documents (e.g. Word, Excel) to detect +malicious macros. Documents with malicious macros are removed and +replaced by harmless text files. + +Supported formats: +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) +- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) +- Word 2003 XML (.xml) +- Word/Excel Single File Web Page / MHTML (.mht) +- Publisher (.pub) + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +mraptor_milter is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +# === LICENSE ================================================================== + +# mraptor_milter is copyright (c) 2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# --- CHANGELOG -------------------------------------------------------------- +# 2016-08-08 v0.01 PL: - first version +# 2016-08-12 v0.02 PL: - added logging to file with time rotation +# - archive each e-mail to a file before filtering +# 2016-08-30 v0.03 PL: - added daemonize to run as a Unix daemon +# 2016-09-06 v0.50 PL: - fixed issue #20, is_zipfile on Python 2.6 + +__version__ = '0.50' + +# --- TODO ------------------------------------------------------------------- + +# TODO: option to run in the foreground for troubleshooting +# TODO: option to write logs to the console +# TODO: options to set listening port and interface +# TODO: config file for all parameters +# TODO: option to run as a non-privileged user +# TODO: handle files in archives + + +# --- IMPORTS ---------------------------------------------------------------- + +import Milter +import io +import time +import email +import sys +import os +import logging +import logging.handlers +import datetime +import StringIO + +from socket import AF_INET6 + +from oletools import olevba, mraptor + +from Milter.utils import parse_addr + +if sys.version_info[0] <= 2: + # Python 2.x + if sys.version_info[1] <= 6: + # Python 2.6 + # 
use is_zipfile backported from Python 2.7: + from oletools.thirdparty.zipfile27 import is_zipfile + else: + # Python 2.7 + from zipfile import is_zipfile +else: + # Python 3.x+ + from zipfile import is_zipfile + + + +# --- CONSTANTS -------------------------------------------------------------- + +# TODO: read parameters from a config file +# at postfix smtpd_milters = inet:127.0.0.1:25252 +SOCKET = "inet:25252@127.0.0.1" # bind to unix or tcp socket "inet:port@ip" or "///.sock" +TIMEOUT = 30 # Milter timeout in seconds +# CFG_DIR = "/etc/macromilter/" +# LOG_DIR = "/var/log/macromilter/" + +# TODO: different path on Windows: +LOGFILE_DIR = '/var/log/mraptor_milter' +# LOGFILE_DIR = '.' +LOGFILE_NAME = 'mraptor_milter.log' +LOGFILE_PATH = os.path.join(LOGFILE_DIR, LOGFILE_NAME) + +# Directory where to save a copy of each received e-mail: +ARCHIVE_DIR = '/var/log/mraptor_milter' +# ARCHIVE_DIR = '.' + +# file to store PID for daemonize +PIDFILE = "/tmp/mraptor_milter.pid" + + + +# === LOGGING ================================================================ + +# Set up a specific logger with our desired output level +log = logging.getLogger('MRMilter') + +# disable logging by default - enable it in main app: +log.setLevel(logging.CRITICAL+1) + +# NOTE: all logging config is done in the main app, not here. + +# === CLASSES ================================================================ + +# Inspired from https://github.com/jmehnle/pymilter/blob/master/milter-template.py + +class MacroRaptorMilter(Milter.Base): + ''' + ''' + def __init__(self): + # A new instance with each new connection. + # each connection runs in its own thread and has its own myMilter + # instance. Python code must be thread safe. This is trivial if only stuff + # in myMilter instances is referenced. + self.id = Milter.uniqueID() # Integer incremented with each call. 
+ self.message = None + self.IP = None + self.port = None + self.flow = None + self.scope = None + self.IPname = None # Name from a reverse IP lookup + + @Milter.noreply + def connect(self, IPname, family, hostaddr): + ''' + New connection (may contain several messages) + :param IPname: Name from a reverse IP lookup + :param family: IP version 4 (AF_INET) or 6 (AF_INET6) + :param hostaddr: tuple (IP, port [, flow, scope]) + :return: Milter.CONTINUE + ''' + # Examples: + # (self, 'ip068.subnet71.example.com', AF_INET, ('215.183.71.68', 4720) ) + # (self, 'ip6.mxout.example.com', AF_INET6, + # ('3ffe:80e8:d8::1', 4720, 1, 0) ) + self.IP = hostaddr[0] + self.port = hostaddr[1] + if family == AF_INET6: + self.flow = hostaddr[2] + self.scope = hostaddr[3] + else: + self.flow = None + self.scope = None + self.IPname = IPname # Name from a reverse IP lookup + self.message = None # content + log.info("[%d] connect from host %s at %s" % (self.id, IPname, hostaddr)) + return Milter.CONTINUE + + @Milter.noreply + def envfrom(self, mailfrom, *rest): + ''' + Mail From - Called at the beginning of each message within a connection + :param mailfrom: + :param str: + :return: Milter.CONTINUE + ''' + self.message = io.BytesIO() + # NOTE: self.message is only an *internal* copy of message data. You + # must use addheader, chgheader, replacebody to change the message + # on the MTA. 
+ self.canon_from = '@'.join(parse_addr(mailfrom)) + self.message.write('From %s %s\n' % (self.canon_from, time.ctime())) + log.debug('[%d] Mail From %s %s\n' % (self.id, self.canon_from, time.ctime())) + log.debug('[%d] mailfrom=%r, rest=%r' % (self.id, mailfrom, rest)) + return Milter.CONTINUE + + @Milter.noreply + def envrcpt(self, to, *rest): + ''' + RCPT TO + :param to: + :param str: + :return: Milter.CONTINUE + ''' + log.debug('[%d] RCPT TO %r, rest=%r\n' % (self.id, to, rest)) + return Milter.CONTINUE + + @Milter.noreply + def header(self, header_field, header_field_value): + ''' + Add header + :param header_field: + :param header_field_value: + :return: Milter.CONTINUE + ''' + self.message.write("%s: %s\n" % (header_field, header_field_value)) + return Milter.CONTINUE + + @Milter.noreply + def eoh(self): + ''' + End of headers + :return: Milter.CONTINUE + ''' + self.message.write("\n") + return Milter.CONTINUE + + @Milter.noreply + def body(self, chunk): + ''' + Message body (chunked) + :param chunk: + :return: Milter.CONTINUE + ''' + self.message.write(chunk) + return Milter.CONTINUE + + def close(self): + return Milter.CONTINUE + + def abort(self): + ''' + Clean up if the connection is closed by client + :return: Milter.CONTINUE + ''' + return Milter.CONTINUE + + def archive_message(self): + ''' + Save a copy of the current message in its original form to a file + :return: nothing + ''' + date_time = datetime.datetime.utcnow().isoformat('_') + # assumption: by combining datetime + milter id, the filename should be unique: + # (the only case for duplicates is when restarting the milter twice in less than a second) + fname = 'mail_%s_%d.eml' % (date_time, self.id) + fname = os.path.join(ARCHIVE_DIR, fname) + log.debug('Saving a copy of the original message to file %r' % fname) + open(fname, 'wb').write(self.message.getvalue()) + + def eom(self): + ''' + This method is called when the end of the email message has been reached. 
+ This event also triggers the milter specific actions + :return: Milter.ACCEPT or Milter.DISCARD if processing error + ''' + try: + # set data pointer back to 0 + self.message.seek(0) + self.archive_message() + result = self.check_mraptor() + if result is not None: + return result + else: + return Milter.ACCEPT + # if error make a fall-back to accept + except Exception: + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + log.exception("[%d] Unexpected error - fall back to ACCEPT: %s %s %s" + % (self.id, exc_type, fname, exc_tb.tb_lineno)) + return Milter.ACCEPT + + def check_mraptor(self): + ''' + Check the attachments of a message using mraptor. + If an attachment is identified as suspicious, it is replaced by a simple text file. + :return: Milter.ACCEPT or Milter.DISCARD if processing error + ''' + msg = email.message_from_string(self.message.getvalue()) + result = Milter.ACCEPT + try: + for part in msg.walk(): + # for name, value in part.items(): + # log.debug(' - %s: %r' % (name, value)) + content_type = part.get_content_type() + log.debug('[%d] Content-type: %r' % (self.id, content_type)) + # TODO: handle any content-type, but check the file magic? 
+ if not content_type.startswith('multipart'): + filename = part.get_filename(None) + log.debug('[%d] Analyzing attachment %r' % (self.id, filename)) + attachment = part.get_payload(decode=True) + attachment_lowercase = attachment.lower() + # check if this is a supported file type (if not, just skip it) + # TODO: this function should be provided by olevba + if attachment.startswith(olevba.olefile.MAGIC) \ + or is_zipfile(StringIO.StringIO(attachment)) \ + or 'http://schemas.microsoft.com/office/word/2003/wordml' in attachment \ + or ('mime' in attachment_lowercase and 'version' in attachment_lowercase + and 'multipart' in attachment_lowercase): + vba_parser = olevba.VBA_Parser(filename='message', data=attachment) + vba_code_all_modules = '' + for (subfilename, stream_path, vba_filename, vba_code) in vba_parser.extract_all_macros(): + vba_code_all_modules += vba_code + '\n' + m = mraptor.MacroRaptor(vba_code_all_modules) + m.scan() + if m.suspicious: + log.warning('[%d] The attachment %r contains a suspicious macro: replace it with a text file' + % (self.id, filename)) + part.set_payload('This attachment has been removed because it contains a suspicious macro.') + part.set_type('text/plain') + # TODO: handle case when CTE is absent + part.replace_header('Content-Transfer-Encoding', '7bit') + # for name, value in part.items(): + # log.debug(' - %s: %r' % (name, value)) + # TODO: archive filtered e-mail to a file + else: + log.debug('The attachment %r is clean.' 
+ % filename) + except Exception: + log.exception('[%d] Error while processing the message' % self.id) + # TODO: depending on error, decide to forward the e-mail as-is or not + result = Milter.DISCARD + # TODO: only do this if the body has actually changed + body = str(msg) + self.message = io.BytesIO(body) + self.replacebody(body) + log.info('[%d] Message relayed' % self.id) + return result + + +# === MAIN =================================================================== + +def main(): + # banner + print('mraptor_milter v%s - http://decalage.info/python/oletools' % __version__) + print('logging to file %s' % LOGFILE_PATH) + print('Press Ctrl+C to stop.') + + # make sure the log directory exists: + try: + os.makedirs(LOGFILE_DIR) + except: + pass + # Add the log message handler to the logger + # log to files rotating once a day: + handler = logging.handlers.TimedRotatingFileHandler(LOGFILE_PATH, when='D', encoding='utf8') + # create formatter and add it to the handlers + formatter = logging.Formatter('%(asctime)s - %(levelname)8s: %(message)s') + handler.setFormatter(formatter) + log.addHandler(handler) + # enable logging: + log.setLevel(logging.DEBUG) + + log.info('Starting mraptor_milter v%s - listening on %s' % (__version__, SOCKET)) + log.debug('Python version: %s' % sys.version) + + # Register to have the Milter factory create instances of the class: + Milter.factory = MacroRaptorMilter + flags = Milter.CHGBODY + Milter.CHGHDRS + Milter.ADDHDRS + flags += Milter.ADDRCPT + flags += Milter.DELRCPT + Milter.set_flags(flags) # tell Sendmail which features we use + # set the "last" fall back to ACCEPT if exception occur + Milter.set_exception_policy(Milter.ACCEPT) + # start the milter + Milter.runmilter("mraptor_milter", SOCKET, TIMEOUT) + log.info('Stopping mraptor_milter.') + + +if __name__ == "__main__": + + # Using daemonize: + # See http://daemonize.readthedocs.io/en/latest/ + from daemonize import Daemonize + daemon = Daemonize(app="mraptor_milter", 
pid=PIDFILE, action=main) + daemon.start() + + # Using python-daemon - Does not work as-is, need to create the PID file + # See https://pypi.python.org/pypi/python-daemon/ + # See PEP-3143: https://www.python.org/dev/peps/pep-3143/ + # import daemon + # import lockfile + # with daemon.DaemonContext(pidfile=lockfile.FileLock(PIDFILE)): + # main() diff -Nru remnux-oletools-0.51a/remnux-oletools/mraptor.py remnux-oletools-0.51a/remnux-oletools/mraptor.py --- remnux-oletools-0.51a/remnux-oletools/mraptor.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/mraptor.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,337 @@ +#!/usr/bin/env python +""" +mraptor.py - MacroRaptor + +MacroRaptor is a script to parse OLE and OpenXML files such as MS Office +documents (e.g. Word, Excel), to detect malicious macros. + +Supported formats: +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) +- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) +- Word 2003 XML (.xml) +- Word/Excel Single File Web Page / MHTML (.mht) +- Publisher (.pub) + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +MacroRaptor is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +# === LICENSE ================================================================== + +# MacroRaptor is copyright (c) 2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
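The detection rule that mraptor.py implements further down is simple to state: a macro is flagged as suspicious only when an auto-execution trigger (A) co-occurs with a filesystem-write (W) or code-execution (X) keyword. A minimal standalone sketch of that heuristic, with deliberately shortened keyword lists (the real regexes in mraptor.py cover many more triggers and APIs):

```python
# Minimal sketch of MacroRaptor's A/W/X heuristic.
# The keyword lists here are illustrative subsets, not the full rule set.
import re

RE_AUTOEXEC = re.compile(r'(?i)\b(?:AutoOpen|Auto_Open|Document_Open|Workbook_Open)\b')
RE_WRITE = re.compile(r'(?i)\b(?:FileCopy|CreateTextFile|ADODB\.Stream|SaveToFile)\b')
RE_EXECUTE = re.compile(r'(?i)\b(?:Shell|CreateObject|GetObject|SendKeys)\b')

def triage(vba_code):
    """Return (flags, suspicious) for a VBA source string."""
    autoexec = RE_AUTOEXEC.search(vba_code) is not None
    write = RE_WRITE.search(vba_code) is not None
    execute = RE_EXECUTE.search(vba_code) is not None
    flags = ('A' if autoexec else '-') + ('W' if write else '-') + ('X' if execute else '-')
    # a macro is suspicious only when it both runs automatically AND writes or executes
    return flags, autoexec and (write or execute)

print(triage('Sub AutoOpen()\n  CreateObject("WScript.Shell").Run cmd\nEnd Sub'))
# → ('A-X', True)
```

This is why a macro that merely runs on open (A--) or merely calls Shell from a manually-invoked Sub (--X) is reported as "Macro OK": neither condition alone satisfies the rule.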
+ +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2016-02-23 v0.01 PL: - first version +# 2016-02-29 v0.02 PL: - added Workbook_Activate, FileSaveAs +# 2016-03-04 v0.03 PL: - returns an exit code based on the overall result +# 2016-03-08 v0.04 PL: - collapse long lines before analysis +# 2016-08-31 v0.50 PL: - added macro trigger InkPicture_Painted +# 2016-09-05 PL: - added Document_BeforeClose keyword for MS Publisher (.pub) +# 2016-10-25 PL: - fixed print for Python 3 + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: + + +#--- IMPORTS ------------------------------------------------------------------ + +import sys, logging, optparse, re + +from thirdparty.xglob import xglob +from thirdparty.tablestream import tablestream + +import olevba + +# === LOGGING ================================================================= + +# a global logger object used for debugging: +log = olevba.get_logger('mraptor') + + +#--- CONSTANTS ---------------------------------------------------------------- + +# URL and message to report issues: +# TODO: make it a common variable for all oletools +URL_ISSUES = 'https://github.com/decalage2/oletools/issues' +MSG_ISSUES = 'Please report this issue on %s' % URL_ISSUES + +# 'AutoExec', 'AutoOpen', 'Auto_Open', 'AutoClose', 'Auto_Close', 'AutoNew', 'AutoExit', +# 'Document_Open', 'DocumentOpen', +# 'Document_Close', 'DocumentBeforeClose', 'Document_BeforeClose', +# 'DocumentChange','Document_New', +# 'NewDocument' +# 'Workbook_Open', 'Workbook_Close', +# *_Painted such as InkPicture1_Painted +# *_GotFocus|LostFocus|MouseHover for other ActiveX objects + +# TODO: check if line also contains Sub or Function +re_autoexec = re.compile(r'(?i)\b(?:Auto(?:Exec|_?Open|_?Close|Exit|New)' + + r'|Document(?:_?Open|_Close|_?BeforeClose|Change|_New)' + + r'|NewDocument|Workbook(?:_Open|_Activate|_Close)' + + 
r'|\w+_(?:Painted|GotFocus|LostFocus|MouseHover))\b') + +# MS-VBAL 5.4.5.1 Open Statement: +RE_OPEN_WRITE = r'(?:\bOpen\b[^\n]+\b(?:Write|Append|Binary|Output|Random)\b)' + +re_write = re.compile(r'(?i)\b(?:FileCopy|CopyFile|Kill|CreateTextFile|' + + r'VirtualAlloc|RtlMoveMemory|URLDownloadToFileA?|AltStartupPath|' + + r'ADODB\.Stream|WriteText|SaveToFile|SaveAs|SaveAsRTF|FileSaveAs|MkDir|RmDir|SaveSetting|SetAttr)\b|' + RE_OPEN_WRITE) + +# MS-VBAL 5.2.3.5 External Procedure Declaration +RE_DECLARE_LIB = r'(?:\bDeclare\b[^\n]+\bLib\b)' + +re_execute = re.compile(r'(?i)\b(?:Shell|CreateObject|GetObject|SendKeys|' + + r'MacScript|FollowHyperlink|CreateThread|ShellExecute)\b|' + RE_DECLARE_LIB) + +# short tag to display file types in triage mode: +TYPE2TAG = { + olevba.TYPE_OLE: 'OLE', + olevba.TYPE_OpenXML: 'OpX', + olevba.TYPE_Word2003_XML: 'XML', + olevba.TYPE_MHTML: 'MHT', + olevba.TYPE_TEXT: 'TXT', +} + + +# === CLASSES ================================================================= + +class Result_NoMacro(object): + exit_code = 0 + color = 'green' + name = 'No Macro' + + +class Result_NotMSOffice(object): + exit_code = 1 + color = 'green' + name = 'Not MS Office' + + +class Result_MacroOK(object): + exit_code = 2 + color = 'cyan' + name = 'Macro OK' + + +class Result_Error(object): + exit_code = 10 + color = 'yellow' + name = 'ERROR' + + +class Result_Suspicious(object): + exit_code = 20 + color = 'red' + name = 'SUSPICIOUS' + + +class MacroRaptor(object): + """ + class to scan VBA macro code to detect if it is malicious + """ + def __init__(self, vba_code): + """ + MacroRaptor constructor + :param vba_code: string containing the VBA macro code + """ + # collapse long lines first + self.vba_code = olevba.vba_collapse_long_lines(vba_code) + self.autoexec = False + self.write = False + self.execute = False + self.flags = '' + self.suspicious = False + self.autoexec_match = None + self.write_match = None + self.execute_match = None + self.matches = [] + + def 
scan(self): + """ + Scan the VBA macro code to detect if it is malicious + :return: + """ + m = re_autoexec.search(self.vba_code) + if m is not None: + self.autoexec = True + self.autoexec_match = m.group() + self.matches.append(m.group()) + m = re_write.search(self.vba_code) + if m is not None: + self.write = True + self.write_match = m.group() + self.matches.append(m.group()) + m = re_execute.search(self.vba_code) + if m is not None: + self.execute = True + self.execute_match = m.group() + self.matches.append(m.group()) + if self.autoexec and (self.execute or self.write): + self.suspicious = True + + def get_flags(self): + flags = '' + flags += 'A' if self.autoexec else '-' + flags += 'W' if self.write else '-' + flags += 'X' if self.execute else '-' + return flags + + +# === MAIN ==================================================================== + +def main(): + """ + Main function, called when mraptor is run from the command line + """ + global log + DEFAULT_LOG_LEVEL = "warning" # Default log level + LOG_LEVELS = { + 'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL + } + + usage = 'usage: %prog [options] <filename> [filename2 ...]' + parser = optparse.OptionParser(usage=usage) + parser.add_option("-r", action="store_true", dest="recursive", + help='find files recursively in subdirectories.') + parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, + help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') + parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', + help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. 
(default:*)') + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical (default=%default)") + parser.add_option("-m", '--matches', action="store_true", dest="show_matches", + help='Show matched strings.') + + # TODO: add logfile option + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + print(__doc__) + parser.print_help() + print('\nAn exit code is returned based on the analysis result:') + for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious): + print(' - %d: %s' % (result.exit_code, result.name)) + sys.exit() + + # print banner with version + print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) + print('This is work in progress, please report issues at %s' % URL_ISSUES) + + logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') + # enable logging in the modules: + log.setLevel(logging.NOTSET) + + t = tablestream.TableStream(style=tablestream.TableStyleSlim, + header_row=['Result', 'Flags', 'Type', 'File'], + column_width=[10, 5, 4, 56]) + + exitcode = -1 + global_result = None + # TODO: handle errors in xglob, to continue processing the next files + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, + zip_password=options.zip_password, zip_fname=options.zip_fname): + # ignore directory names stored in zip files: + if container and filename.endswith('/'): + continue + full_name = '%s in %s' % (filename, container) if container else filename + # try: + # # Open the file + # if data is None: + # data = open(filename, 'rb').read() + # except: + # log.exception('Error when opening file %r' % full_name) + # continue + if isinstance(data, Exception): + result = Result_Error + t.write_row([result.name, '', '', full_name], + colors=[result.color, None, None, None]) + 
t.write_row(['', '', '', str(data)], + colors=[None, None, None, result.color]) + else: + filetype = '???' + try: + vba_parser = olevba.VBA_Parser(filename=filename, data=data, container=container) + filetype = TYPE2TAG[vba_parser.type] + except Exception as e: + # log.error('Error when parsing VBA macros from file %r' % full_name) + # TODO: distinguish actual errors from non-MSOffice files + result = Result_Error + t.write_row([result.name, '', filetype, full_name], + colors=[result.color, None, None, None]) + t.write_row(['', '', '', str(e)], + colors=[None, None, None, result.color]) + continue + if vba_parser.detect_vba_macros(): + vba_code_all_modules = '' + try: + for (subfilename, stream_path, vba_filename, vba_code) in vba_parser.extract_all_macros(): + vba_code_all_modules += vba_code + '\n' + except Exception as e: + # log.error('Error when parsing VBA macros from file %r' % full_name) + result = Result_Error + t.write_row([result.name, '', TYPE2TAG[vba_parser.type], full_name], + colors=[result.color, None, None, None]) + t.write_row(['', '', '', str(e)], + colors=[None, None, None, result.color]) + continue + mraptor = MacroRaptor(vba_code_all_modules) + mraptor.scan() + if mraptor.suspicious: + result = Result_Suspicious + else: + result = Result_MacroOK + t.write_row([result.name, mraptor.get_flags(), filetype, full_name], + colors=[result.color, None, None, None]) + if mraptor.matches and options.show_matches: + t.write_row(['', '', '', 'Matches: %r' % mraptor.matches]) + else: + result = Result_NoMacro + t.write_row([result.name, '', filetype, full_name], + colors=[result.color, None, None, None]) + if result.exit_code > exitcode: + global_result = result + exitcode = result.exit_code + + print('') + print('Flags: A=AutoExec, W=Write, X=Execute') + print('Exit code: %d - %s' % (exitcode, global_result.name)) + sys.exit(exitcode) + +if __name__ == '__main__': + main() + +# Soundtrack: "Dark Child" by Marlon Williams diff -Nru 
remnux-oletools-0.51a/remnux-oletools/olebrowse.py remnux-oletools-0.51a/remnux-oletools/olebrowse.py --- remnux-oletools-0.51a/remnux-oletools/olebrowse.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/olebrowse.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,142 @@ +#!/usr/bin/env python +""" +olebrowse.py + +A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to +view and extract individual data streams. + +Usage: olebrowse.py [file] + +olebrowse project website: http://www.decalage.info/python/olebrowse + +olebrowse is part of the python-oletools package: +http://www.decalage.info/python/oletools + +olebrowse is copyright (c) 2012-2015, Philippe Lagadec (http://www.decalage.info) +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +__version__ = '0.02' + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2012-09-17 v0.01 PL: - first version +# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL + +#------------------------------------------------------------------------------ +# TODO: +# - menu option to open another file +# - menu option to display properties +# - menu option to run other oletools, external tools such as OfficeCat? +# - for a stream, display info: size, path, etc +# - stream info: magic, entropy, ... ? 
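olebrowse delegates its "Hex view" action to ezhexviewer; the underlying idea is an ordinary offset / hex / ASCII dump of the selected stream. A stdlib-only sketch of such a dump (illustrative only, not the actual ezhexviewer code):

```python
# Minimal offset/hex/ASCII dump, similar in spirit to olebrowse's "Hex view".
def hexdump(data, width=16):
    """Return an offset / hex / ASCII dump of a bytes string."""
    lines = []
    for offset in range(0, len(data), width):
        chunk = data[offset:offset + width]
        hexpart = ' '.join('%02X' % b for b in chunk)
        asciipart = ''.join(chr(b) if 32 <= b < 127 else '.' for b in chunk)
        lines.append('%08X  %-*s  %s' % (offset, width * 3 - 1, hexpart, asciipart))
    return '\n'.join(lines)

# the first 8 bytes are the OLE2 magic found at the start of every OLE file
print(hexdump(b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1' + b'olebrowse'))
```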
+ +import optparse, sys, os +from thirdparty.easygui import easygui +import thirdparty.olefile as olefile +import ezhexviewer + +ABOUT = '~ About olebrowse' +QUIT = '~ Quit' + + +def about (): + """ + Display information about this tool + """ + easygui.textbox(title='About olebrowse', text=__doc__) + + +def browse_stream (ole, stream): + """ + Browse a stream (hex view or save to file) + """ + #print 'stream:', stream + while True: + msg ='Select an action for the stream "%s", or press Esc to exit' % repr(stream) + actions = [ + 'Hex view', +## 'Text view', +## 'Repr view', + 'Save stream to file', + '~ Back to main menu', + ] + action = easygui.choicebox(msg, title='olebrowse', choices=actions) + if action is None or 'Back' in action: + break + elif action.startswith('Hex'): + data = ole.openstream(stream).getvalue() + ezhexviewer.hexview_data(data, msg='Stream: %s' % stream, title='olebrowse') +## elif action.startswith('Text'): +## data = ole.openstream(stream).getvalue() +## easygui.codebox(title='Text view - %s' % stream, text=data) +## elif action.startswith('Repr'): +## data = ole.openstream(stream).getvalue() +## easygui.codebox(title='Repr view - %s' % stream, text=repr(data)) + elif action.startswith('Save'): + data = ole.openstream(stream).getvalue() + fname = easygui.filesavebox(default='stream.bin') + if fname is not None: + f = open(fname, 'wb') + f.write(data) + f.close() + easygui.msgbox('stream saved to file %s' % fname) + + + +def main(): + """ + Main function + """ + try: + filename = sys.argv[1] + except: + filename = easygui.fileopenbox() + try: + ole = olefile.OleFileIO(filename) + listdir = ole.listdir() + streams = [] + for direntry in listdir: + #print direntry + streams.append('/'.join(direntry)) + streams.append(ABOUT) + streams.append(QUIT) + stream = True + while stream is not None: + msg ="Select a stream, or press Esc to exit" + title = "olebrowse" + stream = easygui.choicebox(msg, title, streams) + if stream is None or stream == 
QUIT: + break + if stream == ABOUT: + about() + else: + browse_stream(ole, stream) + except: + easygui.exceptionbox() + + + + +if __name__ == '__main__': + main() diff -Nru remnux-oletools-0.51a/remnux-oletools/oledir.py remnux-oletools-0.51a/remnux-oletools/oledir.py --- remnux-oletools-0.51a/remnux-oletools/oledir.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/oledir.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,187 @@ +#!/usr/bin/env python +""" +oledir.py + +oledir parses OLE files to display technical information about its directory +entries, including deleted/orphan streams/storages and unused entries. + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +oledir is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +#=== LICENSE ================================================================== + +# oledir is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2015-04-17 v0.01 PL: - first version +# 2015-04-21 v0.02 PL: - improved display with prettytable +# 2016-01-13 v0.03 PL: - replaced prettytable by tablestream, added colors +# 2016-07-20 v0.50 SL: - added Python 3 support +# 2016-08-09 PL: - fixed issue #77 (imports from thirdparty dir) + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: +# TODO: show FAT/MiniFAT +# TODO: show errors when reading streams + +# === IMPORTS ================================================================ + +import sys, os + +# add the thirdparty subfolder to sys.path (absolute+normalized path): +_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) +# print('_thismodule_dir = %r' % _thismodule_dir) +# assumption: the thirdparty dir is a subfolder: +_thirdparty_dir = os.path.normpath(os.path.join(_thismodule_dir, 'thirdparty')) +# print('_thirdparty_dir = %r' % _thirdparty_dir) +if not _thirdparty_dir in sys.path: + sys.path.insert(0, _thirdparty_dir) + +import colorclass + +# On Windows, colorclass needs to be enabled: +if os.name == 'nt': + colorclass.Windows.enable(auto_colors=True) + +import olefile +from tablestream import tablestream + + +# === CONSTANTS ============================================================== + +STORAGE_NAMES = { + olefile.STGTY_EMPTY: 'Empty', + 
olefile.STGTY_STORAGE: 'Storage', + olefile.STGTY_STREAM: 'Stream', + olefile.STGTY_LOCKBYTES: 'ILockBytes', + olefile.STGTY_PROPERTY: 'IPropertyStorage', + olefile.STGTY_ROOT: 'Root', +} + +STORAGE_COLORS = { + olefile.STGTY_EMPTY: 'green', + olefile.STGTY_STORAGE: 'blue', + olefile.STGTY_STREAM: 'yellow', + olefile.STGTY_LOCKBYTES: 'magenta', + olefile.STGTY_PROPERTY: 'magenta', + olefile.STGTY_ROOT: 'cyan', +} + +STATUS_COLORS = { + 'unused': 'green', + '': 'yellow', + 'ORPHAN': 'red', +} + + +# === FUNCTIONS ============================================================== + +def sid_display(sid): + if sid == olefile.NOSTREAM: + return '-' # None + else: + return sid + + +# === MAIN =================================================================== + +def main(): + # print banner with version + print('oledir %s - http://decalage.info/python/oletools' % __version__) + + if os.name == 'nt': + colorclass.Windows.enable(auto_colors=True, reset_atexit=True) + + fname = sys.argv[1] + print('OLE directory entries in file %s:' % fname) + ole = olefile.OleFileIO(fname) + # ole.dumpdirectory() + + # t = prettytable.PrettyTable(('id', 'Status', 'Type', 'Name', 'Left', 'Right', 'Child', '1st Sect', 'Size')) + # t.align = 'l' + # t.max_width['id'] = 4 + # t.max_width['Status'] = 6 + # t.max_width['Type'] = 10 + # t.max_width['Name'] = 10 + # t.max_width['Left'] = 5 + # t.max_width['Right'] = 5 + # t.max_width['Child'] = 5 + # t.max_width['1st Sect'] = 8 + # t.max_width['Size'] = 6 + + table = tablestream.TableStream(column_width=[4, 6, 7, 22, 5, 5, 5, 8, 6], + header_row=('id', 'Status', 'Type', 'Name', 'Left', 'Right', 'Child', '1st Sect', 'Size'), + style=tablestream.TableStyleSlim) + + # TODO: read ALL the actual directory entries from the directory stream, because olefile does not! + # TODO: OR fix olefile! 
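The loop below relies on olefile's behaviour: entries reachable from the root through the left/right/child links are loaded into ole.direntries, while the rest stay None until _load_direntry() is called, and are then classified as "unused" (STGTY_EMPTY) or "ORPHAN" (deleted but still present in the directory stream). The classification can be illustrated with a toy directory table; the table contents here are hypothetical, only the left/right/child mechanics match the OLE format:

```python
# Sketch: how oledir distinguishes in-tree, unused and orphan directory entries.
NOSTREAM = 0xFFFFFFFF  # olefile.NOSTREAM: "no sibling/child" marker
STGTY_EMPTY = 0        # olefile.STGTY_EMPTY: unused directory slot

# hypothetical directory table: sid -> (entry_type, sid_left, sid_right, sid_child)
direntries = {
    0: (5, NOSTREAM, NOSTREAM, 1),                    # Root storage, child = 1
    1: (2, NOSTREAM, 3, NOSTREAM),                    # Stream, right sibling = 3
    2: (STGTY_EMPTY, NOSTREAM, NOSTREAM, NOSTREAM),   # unused slot
    3: (2, NOSTREAM, NOSTREAM, NOSTREAM),             # Stream, in the tree
    4: (2, NOSTREAM, NOSTREAM, NOSTREAM),             # deleted stream -> ORPHAN
}

def reachable(entries, sid=0, seen=None):
    """Collect every sid reachable from the root through left/right/child."""
    seen = set() if seen is None else seen
    if sid == NOSTREAM or sid in seen:
        return seen
    seen.add(sid)
    _, left, right, child = entries[sid]
    for nxt in (left, right, child):
        reachable(entries, nxt, seen)
    return seen

in_tree = reachable(direntries)
for sid in sorted(direntries):
    etype = direntries[sid][0]
    status = '' if sid in in_tree else ('unused' if etype == STGTY_EMPTY else 'ORPHAN')
    print(sid, status or 'in tree')
```

Orphan streams are interesting for forensics: their data sectors may still be recoverable even though no name in the visible tree points at them.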
+ # TODO: olefile should store or give access to the raw direntry data on demand + # TODO: oledir option to hexdump the raw direntries + # TODO: olefile should be less picky about incorrect directory structures + + for id in range(len(ole.direntries)): + d = ole.direntries[id] + if d is None: + # this direntry is not part of the tree: either unused or an orphan + d = ole._load_direntry(id) #ole.direntries[id] + # print('%03d: %s *** ORPHAN ***' % (id, d.name)) + if d.entry_type == olefile.STGTY_EMPTY: + status = 'unused' + else: + status = 'ORPHAN' + else: + # print('%03d: %s' % (id, d.name)) + status = '' + if d.name.startswith('\x00'): + # this may happen with unused entries, the name may be filled with zeroes + name = '' + else: + # handle non-printable chars using repr(), remove quotes: + name = repr(d.name)[1:-1] + left = sid_display(d.sid_left) + right = sid_display(d.sid_right) + child = sid_display(d.sid_child) + entry_type = STORAGE_NAMES.get(d.entry_type, 'Unknown') + etype_color = STORAGE_COLORS.get(d.entry_type, 'red') + status_color = STATUS_COLORS.get(status, 'red') + + # print(' type=%7s sid_left=%s sid_right=%s sid_child=%s' + # %(entry_type, left, right, child)) + # t.add_row((id, status, entry_type, name, left, right, child, hex(d.isectStart), d.size)) + table.write_row((id, status, entry_type, name, left, right, child, '%X' % d.isectStart, d.size), + colors=(None, status_color, etype_color, None, None, None, None, None, None)) + ole.close() + # print t + + +if __name__ == '__main__': + main() \ No newline at end of file diff -Nru remnux-oletools-0.51a/remnux-oletools/oleid.py remnux-oletools-0.51a/remnux-oletools/oleid.py --- remnux-oletools-0.51a/remnux-oletools/oleid.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/oleid.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,298 @@ +#!/usr/bin/env python +""" +oleid.py + +oleid is a script to analyze OLE files such as MS Office documents (e.g. 
Word, +Excel), to detect specific characteristics that could potentially indicate that +the file is suspicious or malicious, in terms of security (e.g. malware). +For example it can detect VBA macros, embedded Flash objects, fragmentation. +The results can be displayed or returned as XML for further processing. + +Usage: oleid.py <file> + +oleid project website: http://www.decalage.info/python/oleid + +oleid is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +#=== LICENSE ================================================================= + +# oleid is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
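oleid's detect_flash() routine, defined further down, scans raw stream bytes for embedded SWF files. Its core steps can be exercised standalone: find the FWS/CWS magic, read the version byte and the declared little-endian size with struct, and for a compressed (CWS) body verify that it actually inflates with zlib. In this sketch the minimum-size check is relaxed (the real code rejects anything under 1024 bytes) so that a tiny synthetic SWF passes:

```python
# Standalone sketch of the SWF-in-stream detection idea used by detect_flash().
import re, struct, zlib

def find_swf(data):
    """Return (start, size, compressed) for each plausible SWF found in data."""
    hits = []
    for m in re.finditer(b'FWS|CWS', data):
        start = m.start()
        if start + 8 > len(data):
            continue  # not even room for the 8-byte header
        ver = struct.unpack('<b', data[start+3:start+4])[0]
        size = struct.unpack('<i', data[start+4:start+8])[0]
        # the real oleid also requires size >= 1024; relaxed here for the demo
        if ver > 20 or size < 9 or start + size > len(data):
            continue
        compressed = data[start:start+3] == b'CWS'
        if compressed:
            # a CWS body must inflate with zlib, otherwise it is a false hit
            try:
                zlib.decompress(data[start+8:start+size])
            except zlib.error:
                continue
        hits.append((start, size, compressed))
    return hits

body = zlib.compress(b'\x00' * 100)
swf = b'CWS' + bytes([10]) + struct.pack('<i', 8 + len(body)) + body
print(find_swf(b'junk' + swf + b'more junk'))
```

As the module notes, uncompressed FWS hits are accepted without further validation, so some false positives are expected; the zlib round-trip makes CWS hits much more reliable.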
+ +from __future__ import print_function + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2012-10-29 v0.01 PL: - first version +# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL +# - improved usage display with -h +# 2014-11-30 v0.03 PL: - improved output with prettytable +# 2016-10-25 v0.50 PL: - fixed print and bytes strings for Python 3 + +__version__ = '0.50' + + +#------------------------------------------------------------------------------ +# TODO: +# + extract relevant metadata: codepage, author, application, timestamps, etc +# - detect RTF and OpenXML +# - fragmentation +# - OLE package +# - entropy +# - detect PE header? +# - detect NOPs? +# - list type of each object in object pool? +# - criticality for each indicator?: info, low, medium, high +# - support wildcards with glob? +# - verbose option +# - csv, xml output + + +#=== IMPORTS ================================================================= + +import optparse, sys, os, re, zlib, struct +import thirdparty.olefile as olefile +from thirdparty.prettytable import prettytable + + +#=== FUNCTIONS =============================================================== + +def detect_flash (data): + """ + Detect Flash objects (SWF files) within a binary string of data + return a list of (start_index, length, compressed) tuples, or [] if nothing + found. 
+ + Code inspired from xxxswf.py by Alexander Hanel (but significantly reworked) + http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html + """ + #TODO: report + found = [] + for match in re.finditer(b'CWS|FWS', data): + start = match.start() + if start+8 > len(data): + # header size larger than remaining data, this is not a SWF + continue + #TODO: one struct.unpack should be simpler + # Read Header + header = data[start:start+3] + # Read Version + ver = struct.unpack('<b', data[start+3:start+4])[0] + # Error check for version above 20 + if ver > 20: + continue + # Read SWF Size + size = struct.unpack('<i', data[start+4:start+8])[0] + if start+size > len(data) or size < 1024: + # declared size larger than remaining data, this is not a SWF + # or declared size too small for a usual SWF + continue + # Read SWF into buffer. If compressed read uncompressed size. + swf = data[start:start+size] + compressed = False + if b'CWS' in header: + compressed = True + # compressed SWF: data after header (8 bytes) until the end is + # compressed with zlib. Attempt to decompress it to check if it is + # valid + compressed_data = swf[8:] + try: + zlib.decompress(compressed_data) + except: + continue + # else we don't check anything at this stage, we only assume it is a + # valid SWF. So there might be false positives for uncompressed SWF. 
+ found.append((start, size, compressed)) + #print 'Found SWF start=%x, length=%d' % (start, size) + return found + + +#=== CLASSES ================================================================= + +class Indicator (object): + + def __init__(self, _id, value=None, _type=bool, name=None, description=None): + self.id = _id + self.value = value + self.type = _type + self.name = name + if name == None: + self.name = _id + self.description = description + + +class OleID: + + def __init__(self, filename): + self.filename = filename + self.indicators = [] + + def check(self): + # check if it is actually an OLE file: + oleformat = Indicator('ole_format', True, name='OLE format') + self.indicators.append(oleformat) + if not olefile.isOleFile(self.filename): + oleformat.value = False + return self.indicators + # parse file: + self.ole = olefile.OleFileIO(self.filename) + # checks: + self.check_properties() + self.check_encrypted() + self.check_word() + self.check_excel() + self.check_powerpoint() + self.check_visio() + self.check_ObjectPool() + self.check_flash() + self.ole.close() + return self.indicators + + def check_properties (self): + suminfo = Indicator('has_suminfo', False, name='Has SummaryInformation stream') + self.indicators.append(suminfo) + appname = Indicator('appname', 'unknown', _type=str, name='Application name') + self.indicators.append(appname) + self.suminfo = {} + # check stream SummaryInformation + if self.ole.exists("\x05SummaryInformation"): + suminfo.value = True + self.suminfo = self.ole.getproperties("\x05SummaryInformation") + # check application name: + appname.value = self.suminfo.get(0x12, 'unknown') + + def check_encrypted (self): + # we keep the pointer to the indicator, can be modified by other checks: + self.encrypted = Indicator('encrypted', False, name='Encrypted') + self.indicators.append(self.encrypted) + # check if bit 1 of security field = 1: + # (this field may be missing for Powerpoint2000, for example) + if 0x13 in self.suminfo: 
+ if self.suminfo[0x13] & 1: + self.encrypted.value = True + + def check_word (self): + word = Indicator('word', False, name='Word Document', + description='Contains a WordDocument stream, very likely to be a Microsoft Word Document.') + self.indicators.append(word) + self.macros = Indicator('vba_macros', False, name='VBA Macros') + self.indicators.append(self.macros) + if self.ole.exists('WordDocument'): + word.value = True + # check for Word-specific encryption flag: + s = self.ole.openstream(["WordDocument"]) + # pass header 10 bytes + s.read(10) + # read flag structure: + temp16 = struct.unpack("H", s.read(2))[0] + fEncrypted = (temp16 & 0x0100) >> 8 + if fEncrypted: + self.encrypted.value = True + s.close() + # check for VBA macros: + if self.ole.exists('Macros'): + self.macros.value = True + + def check_excel (self): + excel = Indicator('excel', False, name='Excel Workbook', + description='Contains a Workbook or Book stream, very likely to be a Microsoft Excel Workbook.') + self.indicators.append(excel) + #self.macros = Indicator('vba_macros', False, name='VBA Macros') + #self.indicators.append(self.macros) + if self.ole.exists('Workbook') or self.ole.exists('Book'): + excel.value = True + # check for VBA macros: + if self.ole.exists('_VBA_PROJECT_CUR'): + self.macros.value = True + + def check_powerpoint (self): + ppt = Indicator('ppt', False, name='PowerPoint Presentation', + description='Contains a PowerPoint Document stream, very likely to be a Microsoft PowerPoint Presentation.') + self.indicators.append(ppt) + if self.ole.exists('PowerPoint Document'): + ppt.value = True + + def check_visio (self): + visio = Indicator('visio', False, name='Visio Drawing', + description='Contains a VisioDocument stream, very likely to be a Microsoft Visio Drawing.') + self.indicators.append(visio) + if self.ole.exists('VisioDocument'): + visio.value = True + + def check_ObjectPool (self): + objpool = Indicator('ObjectPool', False, name='ObjectPool', + 
description='Contains an ObjectPool stream, very likely to contain embedded OLE objects or files.') + self.indicators.append(objpool) + if self.ole.exists('ObjectPool'): + objpool.value = True + + + def check_flash (self): + flash = Indicator('flash', 0, _type=int, name='Flash objects', + description='Number of embedded Flash objects (SWF files) detected in OLE streams. Not 100% accurate, there may be false positives.') + self.indicators.append(flash) + for stream in self.ole.listdir(): + data = self.ole.openstream(stream).read() + found = detect_flash(data) + # just add to the count of Flash objects: + flash.value += len(found) + #print stream, found + + +#=== MAIN ================================================================= + +def main(): + usage = 'usage: %prog [options] <file>' + parser = optparse.OptionParser(usage=__doc__ + '\n' + usage) +## parser.add_option('-o', '--ole', action='store_true', dest='ole', help='Parse an OLE file (e.g. Word, Excel) to look for SWF in each stream') + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + parser.print_help() + return + + for filename in args: + print('\nFilename:', filename) + oleid = OleID(filename) + indicators = oleid.check() + + #TODO: add description + #TODO: highlight suspicious indicators + t = prettytable.PrettyTable(['Indicator', 'Value']) + t.align = 'l' + t.max_width = 39 + #t.border = False + + for indicator in indicators: + #print '%s: %s' % (indicator.name, indicator.value) + t.add_row((indicator.name, indicator.value)) + + print(t) + +if __name__ == '__main__': + main() diff -Nru remnux-oletools-0.51a/remnux-oletools/olemap.py remnux-oletools-0.51a/remnux-oletools/olemap.py --- remnux-oletools-0.51a/remnux-oletools/olemap.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/olemap.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,124 @@ +#!/usr/bin/env python +""" +olemap + +olemap parses OLE files to display 
technical information about its structure. + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +olemap is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +#=== LICENSE ================================================================== + +# olemap is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2015-11-01 v0.01 PL: - first version +# 2016-01-13 v0.02 PL: - improved display with tablestream, added colors +# 2016-07-20 v0.50 SL: - added Python 3 support +# 2016-09-05 PL: - added main entry point for setup.py + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: + +# === IMPORTS ================================================================ + +import sys +from thirdparty.olefile import olefile +from thirdparty.tablestream import tablestream + + + +def sid_display(sid): + if sid == olefile.NOSTREAM: + return None + else: + return sid + +STORAGE_NAMES = { + olefile.STGTY_EMPTY: 'Empty', + olefile.STGTY_STORAGE: 'Storage', + olefile.STGTY_STREAM: 'Stream', + olefile.STGTY_LOCKBYTES: 'ILockBytes', + olefile.STGTY_PROPERTY: 'IPropertyStorage', + olefile.STGTY_ROOT: 'Root', +} + +FAT_TYPES = { + olefile.FREESECT: "Free", + olefile.ENDOFCHAIN: "End of Chain", + olefile.FATSECT: "FAT Sector", + olefile.DIFSECT: "DIFAT Sector" + } + +FAT_COLORS = { + olefile.FREESECT: "green", + olefile.ENDOFCHAIN: "yellow", + olefile.FATSECT: "cyan", + olefile.DIFSECT: "blue", + 'default': None, + } + + +# === MAIN =================================================================== + +def main(): + # print banner with version + print('olemap %s - http://decalage.info/python/oletools' % __version__) + + fname = sys.argv[1] + ole = olefile.OleFileIO(fname) + + print('FAT:') + t = tablestream.TableStream([8, 12, 8, 8], header_row=['Sector #', 'Type', 'Offset', 'Next #']) + for i in range(ole.nb_sect): + fat_value = ole.fat[i] + fat_type = FAT_TYPES.get(fat_value, '') + color_type = FAT_COLORS.get(fat_value, FAT_COLORS['default']) + # compute offset based on sector size: + offset = ole.sectorsize * (i+1) + # print '%8X: %-12s offset=%08X next=%8X' % (i, fat_type, 0, fat_value) + t.write_row(['%8X' % i, fat_type, '%08X' % 
offset, '%8X' % fat_value], + colors=[None, color_type, None, None]) + print('') + + print('MiniFAT:') + # load MiniFAT if it wasn't already done: + ole.loadminifat() + for i in range(len(ole.minifat)): + fat_value = ole.minifat[i] + fat_type = FAT_TYPES.get(fat_value, 'Data') + print('%8X: %-12s offset=%08X next=%8X' % (i, fat_type, 0, fat_value)) + + ole.close() + +if __name__ == '__main__': + main() diff -Nru remnux-oletools-0.51a/remnux-oletools/olemeta.py remnux-oletools-0.51a/remnux-oletools/olemeta.py --- remnux-oletools-0.51a/remnux-oletools/olemeta.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/olemeta.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,120 @@ +#!/usr/bin/env python +""" +olemeta.py + +olemeta is a script to parse OLE files such as MS Office documents (e.g. Word, +Excel), to extract all standard properties present in the OLE file. + +Usage: olemeta.py + +olemeta project website: http://www.decalage.info/python/olemeta + +olemeta is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +#=== LICENSE ================================================================= + +# olemeta is copyright (c) 2013-2016, Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2013-07-24 v0.01 PL: - first version +# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL +# - improved usage display +# 2015-12-29 v0.03 PL: - only display properties present in the file +# 2016-09-06 v0.50 PL: - added main entry point for setup.py +# 2016-10-25 PL: - fixed print for Python 3 +# 2016-10-28 PL: - removed the UTF8 codec for console display + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: +# + optparse +# + nicer output: table with fixed columns, datetime, etc +# + CSV output +# + option to only show available properties (by default) + +#=== IMPORTS ================================================================= + +import sys, codecs +import thirdparty.olefile as olefile +from thirdparty.tablestream import tablestream + + +#=== MAIN ================================================================= + +def main(): + try: + ole = olefile.OleFileIO(sys.argv[1]) + except IndexError: + sys.exit(__doc__) + + # parse and display metadata: + meta = ole.get_metadata() + + # console output with UTF8 encoding: + # 
It looks like we do not need the UTF8 codec anymore, both for Python 2 and 3 + console_utf8 = sys.stdout #codecs.getwriter('utf8')(sys.stdout) + + # TODO: move similar code to a function + + print('Properties from the SummaryInformation stream:') + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) + for prop in meta.SUMMARY_ATTRIBS: + value = getattr(meta, prop) + if value is not None: + # TODO: pretty printing for strings, dates, numbers + # TODO: better unicode handling + # print('- %s: %s' % (prop, value)) + # if isinstance(value, unicode): + # # encode to UTF8, avoiding errors + # value = value.encode('utf-8', errors='replace') + # else: + # value = str(value) + t.write_row([prop, value], colors=[None, 'yellow']) + t.close() + print('') + + print('Properties from the DocumentSummaryInformation stream:') + t = tablestream.TableStream([21, 30], header_row=['Property', 'Value'], outfile=console_utf8) + for prop in meta.DOCSUM_ATTRIBS: + value = getattr(meta, prop) + if value is not None: + # TODO: pretty printing for strings, dates, numbers + # TODO: better unicode handling + # print('- %s: %s' % (prop, value)) + # if isinstance(value, unicode): + # # encode to UTF8, avoiding errors + # value = value.encode('utf-8', errors='replace') + # else: + # value = str(value) + t.write_row([prop, value], colors=[None, 'yellow']) + t.close() + + ole.close() + +if __name__ == '__main__': + main() \ No newline at end of file diff -Nru remnux-oletools-0.51a/remnux-oletools/oleobj.py remnux-oletools-0.51a/remnux-oletools/oleobj.py --- remnux-oletools-0.51a/remnux-oletools/oleobj.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/oleobj.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,451 @@ +#!/usr/bin/env python +from __future__ import print_function +""" +oleobj.py + +oleobj is a Python script and module to parse OLE objects and files stored +into various file formats such as RTF or MS Office documents 
(e.g. Word, Excel). + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +oleobj is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +# === LICENSE ================================================================== + +# oleobj is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2015-12-05 v0.01 PL: - first version +# 2016-06 PL: - added main and process_file (not working yet) +# 2016-07-18 v0.48 SL: - added Python 3.5 support +# 2016-07-19 PL: - fixed Python 2.6-7 support + +__version__ = '0.48' + +#------------------------------------------------------------------------------ +# TODO: +# + setup logging (common with other oletools) + + +#------------------------------------------------------------------------------ +# REFERENCES: + +# Reference for the storage of embedded OLE objects/files: +# [MS-OLEDS]: Object Linking and Embedding (OLE) Data Structures +# https://msdn.microsoft.com/en-us/library/dd942265.aspx + +# - officeparser: https://github.com/unixfreak0037/officeparser +# TODO: oledump + + +#--- IMPORTS ------------------------------------------------------------------ + +import logging, struct, optparse, os, re, sys + +from thirdparty.olefile import olefile +from thirdparty.xglob import xglob + +# === LOGGING ================================================================= + +class NullHandler(logging.Handler): + """ + Log Handler without output, to avoid printing messages if logging is not + configured by the main application. + Python 2.7 has logging.NullHandler, but this is necessary for 2.6: + see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library + """ + def emit(self, record): + pass + +def get_logger(name, level=logging.CRITICAL+1): + """ + Create a suitable logger object for this module. + The goal is not to change settings of the root logger, to avoid getting + other modules' logs on the screen. + If a logger exists with same name, reuse it. (Else it would have duplicate + handlers and messages would be doubled.) + The level is set to CRITICAL+1 by default, to avoid any logging. 
+ """ + # First, test if there is already a logger with the same name, else it + # will generate duplicate messages (due to duplicate handlers): + if name in logging.Logger.manager.loggerDict: + #NOTE: another less intrusive but more "hackish" solution would be to + # use getLogger then test if its effective level is not default. + logger = logging.getLogger(name) + # make sure level is OK: + logger.setLevel(level) + return logger + # get a new logger: + logger = logging.getLogger(name) + # only add a NullHandler for this logger, it is up to the application + # to configure its own logging: + logger.addHandler(NullHandler()) + logger.setLevel(level) + return logger + +# a global logger object used for debugging: +log = get_logger('oleobj') + + +# === CONSTANTS ============================================================== + +# some str methods on Python 2.x return characters, +# while the equivalent bytes methods return integers on Python 3.x: +if sys.version_info[0] <= 2: + # Python 2.x + NULL_CHAR = '\x00' +else: + # Python 3.x + NULL_CHAR = 0 + + +# === GLOBAL VARIABLES ======================================================= + +# struct to parse an unsigned integer of 32 bits: +struct_uint32 = struct.Struct(' FILETIME from olefile + self.unknown_long_1, data = read_uint32(data) + self.unknown_long_2, data = read_uint32(data) + # temp path? + self.temp_path, data = data.split(b'\x00', 1) + # size of the rest of the data + self.actual_size, data = read_uint32(data) + self.data = data[0:self.actual_size] + # TODO: exception when size > remaining data + # TODO: SLACK DATA + + +class OleObject (object): + """ + OLE 1.0 Object + + see MS-OLEDS 2.2 OLE1.0 Format Structures + """ + + # constants for the format_id attribute: + # see MS-OLEDS 2.2.4 ObjectHeader + TYPE_LINKED = 0x01 + TYPE_EMBEDDED = 0x02 + + + def __init__(self, bindata=None): + """ + Constructor for OleObject. + If bindata is provided, it will be parsed using the parse() method. 
+ + :param bindata: bytes, OLE 1.0 Object structure containing an OLE object + """ + self.ole_version = None + self.format_id = None + self.class_name = None + self.topic_name = None + self.item_name = None + self.data = None + self.data_size = None + + def parse(self, data): + """ + Parse binary data containing an OLE 1.0 Object structure, + to extract the OLE object it contains. + (see MS-OLEDS 2.2 OLE1.0 Format Structures) + + :param data: bytes, OLE 1.0 Object structure containing an OLE object + :return: + """ + # Header: see MS-OLEDS 2.2.4 ObjectHeader + self.ole_version, data = read_uint32(data) + self.format_id, data = read_uint32(data) + log.debug('OLE version=%08X - Format ID=%08X' % (self.ole_version, self.format_id)) + assert self.format_id in (self.TYPE_EMBEDDED, self.TYPE_LINKED) + self.class_name, data = read_LengthPrefixedAnsiString(data) + self.topic_name, data = read_LengthPrefixedAnsiString(data) + self.item_name, data = read_LengthPrefixedAnsiString(data) + log.debug('Class name=%r - Topic name=%r - Item name=%r' + % (self.class_name, self.topic_name, self.item_name)) + if self.format_id == self.TYPE_EMBEDDED: + # Embedded object: see MS-OLEDS 2.2.5 EmbeddedObject + #assert self.topic_name != '' and self.item_name != '' + self.data_size, data = read_uint32(data) + log.debug('Declared data size=%d - remaining size=%d' % (self.data_size, len(data))) + # TODO: handle incorrect size to avoid exception + self.data = data[:self.data_size] + assert len(self.data) == self.data_size + self.extra_data = data[self.data_size:] + + + +def sanitize_filename(filename, replacement='_', max_length=200): + """compute basename of filename. Replaces all non-whitelisted characters. + The returned filename is always a basename of the file.""" + basepath = os.path.basename(filename).strip() + sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) + + while ".." 
in sane_fname: + sane_fname = sane_fname.replace('..', '.') + + while " " in sane_fname: + sane_fname = sane_fname.replace(' ', ' ') + + if not len(filename): + sane_fname = 'NONAME' + + # limit filename length + if max_length: + sane_fname = sane_fname[:max_length] + + return sane_fname + + +def process_file(container, filename, data, output_dir=None): + if output_dir: + if not os.path.isdir(output_dir): + log.info('creating output directory %s' % output_dir) + os.mkdir(output_dir) + + fname_prefix = os.path.join(output_dir, + sanitize_filename(filename)) + else: + base_dir = os.path.dirname(filename) + sane_fname = sanitize_filename(filename) + fname_prefix = os.path.join(base_dir, sane_fname) + + # TODO: option to extract objects to files (false by default) + if data is None: + data = open(filename, 'rb').read() + print ('-'*79) + print ('File: %r - %d bytes' % (filename, len(data))) + ole = olefile.OleFileIO(data) + index = 1 + for stream in ole.listdir(): + objdata = ole.openstream(stream).read() + stream_path = '/'.join(stream) + log.debug('Checking stream %r' % stream_path) + obj = OleObject() + try: + obj.parse(objdata) + print('extract file embedded in OLE object from stream %r:' % stream_path) + print('format_id = %d' % obj.format_id) + print('class name = %r' % obj.class_name) + print('data size = %d' % obj.data_size) + # set a file extension according to the class name: + class_name = obj.class_name.lower() + if class_name.startswith('word'): + ext = 'doc' + elif class_name.startswith('package'): + ext = 'package' + else: + ext = 'bin' + + fname = '%s_object_%03d.%s' % (fname_prefix, index, ext) + print ('saving to file %s' % fname) + open(fname, 'wb').write(obj.data) + if obj.class_name.lower() == 'package': + print ('Parsing OLE Package') + opkg = OleNativeStream(bindata=obj.data) + print ('Filename = %r' % opkg.filename) + print ('Source path = %r' % opkg.src_path) + print ('Temp path = %r' % opkg.temp_path) + if opkg.filename: + fname = '%s_%s' % 
(fname_prefix, + sanitize_filename(opkg.filename)) + else: + fname = '%s_object_%03d.noname' % (fname_prefix, index) + print ('saving to file %s' % fname) + open(fname, 'wb').write(opkg.data) + index += 1 + except: + log.debug('*** Not an OLE 1.0 Object') + + + +#=== MAIN ================================================================= + +if __name__ == '__main__': + # print banner with version + print ('oleobj %s - http://decalage.info/oletools' % __version__) + print ('THIS IS WORK IN PROGRESS - Check updates regularly!') + print ('Please report any issue at https://github.com/decalage2/oletools/issues') + print ('') + + DEFAULT_LOG_LEVEL = "warning" # Default log level + LOG_LEVELS = {'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL + } + + usage = 'usage: %prog [options] [filename2 ...]' + parser = optparse.OptionParser(usage=usage) + # parser.add_option('-o', '--outfile', dest='outfile', + # help='output file') + # parser.add_option('-c', '--csv', dest='csv', + # help='export results to a CSV file') + parser.add_option("-r", action="store_true", dest="recursive", + help='find files recursively in subdirectories.') + parser.add_option("-d", type="str", dest="output_dir", + help='use specified directory to output files.', default=None) + parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, + help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)') + parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', + help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. 
(default:*)') + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical (default=%default)") + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + print (__doc__) + parser.print_help() + sys.exit() + + # Setup logging to the console: + # here we use stdout instead of stderr by default, so that the output + # can be redirected properly. + logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout, + format='%(levelname)-8s %(message)s') + # enable logging in the modules: + log.setLevel(logging.NOTSET) + + + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, + zip_password=options.zip_password, zip_fname=options.zip_fname): + # ignore directory names stored in zip files: + if container and filename.endswith('/'): + continue + process_file(container, filename, data, options.output_dir) + + diff -Nru remnux-oletools-0.51a/remnux-oletools/oletimes.py remnux-oletools-0.51a/remnux-oletools/oletimes.py --- remnux-oletools-0.51a/remnux-oletools/oletimes.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/oletimes.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,108 @@ +#!/usr/bin/env python +""" +oletimes.py + +oletimes is a script to parse OLE files such as MS Office documents (e.g. Word, +Excel), to extract creation and modification times of all streams and storages +in the OLE file. + +Usage: oletimes.py + +oletimes project website: http://www.decalage.info/python/oletimes + +oletimes is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +#=== LICENSE ================================================================= + +# oletimes is copyright (c) 2013-2016, Philippe Lagadec (http://www.decalage.info) +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2013-07-24 v0.01 PL: - first version +# 2014-11-29 v0.02 PL: - use olefile instead of OleFileIO_PL +# - improved usage display +# 2014-11-30 v0.03 PL: - improved output with prettytable +# 2016-07-20 v0.50 SL: - added Python 3 support +# 2016-09-05 PL: - added main entry point for setup.py + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: +# + optparse +# + nicer output: table with fixed columns, datetime, etc +# + CSV output +# + option to only show available timestamps (by default?) 
+ +#=== IMPORTS ================================================================= + +import sys, datetime +import thirdparty.olefile as olefile +from thirdparty.prettytable import prettytable + + +# === MAIN =================================================================== + +def main(): + # print banner with version + print('oletimes %s - http://decalage.info/python/oletools' % __version__) + + try: + ole = olefile.OleFileIO(sys.argv[1]) + except IndexError: + sys.exit(__doc__) + + def dt2str (dt): + """ + Convert a datetime object to a string for display, without microseconds + + :param dt: datetime.datetime object, or None + :return: str, or None + """ + if dt is None: + return None + dt = dt.replace(microsecond = 0) + return str(dt) + + t = prettytable.PrettyTable(['Stream/Storage name', 'Modification Time', 'Creation Time']) + t.align = 'l' + t.max_width = 26 + #t.border = False + + #print'- Root mtime=%s ctime=%s' % (ole.root.getmtime(), ole.root.getctime()) + t.add_row(('Root', dt2str(ole.root.getmtime()), dt2str(ole.root.getctime()))) + + for obj in ole.listdir(streams=True, storages=True): + #print '- %s: mtime=%s ctime=%s' % (repr('/'.join(obj)), ole.getmtime(obj), ole.getctime(obj)) + t.add_row((repr('/'.join(obj)), dt2str(ole.getmtime(obj)), dt2str(ole.getctime(obj)))) + + print(t) + + ole.close() + +if __name__ == '__main__': + main() diff -Nru remnux-oletools-0.51a/remnux-oletools/olevba3.py remnux-oletools-0.51a/remnux-oletools/olevba3.py --- remnux-oletools-0.51a/remnux-oletools/olevba3.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/olevba3.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,3384 @@ +#!/usr/bin/env python +""" +olevba.py + +olevba is a script to parse OLE and OpenXML files such as MS Office documents +(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate +and analyze malicious macros. 
+ +Supported formats: +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) +- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) +- Word 2003 XML (.xml) +- Word/Excel Single File Web Page / MHTML (.mht) + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +olevba is part of the python-oletools package: +http://www.decalage.info/python/oletools + +olevba is based on source code from officeparser by John William Davison +https://github.com/unixfreak0037/officeparser +""" + +# === LICENSE ================================================================== + +# olevba is copyright (c) 2014-2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# olevba contains modified source code from the officeparser project, published +# under the following MIT License (MIT): +# +# officeparser is copyright (c) 2014 John William Davison +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2014-08-05 v0.01 PL: - first version based on officeparser code +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser +# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats +# and to find the VBA project root anywhere in the file +# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API +# - added detect_vba_macros +# 2014-12-10 v0.06 PL: - hide first lines with VB attributes +# - detect auto-executable macros +# - ignore empty macros +# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive +# 2014-12-15 v0.08 PL: - improved display for empty macros +# - added pattern extraction +# 2014-12-25 v0.09 PL: - added suspicious keywords detection +# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file +# - uses xglob to scan several files with wildcards +# - option -r to recurse subdirectories +# - option -z to scan files in password-protected zips +# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons +# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns +# - process_file: improved display, shows container file +# - improved list of executable file extensions +# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display +# 2015-01-08 v0.14 PL: - added hex strings detection and decoding +# - fixed issue #2, decoding VBA stream names using +# specified codepage and unicode stream names +# 2015-01-11 v0.15 PL: - added new triage mode, options -t and -d +# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") +# - added several suspicious keywords +# - added option -i to analyze VBA source code directly +# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions +# - added 
scan_vba to run all detection algorithms +# - decoded hex strings are now also scanned + reversed +# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules +# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex +# strings and StrReverse +# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded +# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding +# - improved display, shows obfuscation name +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename +# - added Base64 obfuscation decoding (contribution from +# @JamesHabben) +# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and +# Dridex strings +# - exception handling in detect_base64_strings +# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display +# - display exceptions with stack trace +# - added several suspicious keywords +# - improved Base64 detection and decoding +# - fixed triage mode not to scan attrib lines +# 2015-03-04 v0.25 PL: - added support for Word 2003 XML +# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and +# virtualisation detection +# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros +# (issue #10 reported by Greg from SpamStopsHere) +# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header +# (issue #11 reported by Thomas Chopitea) +# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account +# various data offsets (issue #12) +# - improved detection of MSO files, avoiding incorrect +# parsing errors (issue #7) +# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by @ozhermit, +# Davy Douhine (issue #9), issue #13 +# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc) +# 2015-06-19 PL: - added options -a, -c, --each, --attr +# 2015-06-21 v0.32 PL: - always display decoded strings which are printable +# - fix VBA_Scanner.scan to return raw strings, not 
repr() +# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues +# 2015-07-12 PL: - added Hex function decoding to VBA Parser +# 2015-07-13 PL: - added Base64 function decoding to VBA Parser +# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions +# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI +# - fixed issue when analysis was done twice +# 2015-09-15 PL: - remove duplicate IOCs from results +# 2015-09-16 PL: - join long VBA lines ending with underscore before scan +# - disabled unused option --each +# 2015-09-22 v0.41 PL: - added new option --reveal +# - added suspicious strings for PowerShell.exe options +# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method +# 2015-10-10 PL: - added support for text files with VBA source code +# 2015-11-17 PL: - fixed bug with --decode option +# 2015-12-16 PL: - fixed bug in main (no options input anymore) +# - improved logging, added -l option +# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht +# - fixed issue #32 by monkeypatching email.feedparser +# 2016-02-07 PL: - KeyboardInterrupt is now raised properly +# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr +# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords +# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis +# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck) +# 2016-03-16 CH: - added option --no-deobfuscate (temporary) +# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate +# - updated suspicious keywords +# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans +# 2016-04-28 CH: - return an exit code depending on the results +# - improved error and exception handling +# - improved JSON output +# 2016-05-12 CH: - added support for PowerPoint 97-2003 files +# 2016-06-06 CH: - improved handling of unicode VBA module names +# 2016-06-07 CH: - added option --relaxed, stricter parsing by 
default +# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code +# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6 +# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding) +# 2016-10-25 PL: - fixed regex bytes strings (PR/issue #100) + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: +# + setup logging (common with other oletools) +# + add xor bruteforcing like bbharvest +# + options -a and -c should imply -d + +# TODO later: +# + performance improvement: instead of searching each keyword separately, +# first split vba code into a list of words (per line), then check each +# word against a dict. (or put vba words into a set/dict?) +# + for regex, maybe combine them into a single re with named groups? +# + add Yara support, include sample rules? plugins like balbuzard? +# + add balbuzard support +# + output to file (replace print by file.write, sys.stdout by default) +# + look for VBA in embedded documents (e.g. 
Excel in Word) +# + support SRP streams (see Lenny's article + links and sample) +# - python 3.x support +# - check VBA macros in Visio, Access, Project, etc +# - extract_macros: convert to a class, split long function into smaller methods +# - extract_macros: read bytes from stream file objects instead of strings +# - extract_macros: use combined struct.unpack instead of many calls +# - all except clauses should target specific exceptions + +#------------------------------------------------------------------------------ +# REFERENCES: +# - [MS-OVBA]: Microsoft Office VBA File Format Structure +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx +# - officeparser: https://github.com/unixfreak0037/officeparser + + +#--- IMPORTS ------------------------------------------------------------------ + +import sys, logging +import struct +from _io import StringIO,BytesIO +import math +import zipfile +import re +import optparse +import binascii +import base64 +import zlib +import email # for MHTML parsing +import string # for printable +import json # for json output mode (argument --json) + +# import lxml or ElementTree for XML parsing: +try: + # lxml: best performance for XML processing + import lxml.etree as ET +except ImportError: + try: + # Python 2.5+: batteries included + import xml.etree.cElementTree as ET + except ImportError: + try: + # Python <2.5: standalone ElementTree install + import elementtree.cElementTree as ET + except ImportError: + raise(ImportError, "lxml or ElementTree are not installed, " \ + + "see http://codespeak.net/lxml " \ + + "or http://effbot.org/zone/element-index.htm") + +import oletools.thirdparty.olefile as olefile +from oletools.thirdparty.prettytable import prettytable +from oletools.thirdparty.xglob import xglob, PathNotFoundException +from oletools.thirdparty.pyparsing.pyparsing import \ + CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \ + Optional, QuotedString,Regex, Suppress, Word, 
WordStart, \ + alphanums, alphas, hexnums,nums, opAssoc, srange, \ + infixNotation +import oletools.ppt_parser as ppt_parser + +# monkeypatch email to fix issue #32: +# allow header lines without ":" +import email.feedparser +email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') + + +# === LOGGING ================================================================= + +class NullHandler(logging.Handler): + """ + Log Handler without output, to avoid printing messages if logging is not + configured by the main application. + Python 2.7 has logging.NullHandler, but this is necessary for 2.6: + see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library + """ + def emit(self, record): + pass + +def get_logger(name, level=logging.CRITICAL+1): + """ + Create a suitable logger object for this module. + The goal is not to change settings of the root logger, to avoid getting + other modules' logs on the screen. + If a logger exists with same name, reuse it. (Else it would have duplicate + handlers and messages would be doubled.) + The level is set to CRITICAL+1 by default, to avoid any logging. + """ + # First, test if there is already a logger with the same name, else it + # will generate duplicate messages (due to duplicate handlers): + if name in logging.Logger.manager.loggerDict: + #NOTE: another less intrusive but more "hackish" solution would be to + # use getLogger then test if its effective level is not default. 
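The NullHandler pattern above keeps the library silent unless the host application opts in. A minimal sketch of both sides of that contract (the logger name `olevba_demo` is made up for the demo; olevba itself uses `'olevba'`):

```python
import io
import logging

# Library side: attach only a NullHandler and set the level above CRITICAL,
# so importing the module never prints anything on its own.
lib_log = logging.getLogger('olevba_demo')   # hypothetical logger name
lib_log.addHandler(logging.NullHandler())
lib_log.setLevel(logging.CRITICAL + 1)
lib_log.propagate = False                    # keep the demo self-contained

buf = io.StringIO()
lib_log.addHandler(logging.StreamHandler(buf))
lib_log.warning('suppressed')                # below the effective level: dropped

# Application side: opt in by lowering the level (what olevba's -l option does)
lib_log.setLevel(logging.DEBUG)
lib_log.warning('now visible')               # now reaches the handlers
```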
+ logger = logging.getLogger(name) + # make sure level is OK: + logger.setLevel(level) + return logger + # get a new logger: + logger = logging.getLogger(name) + # only add a NullHandler for this logger, it is up to the application + # to configure its own logging: + logger.addHandler(NullHandler()) + logger.setLevel(level) + return logger + +# a global logger object used for debugging: +log = get_logger('olevba') + + +#=== EXCEPTIONS ============================================================== + +class OlevbaBaseException(Exception): + """ Base class for exceptions produced here for simpler except clauses """ + def __init__(self, msg, filename=None, orig_exc=None, **kwargs): + if orig_exc: + super(OlevbaBaseException, self).__init__(msg + + ' ({0})'.format(orig_exc), + **kwargs) + else: + super(OlevbaBaseException, self).__init__(msg, **kwargs) + self.msg = msg + self.filename = filename + self.orig_exc = orig_exc + + +class FileOpenError(OlevbaBaseException): + """ raised by VBA_Parser constructor if all open_... 
attempts failed + + probably means the file type is not supported + """ + + def __init__(self, filename, orig_exc=None): + super(FileOpenError, self).__init__( + 'Failed to open file %s' % filename, filename, orig_exc) + + + class ProcessingError(OlevbaBaseException): + """ raised by VBA_Parser.process_file* functions """ + + def __init__(self, filename, orig_exc): + super(ProcessingError, self).__init__( + 'Error processing file %s' % filename, filename, orig_exc) + + + class MsoExtractionError(RuntimeError, OlevbaBaseException): + """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """ + + def __init__(self, msg): + RuntimeError.__init__(self, msg) + OlevbaBaseException.__init__(self, msg) + + + class SubstreamOpenError(FileOpenError): + """ special kind of FileOpenError: file is a substream of original file """ + + def __init__(self, filename, subfilename, orig_exc=None): + super(SubstreamOpenError, self).__init__( + str(filename) + '/' + str(subfilename), orig_exc) + self.filename = filename # overwrite setting in OlevbaBaseException + self.subfilename = subfilename + + + class UnexpectedDataError(OlevbaBaseException): + """ raised when parsing is strict (=not relaxed) and data is unexpected """ + + def __init__(self, stream_path, variable, expected, value): + super(UnexpectedDataError, self).__init__( + 'Unexpected value in {0} for variable {1}: ' + 'expected {2:04X} but found {3:04X}!' 
+ .format(stream_path, variable, expected, value)) + self.stream_path = stream_path + self.variable = variable + self.expected = expected + self.value = value + +#--- CONSTANTS ---------------------------------------------------------------- + +# return codes +RETURN_OK = 0 +RETURN_WARNINGS = 1 # (reserved, not used yet) +RETURN_WRONG_ARGS = 2 # (fixed, built into optparse) +RETURN_FILE_NOT_FOUND = 3 +RETURN_XGLOB_ERR = 4 +RETURN_OPEN_ERROR = 5 +RETURN_PARSE_ERROR = 6 +RETURN_SEVERAL_ERRS = 7 +RETURN_UNEXPECTED = 8 + +# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python) +MAC_CODEPAGES = { + 10000: 'mac-roman', + 10001: 'shiftjis', # not found: 'mac-shift-jis', + 10003: 'ascii', # nothing appropriate found: 'mac-hangul', + 10008: 'gb2312', # not found: 'mac-gb2312', + 10002: 'big5', # not found: 'mac-big5', + 10005: 'hebrew', # not found: 'mac-hebrew', + 10004: 'mac-arabic', + 10006: 'mac-greek', + 10081: 'mac-turkish', + 10021: 'thai', # not found: 'mac-thai', + 10029: 'maccentraleurope', # not found: 'mac-east europe', + 10007: 'ascii', # nothing appropriate found: 'mac-russian', +} + +# URL and message to report issues: +URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues' +MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES + +# Container types: +TYPE_OLE = 'OLE' +TYPE_OpenXML = 'OpenXML' +TYPE_Word2003_XML = 'Word2003_XML' +TYPE_MHTML = 'MHTML' +TYPE_TEXT = 'Text' +TYPE_PPT = 'PPT' + +# short tag to display file types in triage mode: +TYPE2TAG = { + TYPE_OLE: 'OLE:', + TYPE_OpenXML: 'OpX:', + TYPE_Word2003_XML: 'XML:', + TYPE_MHTML: 'MHT:', + TYPE_TEXT: 'TXT:', + TYPE_PPT: 'PPT:', +} + + +# MSO files ActiveMime header magic +MSO_ACTIVEMIME_HEADER = b'ActiveMime' + +MODULE_EXTENSION = "bas" +CLASS_EXTENSION = "cls" +FORM_EXTENSION = "frm" + +# Namespaces and tags for Word2003 XML parsing: +NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' +# the <w:binData> tag contains the VBA macro 
code: +TAG_BINDATA = NS_W + 'binData' +ATTR_NAME = NS_W + 'name' + +# Keywords to detect auto-executable macros +AUTOEXEC_KEYWORDS = { + # MS Word: + 'Runs when the Word document is opened': + ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'), + 'Runs when the Word document is closed': + ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'), + 'Runs when the Word document is modified': + ('DocumentChange',), + 'Runs when a new Word document is created': + ('AutoNew', 'Document_New', 'NewDocument'), + + # MS Excel: + 'Runs when the Excel Workbook is opened': + ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'), + 'Runs when the Excel Workbook is closed': + ('Auto_Close', 'Workbook_Close'), + + #TODO: full list in MS specs?? +} + +# Suspicious Keywords that may be used by malware +# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx +SUSPICIOUS_KEYWORDS = { + #TODO: use regex to support variable whitespaces + 'May read system environment variables': + ('Environ',), + 'May open a file': + ('Open',), + 'May write to a file (if combined with Open)': + #TODO: regex to find Open+Write on same line + ('Write', 'Put', 'Output', 'Print #'), + 'May read or write a binary file (if combined with Open)': + #TODO: regex to find Open+Binary on same line + ('Binary',), + 'May copy a file': + ('FileCopy', 'CopyFile'), + #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx + #CopyFile: http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx + 'May delete a file': + ('Kill',), + 'May create a text file': + ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'), + #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx + #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6 + 'May run an executable file or a system command': + ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 
'vbMaximizedFocus', 'vbNormalNoFocus', + 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'), + #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx + #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6 + 'May run PowerShell commands': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc + # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/ + # TODO: add support for keywords starting with a non-alpha character, such as "-noexit" + # TODO: '-command', '-EncodedCommand', '-scriptblock' + ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand', + 'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'), + 'May run an executable file or a system command using PowerShell': + ('Start-Process',), + 'May hide the application': + ('Application.Visible', 'ShowWindow', 'SW_HIDE'), + 'May create a directory': + ('MkDir',), + 'May save the current workbook': + ('ActiveWorkbook.SaveAs',), + 'May change which directory contains files to open at startup': + #TODO: confirm the actual effect + ('Application.AltStartupPath',), + 'May create an OLE object': + ('CreateObject',), + 'May create an OLE object using PowerShell': + ('New-Object',), + 'May run an application (if combined with CreateObject)': + ('Shell.Application',), + 'May enumerate application windows (if combined with Shell.Application object)': + ('Windows', 'FindWindow'), + 'May run code from a DLL': + #TODO: regex to find declare+lib on same line + ('Lib',), + 'May inject code into another process': + ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload + ), + 'May download files from the Internet': + #TODO: regex to find urlmon+URLDownloadToFileA on same line + ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP', + 
'MSXML2.ServerXMLHTTP', # suggested in issue #13 + 'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z + ), + 'May download files from the Internet using PowerShell': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + ('Net.WebClient', 'DownloadFile', 'DownloadString'), + 'May control another application by simulating user keystrokes': + ('SendKeys', 'AppActivate'), + #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx + 'May attempt to obfuscate malicious function calls': + ('CallByName',), + #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx + 'May attempt to obfuscate specific strings': + #TODO: regex to find several Chr*, not just one + ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'), + #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx + 'May read or write registry keys': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'), + 'May read registry keys': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + ('RegQueryValueExA', 'RegQueryValueEx', + 'RegRead', #with Wscript.Shell + ), + 'May detect virtualization': + # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'), + 'May detect Anubis Sandbox': + # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA + # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf + ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll + '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId', + '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 
'popupkiller' + ), + 'May detect Sandboxie': + # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('SbieDll.dll', 'SandboxieControlWndClass'), + 'May detect Sunbelt Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + (r'C:\file.exe',), + 'May detect Norman Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('currentuser',), + 'May detect CW Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('Schmidti',), + 'May detect WinJail Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('Afx:400000:0',), + 'Memory manipulation': + ('VirtualAllocEx', 'RtlMoveMemory'), +} + +# Regular Expression for a URL: +# http://en.wikipedia.org/wiki/Uniform_resource_locator +# http://www.w3.org/Addressing/URL/uri-spec.html +#TODO: also support username:password@server +#TODO: other protocols (file, gopher, wais, ...?) +SCHEME = r'\b(?:http|ftp)s?' +# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains +TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})' +DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')' +#TODO: IPv6 - see https://www.debuggex.com/ +# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. [db8:0cec::99:123a] +NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])' +IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255 +# IPv4 must come before the DNS name because it is more specific +SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')' +PORT = r'(?:\:[0-9]{1,5})?' +SERVER_PORT = SERVER + PORT +URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' 
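The URL regex is assembled from the named pieces above; re-stating them in a short standalone sketch (simplified, for illustration only) shows why the IPv4 alternative is placed before the DNS name, the more specific pattern must be tried first:

```python
import re

# Re-stated from the constants above (illustration only):
SCHEME = r'\b(?:http|ftp)s?'
TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})'
DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + r')'
NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'
IPV4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255
# IPv4 must come before the DNS name because it is more specific:
SERVER = r'(?:' + IPV4 + '|' + DNS_NAME + ')'
PORT = r'(?:\:[0-9]{1,5})?'
URL_PATH = r"(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?"
URL_RE = SCHEME + r'\://' + SERVER + PORT + URL_PATH

# All groups are non-capturing, so findall() returns whole matches:
urls = re.findall(URL_RE,
                  'GET http://10.0.0.1:8080/a.exe then https://evil.example.com/x')
```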
# [^\.\,\)\(\s"] +URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH +re_url = re.compile(URL_RE) + + +# Patterns to be extracted (IP addresses, URLs, etc) +# From patterns.py in balbuzard +RE_PATTERNS = ( + ('URL', re.compile(URL_RE)), + ('IPv4 address', re.compile(IPv4)), + # TODO: add IPv6 + ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + '\b')), + # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(? char +vba_chr = Suppress( + Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr') + + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$')) + + '(') + vba_expr_int + Suppress(')') + +def vba_chr_tostr(t): + try: + i = t[0] + # normal, non-unicode character: + if i>=0 and i<=255: + return VbaExpressionString(chr(i)) + else: + return VbaExpressionString(unichr(i).encode('utf-8', 'backslashreplace')) + except ValueError: + log.exception('ERROR: incorrect parameter value for chr(): %r' % i) + return VbaExpressionString('Chr(%r)' % i) + +vba_chr.setParseAction(vba_chr_tostr) + + +# --- ASC -------------------------------------------------------------------- + +# Asc(char) => int +#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW +vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')') +vba_asc.setParseAction(lambda t: ord(t[0])) + + +# --- VAL -------------------------------------------------------------------- + +# Val(string) => int +# TODO: make sure the behavior of VBA's val is fully covered +vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')') +vba_val.setParseAction(lambda t: int(t[0].strip())) + + +# --- StrReverse() -------------------------------------------------------------------- + +# StrReverse(string) => string +strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')') +strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1])) + + +# --- ENVIRON() 
-------------------------------------------------------------------- + +# Environ("name") => just translated to "%name%", that is enough for malware analysis +environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')') +environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0])) + + +# --- IDENTIFIER ------------------------------------------------------------- + +#TODO: see MS-VBAL 3.3.5 page 33 +# 3.3.5 Identifier Tokens +# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character +# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z +# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore +latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_') + +# --- HEX FUNCTION ----------------------------------------------------------- + +# match any custom function name with a hex string as argument: +# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime + +# quoted string of at least two hexadecimal numbers of two digits: +quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"') +quoted_hex_string.setParseAction(lambda t: str(t[0])) + +hex_function_call = Suppress(latin_identifier) + Suppress('(') + \ + quoted_hex_string('hex_string') + Suppress(')') +hex_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string))) + + +# --- BASE64 FUNCTION ----------------------------------------------------------- + +# match any custom function name with a Base64 string as argument: +# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime + +# quoted string matching the Base64 pattern: +quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"') +quoted_base64_string.setParseAction(lambda t: str(t[0])) + +base64_function_call = Suppress(latin_identifier) 
+ Suppress('(') + \ + quoted_base64_string('base64_string') + Suppress(')') +base64_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string))) + + +# ---STRING EXPRESSION ------------------------------------------------------- + +def concat_strings_list(tokens): + """ + parse action to concatenate strings in a VBA expression with operators '+' or '&' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of strings such as [a,'&',b,'&',c,...] + strings = tokens[0][::2] + return VbaExpressionString(''.join(strings)) + + +vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call) + +vba_expr_str <<= infixNotation(vba_expr_str_item, + [ + ("+", 2, opAssoc.LEFT, concat_strings_list), + ("&", 2, opAssoc.LEFT, concat_strings_list), + ]) + + +# --- INTEGER EXPRESSION ------------------------------------------------------- + +def sum_ints_list(tokens): + """ + parse action to sum integers in a VBA expression with operator '+' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'+',b,'+',c,...] + integers = tokens[0][::2] + return sum(integers) + + +def subtract_ints_list(tokens): + """ + parse action to subtract integers in a VBA expression with operator '-' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'-',b,'-',c,...] + integers = tokens[0][::2] + return reduce(lambda x,y:x-y, integers) + + +def multiply_ints_list(tokens): + """ + parse action to multiply integers in a VBA expression with operator '*' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'*',b,'*',c,...] 
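The pyparsing grammar above evaluates `Chr()`, `StrReverse()` and `'&'`/`'+'` concatenation to recover obfuscated strings. The same idea on a toy scale, using regexes instead of a grammar (this `deobfuscate` helper is an illustration, not the module's actual deobfuscator):

```python
import re

def deobfuscate(expr):
    """Toy sketch of the grammar above: evaluate Chr(n) and
    StrReverse("s"), then join the '&'/'+'-concatenated literals."""
    # Chr(104) -> "h"
    expr = re.sub(r'Chr\((\d+)\)', lambda m: '"%s"' % chr(int(m.group(1))), expr)
    # StrReverse("//:s") -> "s://"
    expr = re.sub(r'StrReverse\("([^"]*)"\)',
                  lambda m: '"%s"' % m.group(1)[::-1], expr)
    # concatenate all remaining quoted literals:
    return ''.join(re.findall(r'"([^"]*)"', expr))

sample = 'Chr(104) & "tt" + Chr(112) & StrReverse("//:s")'
```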
+ integers = tokens[0][::2] + return reduce(lambda x,y:x*y, integers) + + +def divide_ints_list(tokens): + """ + parse action to divide integers in a VBA expression with operator '/' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] + integers = tokens[0][::2] + return reduce(lambda x,y:x/y, integers) + + +vba_expr_int_item = (vba_asc | vba_val | integer) + +# operators associativity: +# https://en.wikipedia.org/wiki/Operator_associativity + +vba_expr_int <<= infixNotation(vba_expr_int_item, + [ + ("*", 2, opAssoc.LEFT, multiply_ints_list), + ("/", 2, opAssoc.LEFT, divide_ints_list), + ("-", 2, opAssoc.LEFT, subtract_ints_list), + ("+", 2, opAssoc.LEFT, sum_ints_list), + ]) + + +# see detect_vba_strings for the deobfuscation code using this grammar + +# === MSO/ActiveMime files parsing =========================================== + +def is_mso_file(data): + """ + Check if the provided data is the content of a MSO/ActiveMime file, such as + the ones created by Outlook in some cases, or Word/Excel when saving a + file with the MHTML format or the Word 2003 XML format. + This function only checks the ActiveMime magic at the beginning of data. + :param data: bytes string, MSO/ActiveMime file content + :return: bool, True if the file is MSO, False otherwise + """ + return data.startswith(MSO_ACTIVEMIME_HEADER) + + +# regex to find zlib block headers, starting with byte 0x78 = 'x' +re_zlib_header = re.compile(r'x') + + +def mso_file_extract(data): + """ + Extract the data stored into a MSO/ActiveMime file, such as + the ones created by Outlook in some cases, or Word/Excel when saving a + file with the MHTML format or the Word 2003 XML format. 
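When the header offset is not reliable, `mso_file_extract` falls back to scanning for candidate zlib blocks, which is what `re_zlib_header` above supports. A self-contained sketch of that brute-force strategy (`find_zlib_block` and the sample blob are made up for the demo; they stand in for the offset-guessing logic, not replace it):

```python
import re
import zlib

def find_zlib_block(data):
    """Brute-force sketch: try zlib.decompress() at every 0x78 ('x') byte,
    the usual first byte of a zlib stream header, and return the first
    candidate that inflates cleanly."""
    for match in re.finditer(b'x', data):
        try:
            return zlib.decompress(data[match.start():])
        except zlib.error:
            continue
    raise ValueError('no valid zlib block found')

# Fake ActiveMime container: magic + padding + a compressed payload
payload = b'Attribute VB_Name = "ThisDocument"'
blob = b'ActiveMime' + b'\x00' * 38 + zlib.compress(payload)
```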
+ + :param data: bytes string, MSO/ActiveMime file content + :return: bytes string, extracted data (uncompressed) + + raise a MsoExtractionError if the data cannot be extracted + """ + # check the magic: + assert is_mso_file(data) + + # In all the samples seen so far, Word always uses an offset of 0x32, + # and Excel 0x22A. But we read the offset from the header to be more + # generic. + offsets = [0x32, 0x22A] + + # First, attempt to get the compressed data offset from the header + # According to my tests, it should be an unsigned 16 bits integer, + # at offset 0x1E (little endian) + add 46: + try: + offset = struct.unpack_from('> bit_count + offset_mask = ~length_mask + maximum_length = (0xFFFF >> bit_count) + 3 + return length_mask, offset_mask, bit_count, maximum_length + + +def decompress_stream(compressed_container): + """ + Decompress a stream according to MS-OVBA section 2.4.1 + + compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm + return the decompressed container as a string (bytes) + """ + # 2.4.1.2 State Variables + + # The following state is maintained for the CompressedContainer (section 2.4.1.1.1): + # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1). + # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by + # decompression or to be written by compression. + + # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4): + # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the + # CompressedContainer (section 2.4.1.1.1). + + # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2): + # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by + # decompression or to be read by compression. 
+ # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2). + + # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3): + # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the + # DecompressedBuffer (section 2.4.1.1.2). + + decompressed_container = b'' # result + compressed_current = 0 + + sig_byte = compressed_container[compressed_current] + if sig_byte != 0x01: + raise ValueError('invalid signature byte {0:02X}'.format(sig_byte)) + + compressed_current += 1 + + #NOTE: the definition of CompressedRecordEnd is ambiguous. Here we assume that + # CompressedRecordEnd = len(compressed_container) + while compressed_current < len(compressed_container): + # 2.4.1.1.5 + compressed_chunk_start = compressed_current + # chunk header = first 16 bits + compressed_chunk_header = \ + struct.unpack("> 12) & 0x07 + if chunk_signature != 0b011: + raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream') + # chunk flag = next bit - 1 == compressed, 0 == uncompressed + chunk_flag = (compressed_chunk_header >> 15) & 0x01 + log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag)) + + #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096) + # The minimum size is 3 bytes + # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value + # in chunk header before adding 3. + # Also the first test is not useful since a 12 bits value cannot be larger than 4095. 
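The chunk checks above rely on the 16-bit CompressedChunkHeader layout from MS-OVBA 2.4.1.1.5: bits 0 to 11 hold the chunk size minus 3, bits 12 to 14 the signature 0b011, and bit 15 the compressed flag. A standalone sketch of unpacking it (`parse_chunk_header` is a made-up helper name):

```python
import struct

def parse_chunk_header(raw):
    """Split a CompressedChunkHeader (little-endian uint16) into its fields:
    bits 0-11 = CompressedChunkSize - 3, bits 12-14 = signature (0b011),
    bit 15 = CompressedChunkFlag (1 = compressed)."""
    header = struct.unpack('<H', raw)[0]
    size = (header & 0x0FFF) + 3
    signature = (header >> 12) & 0x07
    flag = (header >> 15) & 0x01
    return size, signature, flag

# 0xB000: flag=1, signature=0b011, size field 0 -> minimum chunk size of 3
```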
+ if chunk_flag == 1 and chunk_size > 4098: + raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1') + if chunk_flag == 0 and chunk_size != 4098: + raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0') + + # check if chunk_size goes beyond the compressed data, instead of silently cutting it: + #TODO: raise an exception? + if compressed_chunk_start + chunk_size > len(compressed_container): + log.warning('Chunk size is larger than remaining compressed data') + compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size]) + # read after chunk header: + compressed_current = compressed_chunk_start + 2 + + if chunk_flag == 0: + # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk + # uncompressed chunk: read the next 4096 bytes as-is + #TODO: check if there are at least 4096 bytes left + decompressed_container += bytes([compressed_container[compressed_current:compressed_current + 4096]]) + compressed_current += 4096 + else: + # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk + # compressed chunk + decompressed_chunk_start = len(decompressed_container) + while compressed_current < compressed_end: + # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence + # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end)) + # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or + # copy tokens (reference to a previous literal token) + flag_byte = compressed_container[compressed_current] + compressed_current += 1 + for bit_index in range(0, 8): + # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end)) + if compressed_current >= compressed_end: + break + # MS-OVBA 2.4.1.3.5 Decompressing a Token + # MS-OVBA 2.4.1.3.17 Extract FlagBit + flag_bit = (flag_byte >> bit_index) & 1 + #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit)) + if flag_bit == 0: # LiteralToken + 
# copy one byte directly to output + decompressed_container += bytes([compressed_container[compressed_current]]) + compressed_current += 1 + else: # CopyToken + # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken + copy_token = \ + struct.unpack("> temp2) + 1 + #log.debug('offset=%d length=%d' % (offset, length)) + copy_source = len(decompressed_container) - offset + for index in range(copy_source, copy_source + length): + decompressed_container += bytes([decompressed_container[index]]) + compressed_current += 2 + return decompressed_container + + +def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): + """ + Extract VBA macros from an OleFileIO object. + Internal function, do not call directly. + + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream + vba_project: path to the PROJECT stream + :param relaxed: If True, only create info/debug log entry if data is not as expected + (e.g. opening substream fails); if False, raise an error in this case + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream + """ + # Open the PROJECT stream: + project = ole.openstream(project_path) + log.debug('relaxed is %s' % relaxed) + + # sample content of the PROJECT stream: + + ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" + ## Document=ThisDocument/&H00000000 + ## Module=NewMacros + ## Name="Project" + ## HelpContextID="0" + ## VersionCompatible32="393222000" + ## CMG="F1F301E705E705E705E705" + ## DPB="8F8D7FE3831F2020202020" + ## GC="2D2FDD81E51EE61EE6E1" + ## + ## [Host Extender Info] + ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 + ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 + ## + ## [Workspace] + ## ThisDocument=22, 29, 339, 477, Z + ## NewMacros=-4, 42, 832, 510, C + + code_modules = {} + + for line in project: + line = line.strip().decode('utf-8','ignore') + if '=' in line: + # split line at the 1st equal sign: + name, value = 
line.split('=', 1) + # looking for code modules + # add the code module as a key in the dictionary + # the value will be the extension needed later + # The value is converted to lowercase, to allow case-insensitive matching (issue #3) + value = value.lower() + if name == 'Document': + # split value at the 1st slash, keep 1st part: + value = value.split('/', 1)[0] + code_modules[value] = CLASS_EXTENSION + elif name == 'Module': + code_modules[value] = MODULE_EXTENSION + elif name == 'Class': + code_modules[value] = CLASS_EXTENSION + elif name == 'BaseClass': + code_modules[value] = FORM_EXTENSION + + # read data from dir stream (compressed) + dir_compressed = ole.openstream(dir_path).read() + + def check_value(name, expected, value): + if expected != value: + if relaxed: + log.error("invalid value for {0} expected {1:04X} got {2:04X}" + .format(name, expected, value)) + else: + raise UnexpectedDataError(dir_path, name, expected, value) + + dir_stream = BytesIO(decompress_stream(dir_compressed)) + + # PROJECTSYSKIND Record + projectsyskind_id = struct.unpack(" 128: + log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) + projectname_projectname = dir_stream.read(projectname_sizeof_projectname) + unused = projectname_projectname + + # PROJECTDOCSTRING Record + projectdocstring_id = struct.unpack(" 2000: + log.error( + "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) + projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) + projectdocstring_reserved = struct.unpack(" 260: + log.error( + "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) + projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) + projecthelpfilepath_reserved = struct.unpack(" 1015: + log.error( + "PROJECTCONSTANTS_SizeOfConstants value not in range: 
{0}".format(projectconstants_sizeof_constants)) + projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) + projectconstants_reserved = struct.unpack(" 0: + code_data = decompress_stream(code_data) + # case-insensitive search in the code_modules dict to find the file extension: + filext = code_modules.get(modulename_modulename.lower(), 'bin') + filename = '{0}.{1}'.format(modulename_modulename, filext) + #TODO: also yield the codepage so that callers can decode it properly + yield (code_path, filename, code_data) + # print '-'*79 + # print filename + # print '' + # print code_data + # print '' + log.debug('extracted file {0}'.format(filename)) + else: + log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname)) + except (UnexpectedDataError, SubstreamOpenError): + raise + except Exception as exc: + log.info('Error parsing module {0} of {1} in _extract_vba:' + .format(projectmodule_index, projectmodules_count), + exc_info=True) + if not relaxed: + raise + _ = unused # make pylint happy: now variable "unused" is being used ;-) + return + + +def vba_collapse_long_lines(vba_code): + """ + Parse a VBA module code to detect continuation line characters (underscore) and + collapse split lines. Continuation line characters are replaced by spaces. + + :param vba_code: str, VBA module code + :return: str, VBA module code with long lines collapsed + """ + # TODO: use a regex instead, to allow whitespaces after the underscore? + vba_code = vba_code.replace(' _\r\n', ' ') + vba_code = vba_code.replace(' _\r', ' ') + vba_code = vba_code.replace(' _\n', ' ') + return vba_code + + +def filter_vba(vba_code): + """ + Filter VBA source code to remove the first lines starting with "Attribute VB_", + which are automatically added by MS Office and not displayed in the VBA Editor. + This should only be used when displaying source code for human analysis. 
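The continuation-character handling in vba_collapse_long_lines above is a plain sequence of string replacements; a minimal self-contained sketch of the same approach (helper name is illustrative):

```python
def collapse_long_lines(vba_code):
    # VBA continues a statement on the next line with " _" before the line
    # break; replace each continuation sequence with a single space, covering
    # CRLF, CR-only and LF-only line endings.
    for seq in (' _\r\n', ' _\r', ' _\n'):
        vba_code = vba_code.replace(seq, ' ')
    return vba_code
```

Collapsing first matters because the keyword and IOC regexes later scan line-oriented patterns that a split statement would otherwise hide.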
+
+    Note: lines are not filtered if they contain a colon, because it could be
+    used to hide malicious instructions.
+
+    :param vba_code: str, VBA source code
+    :return: str, filtered VBA source code
+    """
+    vba_lines = vba_code.splitlines()
+    start = 0
+    for line in vba_lines:
+        if line.startswith("Attribute VB_") and ':' not in line:
+            start += 1
+        else:
+            break
+    #TODO: also remove empty lines?
+    vba = '\n'.join(vba_lines[start:])
+    return vba
+
+
+def detect_autoexec(vba_code, obfuscation=None):
+    """
+    Detect if the VBA code contains keywords corresponding to macros running
+    automatically when triggered by specific actions (e.g. when a document is
+    opened or closed).
+
+    :param vba_code: str, VBA source code
+    :param obfuscation: None or str, name of obfuscation to be added to description
+    :return: list of str tuples (keyword, description)
+    """
+    #TODO: merge code with detect_suspicious
+    # case-insensitive search
+    #vba_code = vba_code.lower()
+    results = []
+    obf_text = ''
+    if obfuscation:
+        obf_text = ' (obfuscation: %s)' % obfuscation
+    for description, keywords in AUTOEXEC_KEYWORDS.items():
+        for keyword in keywords:
+            #TODO: if keyword is already a compiled regex, use it as-is
+            # search using regex to detect word boundaries:
+            if re.search(r'(?i)\b' + keyword + r'\b', vba_code):
+                #if keyword.lower() in vba_code:
+                results.append((keyword, description + obf_text))
+    return results
+
+
+def detect_suspicious(vba_code, obfuscation=None):
+    """
+    Detect if the VBA code contains suspicious keywords corresponding to
+    potential malware behaviour.
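The autoexec detection relies on regex word boundaries rather than plain substring search, so a keyword embedded inside a longer identifier is not flagged. A sketch of that technique with a hypothetical one-entry keyword table (AUTOEXEC_KEYWORDS in olevba is much larger):

```python
import re

# Hypothetical stand-in for AUTOEXEC_KEYWORDS: description -> list of keywords
KEYWORDS = {'Runs when the Word document is opened': ['AutoOpen', 'Document_Open']}

def find_keywords(vba_code, keywords=KEYWORDS):
    results = []
    for description, words in keywords.items():
        for keyword in words:
            # (?i) makes the search case-insensitive; \b prevents matching a
            # keyword buried inside a longer identifier (unlike olevba, the
            # keyword is escaped here, so regex metacharacters are literal)
            if re.search(r'(?i)\b' + re.escape(keyword) + r'\b', vba_code):
                results.append((keyword, description))
    return results
```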
+ + :param vba_code: str, VBA source code + :param obfuscation: None or str, name of obfuscation to be added to description + :return: list of str tuples (keyword, description) + """ + # case-insensitive search + #vba_code = vba_code.lower() + results = [] + obf_text = '' + if obfuscation: + obf_text = ' (obfuscation: %s)' % obfuscation + for description, keywords in SUSPICIOUS_KEYWORDS.items(): + for keyword in keywords: + # search using regex to detect word boundaries: + if re.search(r'(?i)\b' + keyword + r'\b', vba_code): + #if keyword.lower() in vba_code: + results.append((keyword, description + obf_text)) + return results + + +def detect_patterns(vba_code, obfuscation=None): + """ + Detect if the VBA code contains specific patterns such as IP addresses, + URLs, e-mail addresses, executable file names, etc. + + :param vba_code: str, VBA source code + :return: list of str tuples (pattern type, value) + """ + results = [] + found = set() + obf_text = '' + if obfuscation: + obf_text = ' (obfuscation: %s)' % obfuscation + for pattern_type, pattern_re in RE_PATTERNS: + for match in pattern_re.finditer(vba_code): + value = match.group() + if value not in found: + results.append((pattern_type + obf_text, value)) + found.add(value) + return results + + +def detect_hex_strings(vba_code): + """ + Detect if the VBA code contains strings encoded in hexadecimal. + + :param vba_code: str, VBA source code + :return: list of str tuples (encoded string, decoded string) + """ + results = [] + found = set() + for match in re_hex_string.finditer(vba_code): + value = match.group() + if value not in found: + decoded = binascii.unhexlify(value) + results.append((value, decoded.decode('utf-8','replace'))) + found.add(value) + return results + + +def detect_base64_strings(vba_code): + """ + Detect if the VBA code contains strings encoded in base64. 
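The hex-string detection above pairs a regex with binascii.unhexlify and de-duplicates via a set. A self-contained sketch of the same idea (the pattern below is an illustrative stand-in for olevba's re_hex_string, not its exact definition):

```python
import binascii
import re

# Illustrative pattern: a double-quoted run of 8 or more hex digits
re_hex = re.compile(r'"([0-9A-Fa-f]{8,})"')

def find_hex_strings(vba_code):
    results = []
    found = set()
    for match in re_hex.finditer(vba_code):
        value = match.group(1)
        # unhexlify needs an even number of digits; skip duplicates
        if len(value) % 2 == 0 and value not in found:
            decoded = binascii.unhexlify(value)
            results.append((value, decoded.decode('utf-8', 'replace')))
            found.add(value)
    return results
```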
+
+    :param vba_code: str, VBA source code
+    :return: list of str tuples (encoded string, decoded string)
+    """
+    #TODO: avoid matching simple hex strings as base64?
+    results = []
+    found = set()
+    for match in re_base64_string.finditer(vba_code):
+        # extract the base64 string without quotes:
+        value = match.group().strip('"')
+        # check it is not just a hex string:
+        if not re_nothex_check.search(value):
+            continue
+        # only keep new values and not in the whitelist:
+        if value not in found and value.lower() not in BASE64_WHITELIST:
+            try:
+                decoded = base64.b64decode(value)
+                results.append((value, decoded.decode('utf-8', 'replace')))
+                found.add(value)
+            except (TypeError, ValueError) as exc:
+                log.debug('Failed to base64-decode (%s)' % exc)
+                # if an exception occurs, it is likely not a base64-encoded string
+    return results
+
+
+def detect_dridex_strings(vba_code):
+    """
+    Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples.
+
+    :param vba_code: str, VBA source code
+    :return: list of str tuples (encoded string, decoded string)
+    """
+    from oletools.thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode
+
+    results = []
+    found = set()
+    for match in re_dridex_string.finditer(vba_code):
+        value = match.group()[1:-1]
+        # check it is not just a hex string:
+        if not re_nothex_check.search(value):
+            continue
+        if value not in found:
+            try:
+                decoded = DridexUrlDecode(value)
+                results.append((value, decoded))
+                found.add(value)
+            except Exception as exc:
+                log.debug('Failed to Dridex-decode (%s)' % exc)
+                # if an exception occurs, it is likely not a dridex-encoded string
+    return results
+
+
+def detect_vba_strings(vba_code):
+    """
+    Detect if the VBA code contains strings obfuscated with VBA expressions
+    using keywords such as Chr, Asc, Val, StrReverse, etc.
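detect_vba_strings uses a full pyparsing grammar (vba_expr_str) to evaluate expressions such as Chr(104) & Chr(105). A much smaller regex-based sketch of the core idea, limited to concatenations of plain Chr() calls (real VBA expressions also mix in Asc, Val, StrReverse, etc.):

```python
import re

def decode_chr_expression(expr):
    # Collect every Chr(<int>) argument in order and map it through chr();
    # this is only a toy evaluator, not the pyparsing grammar olevba uses.
    codes = re.findall(r'(?i)\bChr\(\s*(\d+)\s*\)', expr)
    return ''.join(chr(int(c)) for c in codes)
```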
+ + :param vba_code: str, VBA source code + :return: list of str tuples (encoded string, decoded string) + """ + # TODO: handle exceptions + results = [] + found = set() + # IMPORTANT: to extract the actual VBA expressions found in the code, + # we must expand tabs to have the same string as pyparsing. + # Otherwise, start and end offsets are incorrect. + vba_code = vba_code.expandtabs() + for tokens, start, end in vba_expr_str.scanString(vba_code): + encoded = vba_code[start:end] + decoded = tokens[0] + if isinstance(decoded, VbaExpressionString): + # This is a VBA expression, not a simple string + # print 'VBA EXPRESSION: encoded=%r => decoded=%r' % (encoded, decoded) + # remove parentheses and quotes from original string: + # if encoded.startswith('(') and encoded.endswith(')'): + # encoded = encoded[1:-1] + # if encoded.startswith('"') and encoded.endswith('"'): + # encoded = encoded[1:-1] + # avoid duplicates and simple strings: + if encoded not in found and decoded != encoded: + results.append((encoded, decoded)) + found.add(encoded) + # else: + # print 'VBA STRING: encoded=%r => decoded=%r' % (encoded, decoded) + return results + + +def json2ascii(json_obj, encoding='utf8', errors='replace'): + """ ensure there is no unicode in json and all strings are safe to decode + + works recursively, decodes and re-encodes every string to/from unicode + to ensure there will be no trouble in loading the dumped json output + """ + if json_obj is None: + pass + elif isinstance(json_obj, (bool, int, float)): + pass + elif isinstance(json_obj, str): + # de-code and re-encode + dencoded = json_obj + if dencoded != json_obj: + log.debug('json2ascii: replaced: {0} (len {1})' + .format(json_obj, len(json_obj))) + log.debug('json2ascii: with: {0} (len {1})' + .format(dencoded, len(dencoded))) + return dencoded + elif isinstance(json_obj, bytes): + log.debug('json2ascii: encode unicode: {0}' + .format(json_obj.decode(encoding, errors))) + # cannot put original into logger + # 
print 'original: ' json_obj + return json_obj.decode(encoding, errors) + elif isinstance(json_obj, dict): + for key in json_obj: + json_obj[key] = json2ascii(json_obj[key]) + elif isinstance(json_obj, (list,tuple)): + for item in json_obj: + item = json2ascii(item) + else: + log.debug('unexpected type in json2ascii: {0} -- leave as is' + .format(type(json_obj))) + return json_obj + + +_have_printed_json_start = False + +def print_json(json_dict=None, _json_is_last=False, **json_parts): + """ line-wise print of json.dumps(json2ascii(..)) with options and indent+1 + + can use in two ways: + (1) print_json(some_dict) + (2) print_json(key1=value1, key2=value2, ...) + + :param bool _json_is_last: set to True only for very last entry to complete + the top-level json-list + """ + global _have_printed_json_start + + if json_dict and json_parts: + raise ValueError('Invalid json argument: want either single dict or ' + 'key=value parts but got both)') + elif (json_dict is not None) and (not isinstance(json_dict, dict)): + raise ValueError('Invalid json argument: want either single dict or ' + 'key=value parts but got {0} instead of dict)' + .format(type(json_dict))) + if json_parts: + json_dict = json_parts + + if not _have_printed_json_start: + print('[') + _have_printed_json_start = True + + lines = json.dumps(json2ascii(json_dict), check_circular=False, + indent=4, ensure_ascii=False).splitlines() + for line in lines[:-1]: + print(' {0}'.format(line)) + if _json_is_last: + print(' {0}'.format(lines[-1])) # print last line without comma + print(']') + else: + print(' {0},'.format(lines[-1])) # print last line with comma + + +class VBA_Scanner(object): + """ + Class to scan the source code of a VBA module to find obfuscated strings, + suspicious keywords, IOCs, auto-executable macros, etc. 
+ """ + + def __init__(self, vba_code): + """ + VBA_Scanner constructor + + :param vba_code: str, VBA source code to be analyzed + """ + # join long lines ending with " _": + self.code = vba_collapse_long_lines(vba_code) + self.code_hex = '' + self.code_hex_rev = '' + self.code_rev_hex = '' + self.code_base64 = '' + self.code_dridex = '' + self.code_vba = '' + self.strReverse = None + # results = None before scanning, then a list of tuples after scanning + self.results = None + self.autoexec_keywords = None + self.suspicious_keywords = None + self.iocs = None + self.hex_strings = None + self.base64_strings = None + self.dridex_strings = None + self.vba_strings = None + + + def scan(self, include_decoded_strings=False, deobfuscate=False): + """ + Analyze the provided VBA code to detect suspicious keywords, + auto-executable macros, IOC patterns, obfuscation patterns + such as hex-encoded strings. + + :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. 
+ :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + :return: list of tuples (type, keyword, description) + (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') + """ + # First, detect and extract hex-encoded strings: + self.hex_strings = detect_hex_strings(self.code) + # detect if the code contains StrReverse: + self.strReverse = False + if 'strreverse' in self.code.lower(): self.strReverse = True + # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: + for encoded, decoded in self.hex_strings: + self.code_hex += '\n' + decoded + # if the code contains "StrReverse", also append the hex strings in reverse order: + if self.strReverse: + # StrReverse after hex decoding: + self.code_hex_rev += '\n' + decoded[::-1] + # StrReverse before hex decoding: + self.code_rev_hex += '\n' + str(binascii.unhexlify(encoded[::-1])) + #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ + #TODO: also append the full code reversed if StrReverse? (risk of false positives?) 
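When StrReverse is present, the scan above builds two extra search corpora for each hex string: one decoded then reversed, and one reversed then decoded. Both transforms can be sketched standalone (function names are illustrative):

```python
import binascii

def hex_then_reverse(encoded):
    """'Hex+StrReverse' case above: hex-decode first, then reverse the result."""
    return binascii.unhexlify(encoded)[::-1]

def reverse_then_hex(encoded):
    """'StrReverse+Hex' case above: the macro reverses the literal before
    hex-decoding it, so decode the reversed string."""
    return binascii.unhexlify(encoded[::-1])
```

Scanning both variants catches macros regardless of whether StrReverse is applied before or after the hex decoding step.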
+ # Detect Base64-encoded strings + self.base64_strings = detect_base64_strings(self.code) + for encoded, decoded in self.base64_strings: + self.code_base64 += '\n' + decoded + # Detect Dridex-encoded strings + self.dridex_strings = detect_dridex_strings(self.code) + for encoded, decoded in self.dridex_strings: + self.code_dridex += '\n' + decoded + # Detect obfuscated strings in VBA expressions + if deobfuscate: + self.vba_strings = detect_vba_strings(self.code) + else: + self.vba_strings = [] + for encoded, decoded in self.vba_strings: + self.code_vba += '\n' + decoded + results = [] + self.autoexec_keywords = [] + self.suspicious_keywords = [] + self.iocs = [] + + for code, obfuscation in ( + (self.code, None), + (self.code_hex, 'Hex'), + (self.code_hex_rev, 'Hex+StrReverse'), + (self.code_rev_hex, 'StrReverse+Hex'), + (self.code_base64, 'Base64'), + (self.code_dridex, 'Dridex'), + (self.code_vba, 'VBA expression'), + ): + if isinstance(code,bytes): + code=code.decode('utf-8','replace') + self.autoexec_keywords += detect_autoexec(code, obfuscation) + self.suspicious_keywords += detect_suspicious(code, obfuscation) + self.iocs += detect_patterns(code, obfuscation) + + # If hex-encoded strings were discovered, add an item to suspicious keywords: + if self.hex_strings: + self.suspicious_keywords.append(('Hex Strings', + 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) + if self.base64_strings: + self.suspicious_keywords.append(('Base64 Strings', + 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) + if self.dridex_strings: + self.suspicious_keywords.append(('Dridex Strings', + 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) + if self.vba_strings: + self.suspicious_keywords.append(('VBA obfuscated Strings', + 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see 
all)')) + # use a set to avoid duplicate keywords + keyword_set = set() + for keyword, description in self.autoexec_keywords: + if keyword not in keyword_set: + results.append(('AutoExec', keyword, description)) + keyword_set.add(keyword) + keyword_set = set() + for keyword, description in self.suspicious_keywords: + if keyword not in keyword_set: + results.append(('Suspicious', keyword, description)) + keyword_set.add(keyword) + keyword_set = set() + for pattern_type, value in self.iocs: + if value not in keyword_set: + results.append(('IOC', value, pattern_type)) + keyword_set.add(value) + + # include decoded strings only if they are printable or if --decode option: + for encoded, decoded in self.hex_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('Hex String', decoded, encoded)) + for encoded, decoded in self.base64_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('Base64 String', decoded, encoded)) + for encoded, decoded in self.dridex_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('Dridex string', decoded, encoded)) + for encoded, decoded in self.vba_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('VBA string', decoded, encoded)) + self.results = results + return results + + def scan_summary(self): + """ + Analyze the provided VBA code to detect suspicious keywords, + auto-executable macros, IOC patterns, obfuscation patterns + such as hex-encoded strings. 
+ + :return: tuple with the number of items found for each category: + (autoexec, suspicious, IOCs, hex, base64, dridex, vba) + """ + # avoid scanning the same code twice: + if self.results is None: + self.scan() + return (len(self.autoexec_keywords), len(self.suspicious_keywords), + len(self.iocs), len(self.hex_strings), len(self.base64_strings), + len(self.dridex_strings), len(self.vba_strings)) + + +def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): + """ + Analyze the provided VBA code to detect suspicious keywords, + auto-executable macros, IOC patterns, obfuscation patterns + such as hex-encoded strings. + (shortcut for VBA_Scanner(vba_code).scan()) + + :param vba_code: str, VBA source code to be analyzed + :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + :return: list of tuples (type, keyword, description) + (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') + """ + return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) + + +#=== CLASSES ================================================================= + +class VBA_Parser(object): + """ + Class to parse MS Office files, to detect VBA macros and extract VBA source code + Supported file formats: + - Word 97-2003 (.doc, .dot) + - Word 2007+ (.docm, .dotm) + - Word 2003 XML (.xml) + - Word MHT - Single File Web Page / MHTML (.mht) + - Excel 97-2003 (.xls) + - Excel 2007+ (.xlsm, .xlsb) + - PowerPoint 97-2003 (.ppt) + - PowerPoint 2007+ (.pptm, .ppsm) + """ + + def __init__(self, filename, data=None, container=None, relaxed=False): + """ + Constructor for VBA_Parser + + :param filename: filename or path of file to parse, or file-like object + + :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). 
+ If data is provided as a bytes string, it will be parsed as the content of the file in memory, + and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb'). + + :param container: str, path and filename of container if the file is within + a zip archive, None otherwise. + + :param relaxed: if True, treat mal-formed documents and missing streams more like MS office: + do nothing; if False (default), raise errors in these cases + + raises a FileOpenError if all attemps to interpret the data header failed + """ + #TODO: filename should only be a string, data should be used for the file-like object + #TODO: filename should be mandatory, optional data is a string or file-like object + #TODO: also support olefile and zipfile as input + if data is None: + # open file from disk: + _file = filename + else: + # file already read in memory, make it a file-like object for zipfile: + _file = BytesIO(data) + #self.file = _file + self.ole_file = None + self.ole_subfiles = [] + self.filename = filename + self.container = container + self.relaxed = relaxed + self.type = None + self.vba_projects = None + self.vba_forms = None + self.contains_macros = None # will be set to True or False by detect_macros + self.vba_code_all_modules = None # to store the source code of all modules + # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code) + self.modules = None + # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner + self.analysis_results = None + # statistics for the scan summary and flags + self.nb_macros = 0 + self.nb_autoexec = 0 + self.nb_suspicious = 0 + self.nb_iocs = 0 + self.nb_hexstrings = 0 + self.nb_base64strings = 0 + self.nb_dridexstrings = 0 + self.nb_vbastrings = 0 + + # if filename is None: + # if isinstance(_file, basestring): + # if len(_file) < olefile.MINIMAL_OLEFILE_SIZE: + # self.filename = _file + # else: + # self.filename = '' + # else: + # self.filename = '' + if 
olefile.isOleFile(_file): + # This looks like an OLE file + self.open_ole(_file) + + # if this worked, try whether it is a ppt file (special ole file) + self.open_ppt() + if self.type is None and zipfile.is_zipfile(_file): + # Zip file, which may be an OpenXML document + self.open_openxml(_file) + if self.type is None: + # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML, + # or a plain text file containing VBA code + if data is None: + data = open(filename, 'rb').read() + # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace + if b'http://schemas.microsoft.com/office/word/2003/wordml' in data: + self.open_word2003xml(data) + # store a lowercase version for the next tests: + data_lowercase = data.lower() + # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): + # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line + # BUT Word accepts a blank line or other MIME headers inserted before, + # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored. + # And the line is case insensitive. + # so we'll just check the presence of mime, version and multipart anywhere: + if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \ + and b'multipart' in data_lowercase: + self.open_mht(data) + #TODO: handle exceptions + #TODO: Excel 2003 XML + # Check if this is a plain text VBA or VBScript file: + # To avoid scanning binary files, we simply check for some control chars: + if self.type is None and b'\x00' not in data: + self.open_text(data) + if self.type is None: + # At this stage, could not match a known format: + msg = '%s is not a supported file type, cannot extract VBA Macros.' 
% self.filename + log.info(msg) + raise FileOpenError(msg) + + def open_ole(self, _file): + """ + Open an OLE file + :param _file: filename or file contents in a file object + :return: nothing + """ + log.info('Opening OLE file %s' % self.filename) + try: + # Open and parse the OLE file, using unicode for path names: + self.ole_file = olefile.OleFileIO(_file, path_encoding=None) + # set type only if parsing succeeds + self.type = TYPE_OLE + except (IOError, TypeError, ValueError) as exc: + # TODO: handle OLE parsing exceptions + log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc)) + log.debug('Trace:', exc_info=True) + + + def open_openxml(self, _file): + """ + Open an OpenXML file + :param _file: filename or file contents in a file object + :return: nothing + """ + # This looks like a zip file, need to look for vbaProject.bin inside + # It can be any OLE file inside the archive + #...because vbaProject.bin can be renamed: + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18 + log.info('Opening ZIP/OpenXML file %s' % self.filename) + try: + z = zipfile.ZipFile(_file) + #TODO: check if this is actually an OpenXML file + #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically + # check each file within the zip if it is an OLE file, by reading its magic: + for subfile in z.namelist(): + magic = z.open(subfile).read(len(olefile.MAGIC)) + if magic == olefile.MAGIC: + log.debug('Opening OLE file %s within zip' % subfile) + ole_data = z.open(subfile).read() + try: + self.ole_subfiles.append( + VBA_Parser(filename=subfile, data=ole_data, + relaxed=self.relaxed)) + except OlevbaBaseException as exc: + if self.relaxed: + log.info('%s is not a valid OLE file (%s)' % (subfile, exc)) + log.debug('Trace:', exc_info=True) + continue + else: + raise SubstreamOpenError(self.filename, subfile, + exc) + z.close() + # set type only if parsing succeeds + self.type = TYPE_OpenXML 
+ except OlevbaBaseException as exc: + if self.relaxed: + log.info('Error {0} caught in Zip/OpenXML parsing for file {1}' + .format(exc, self.filename)) + log.debug('Trace:', exc_info=True) + else: + raise + except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc: + # TODO: handle parsing exceptions + log.info('Failed Zip/OpenXML parsing for file %r (%s)' + % (self.filename, exc)) + log.debug('Trace:', exc_info=True) + + def open_word2003xml(self, data): + """ + Open a Word 2003 XML file + :param data: file contents in a string or bytes + :return: nothing + """ + log.info('Opening Word 2003 XML file %s' % self.filename) + try: + # parse the XML content + # TODO: handle XML parsing exceptions + et = ET.fromstring(data) + # find all the binData elements: + for bindata in et.getiterator(TAG_BINDATA): + # the binData content is an OLE container for the VBA project, compressed + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. + # get the filename: + fname = bindata.get(ATTR_NAME, 'noname.mso') + # decode the base64 activemime + mso_data = binascii.a2b_base64(bindata.text) + if is_mso_file(mso_data): + # decompress the zlib data stored in the MSO file, which is the OLE container: + # TODO: handle different offsets => separate function + try: + ole_data = mso_file_extract(mso_data) + self.ole_subfiles.append( + VBA_Parser(filename=fname, data=ole_data, + relaxed=self.relaxed)) + except OlevbaBaseException as exc: + if self.relaxed: + log.info('Error parsing subfile {0}: {1}' + .format(fname, exc)) + log.debug('Trace:', exc_info=True) + else: + raise SubstreamOpenError(self.filename, fname, exc) + else: + log.info('%s is not a valid MSO file' % fname) + # set type only if parsing succeeds + self.type = TYPE_Word2003_XML + except OlevbaBaseException as exc: + if self.relaxed: + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) + log.debug('Trace:', exc_info=True) + else: + raise + except Exception as exc: 
+ # TODO: differentiate exceptions for each parsing stage + # (but ET is different libs, no good exception description in API) + # found: XMLSyntaxError + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) + log.debug('Trace:', exc_info=True) + + def open_mht(self, data): + """ + Open a MHTML file + :param data: file contents in a string or bytes + :return: nothing + """ + log.info('Opening MHTML file %s' % self.filename) + try: + if isinstance(data,bytes): + data = data.decode('utf8', 'replace') + # parse the MIME content + # remove any leading whitespace or newline (workaround for issue in email package) + stripped_data = data.lstrip('\r\n\t ') + # strip any junk from the beginning of the file + # (issue #31 fix by Greg C - gdigreg) + # TODO: improve keywords to avoid false positives + mime_offset = stripped_data.find('MIME') + content_offset = stripped_data.find('Content') + # if "MIME" is found, and located before "Content": + if -1 < mime_offset <= content_offset: + stripped_data = stripped_data[mime_offset:] + # else if "Content" is found, and before "MIME" + # TODO: can it work without "MIME" at all? + elif content_offset > -1: + stripped_data = stripped_data[content_offset:] + # TODO: quick and dirty fix: insert a standard line with MIME-Version header? + mhtml = email.message_from_string(stripped_data) + # find all the attached files: + for part in mhtml.walk(): + content_type = part.get_content_type() # always returns a value + fname = part.get_filename(None) # returns None if it fails + # TODO: get content-location if no filename + log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type)) + part_data = part.get_payload(decode=True) + # VBA macros are stored in a binary file named "editdata.mso". + # the data content is an OLE container for the VBA project, compressed + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. 
+ # decompress the zlib data starting at offset 0x32, which is the OLE container: + # check ActiveMime header: + + if (isinstance(part_data, str) or isinstance(part_data, bytes)) and is_mso_file(part_data): + log.debug('Found ActiveMime header, decompressing MSO container') + try: + ole_data = mso_file_extract(part_data) + + # TODO: check if it is actually an OLE file + # TODO: get the MSO filename from content_location? + self.ole_subfiles.append( + VBA_Parser(filename=fname, data=ole_data, + relaxed=self.relaxed)) + except OlevbaBaseException as exc: + if self.relaxed: + log.info('%s does not contain a valid OLE file (%s)' + % (fname, exc)) + log.debug('Trace:', exc_info=True) + # TODO: bug here - need to split in smaller functions/classes? + else: + raise SubstreamOpenError(self.filename, fname, exc) + else: + log.debug('type(part_data) = %s' % type(part_data)) + try: + log.debug('part_data[0:20] = %r' % part_data[0:20]) + except TypeError as err: + log.debug('part_data has no __getitem__') + # set type only if parsing succeeds + self.type = TYPE_MHTML + except OlevbaBaseException: + raise + except Exception: + log.info('Failed MIME parsing for file %r - %s' + % (self.filename, MSG_OLEVBA_ISSUES)) + log.debug('Trace:', exc_info=True) + + def open_ppt(self): + """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser + + Although self.ole_file is a valid olefile.OleFileIO, we set + self.ole_file = None in here and instead set self.ole_subfiles to the + VBA ole streams found within the main ole file. 
That makes most of the + code below treat this like an OpenXML file and only look at the + ole_subfiles (except find_vba_* which needs to explicitly check for + self.type) + """ + + log.info('Check whether OLE file is PPT') + ppt_parser.enable_logging() + try: + ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) + for vba_data in ppt.iter_vba_data(): + self.ole_subfiles.append(VBA_Parser(None, vba_data, + container='PptParser')) + log.info('File is PPT') + self.ole_file.close() # just in case + self.ole_file = None # required to make other methods look at ole_subfiles + self.type = TYPE_PPT + except Exception as exc: + if self.container == 'PptParser': + # this is a subfile of a ppt --> to be expected that is no ppt + log.debug('PPT subfile is not a PPT file') + else: + log.debug("File appears not to be a ppt file (%s)" % exc) + + + def open_text(self, data): + """ + Open a text file containing VBA or VBScript source code + :param data: file contents in a string or bytes + :return: nothing + """ + log.info('Opening text file %s' % self.filename) + # directly store the source code: + if isinstance(data,bytes): + data=data.decode('utf8','replace') + self.vba_code_all_modules = data + self.contains_macros = True + # set type only if parsing succeeds + self.type = TYPE_TEXT + + + def find_vba_projects(self): + """ + Finds all the VBA projects stored in an OLE file. + + Return None if the file is not OLE but OpenXML. + Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. + vba_root is the path of the root OLE storage containing the VBA project, + including a trailing slash unless it is the root of the OLE file. + project_path is the path of the OLE stream named "PROJECT" within the VBA project. + dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. + + If this function returns an empty list for one of the supported formats + (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros. 
+ + :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) + for each VBA project found if OLE file + """ + log.debug('VBA_Parser.find_vba_projects') + + # if the file is not OLE but OpenXML, return None: + if self.ole_file is None and self.type != TYPE_PPT: + return None + + # if this method has already been called, return previous result: + if self.vba_projects is not None: + return self.vba_projects + + # if this is a ppt file (PowerPoint 97-2003): + # self.ole_file is None but the ole_subfiles do contain vba_projects + # (like for OpenXML files). + if self.type == TYPE_PPT: + # TODO: so far, this function is never called for PPT files, but + # if that happens, the information is lost which ole file contains + # which storage! + log.warning('Returned info is not complete for PPT types!') + self.vba_projects = [] + for subfile in self.ole_subfiles: + self.vba_projects.extend(subfile.find_vba_projects()) + return self.vba_projects + + # Find the VBA project root (different in MS Word, Excel, etc): + # - Word 97-2003: Macros + # - Excel 97-2003: _VBA_PROJECT_CUR + # - PowerPoint 97-2003: PptParser has identified ole_subfiles + # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. 
+ # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word + # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word + # - Visio 2007: not supported yet (different file structure) + + # According to MS-OVBA section 2.2.1: + # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream + # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream + # - all names are case-insensitive + + def check_vba_stream(ole, vba_root, stream_path): + full_path = vba_root + stream_path + if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: + log.debug('Found %s stream: %s' % (stream_path, full_path)) + return full_path + else: + log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) + return False + + # start with an empty list: + self.vba_projects = [] + # Look for any storage containing those storage/streams: + ole = self.ole_file + for storage in ole.listdir(streams=False, storages=True): + log.debug('Checking storage %r' % storage) + # Look for a storage ending with "VBA": + if storage[-1].upper() == 'VBA': + log.debug('Found VBA storage: %s' % ('/'.join(storage))) + vba_root = '/'.join(storage[:-1]) + # Add a trailing slash to vba_root, unless it is the root of the OLE file: + # (used later to append all the child streams/storages) + if vba_root != '': + vba_root += '/' + log.debug('Checking vba_root="%s"' % vba_root) + + # Check if the VBA root storage also contains a PROJECT stream: + project_path = check_vba_stream(ole, vba_root, 'PROJECT') + if not project_path: continue + # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: + vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') + if not vba_project_path: continue + # Check if the VBA root storage also contains a VBA/dir stream: + dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') + if not dir_path: continue + # Now we are pretty sure it is a VBA project 
structure + log.debug('VBA root storage: "%s"' % vba_root) + # append the results to the list as a tuple for later use: + self.vba_projects.append((vba_root, project_path, dir_path)) + return self.vba_projects + + def detect_vba_macros(self): + """ + Detect the potential presence of VBA macros in the file, by checking + if it contains VBA projects. Both OLE and OpenXML files are supported. + + Important: for now, results are accurate only for Word, Excel and PowerPoint + + Note: this method does NOT attempt to check the actual presence or validity + of VBA macro source code, so there might be false positives. + It may also detect VBA macros in files embedded within the main file, + for example an Excel workbook with macros embedded into a Word + document without macros may be detected, without distinction. + + :return: bool, True if at least one VBA project has been found, False otherwise + """ + #TODO: return None or raise exception if format not supported + #TODO: return the number of VBA projects found instead of True/False? 
+        # if this method was already called, return the previous result:
+        if self.contains_macros is not None:
+            return self.contains_macros
+        # if OpenXML/PPT, check all the OLE subfiles:
+        if self.ole_file is None:
+            for ole_subfile in self.ole_subfiles:
+                if ole_subfile.detect_vba_macros():
+                    self.contains_macros = True
+                    return True
+            # otherwise, no macro found:
+            self.contains_macros = False
+            return False
+        # otherwise it's an OLE file, find VBA projects:
+        vba_projects = self.find_vba_projects()
+        if len(vba_projects) == 0:
+            self.contains_macros = False
+        else:
+            self.contains_macros = True
+        # Also look for VBA code in any stream including orphans
+        # (happens in some malformed files)
+        ole = self.ole_file
+        for sid in range(len(ole.direntries)):
+            # check if id is already done above:
+            log.debug('Checking DirEntry #%d' % sid)
+            d = ole.direntries[sid]
+            if d is None:
+                # this direntry is not part of the tree: either unused or an orphan
+                d = ole._load_direntry(sid)
+                log.debug('This DirEntry is an orphan or unused')
+            if d.entry_type == olefile.STGTY_STREAM:
+                # read data
+                log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
+                try:
+                    data = ole._open(d.isectStart, d.size).read()
+                    log.debug('Read %d bytes' % len(data))
+                    if len(data) > 200:
+                        log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
+                    else:
+                        log.debug(repr(data))
+                    if 'Attribut' in data.decode('utf-8', 'ignore'):
+                        log.debug('Found VBA compressed code')
+                        self.contains_macros = True
+                except IOError as exc:
+                    if self.relaxed:
+                        log.info('Error when reading OLE Stream %r' % d.name)
+                        log.debug('Trace:', exc_info=True)
+                    else:
+                        raise SubstreamOpenError(self.filename, d.name, exc)
+        return self.contains_macros
+
+    def extract_macros(self):
+        """
+        Extract and decompress source code for each VBA macro found in the file
+
+        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
+        If the file is OLE, filename is the path of the
file. + If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros + within the zip archive, e.g. word/vbaProject.bin. + If the file is PPT, result is as for OpenXML but filename is useless + """ + log.debug('extract_macros:') + if self.ole_file is None: + # This may be either an OpenXML/PPT or a text file: + if self.type == TYPE_TEXT: + # This is a text file, yield the full code: + yield (self.filename, '', self.filename, self.vba_code_all_modules) + else: + # OpenXML/PPT: recursively yield results from each OLE subfile: + for ole_subfile in self.ole_subfiles: + for results in ole_subfile.extract_macros(): + yield results + else: + # This is an OLE file: + self.find_vba_projects() + # set of stream ids + vba_stream_ids = set() + for vba_root, project_path, dir_path in self.vba_projects: + # extract all VBA macros from that VBA root storage: + for stream_path, vba_filename, vba_code in \ + _extract_vba(self.ole_file, vba_root, project_path, + dir_path, self.relaxed): + # store direntry ids in a set: + vba_stream_ids.add(self.ole_file._find(stream_path)) + yield (self.filename, stream_path, vba_filename, vba_code) + # Also look for VBA code in any stream including orphans + # (happens in some malformed files) + ole = self.ole_file + for sid in range(len(ole.direntries)): + # check if id is already done above: + log.debug('Checking DirEntry #%d' % sid) + if sid in vba_stream_ids: + log.debug('Already extracted') + continue + d = ole.direntries[sid] + if d is None: + # this direntry is not part of the tree: either unused or an orphan + d = ole._load_direntry(sid) + log.debug('This DirEntry is an orphan or unused') + if d.entry_type == olefile.STGTY_STREAM: + # read data + log.debug('Reading data from stream %r' % d.name) + data = ole._open(d.isectStart, d.size).read() + for match in re.finditer(b'\\x00Attribut[^e]', data, flags=re.IGNORECASE): + start = match.start() - 3 + log.debug('Found VBA compressed code at index %X' % start) + 
compressed_code = data[start:] + try: + vba_code = decompress_stream(compressed_code) + yield (self.filename, d.name, d.name, vba_code) + except Exception as exc: + # display the exception with full stack trace for debugging + log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc)) + log.debug('Traceback:', exc_info=True) + # do not raise the error, as it is unlikely to be a compressed macro stream + + def extract_all_macros(self): + """ + Extract and decompress source code for each VBA macro found in the file + by calling extract_macros(), store the results as a list of tuples + (filename, stream_path, vba_filename, vba_code) in self.modules. + See extract_macros for details. + """ + if self.modules is None: + self.modules = [] + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_macros(): + self.modules.append((subfilename, stream_path, vba_filename, vba_code)) + self.nb_macros = len(self.modules) + return self.modules + + + + def analyze_macros(self, show_decoded_strings=False, deobfuscate=False): + """ + runs extract_macros and analyze the source code of all VBA macros + found in the file. + """ + if self.detect_vba_macros(): + # if the analysis was already done, avoid doing it twice: + if self.analysis_results is not None: + return self.analysis_results + # variable to merge source code from all modules: + if self.vba_code_all_modules is None: + self.vba_code_all_modules = '' + for (_, _, _, vba_code) in self.extract_all_macros(): + #TODO: filter code? 
(each module)
+                self.vba_code_all_modules += vba_code.decode('utf-8', 'ignore') + '\n'
+            for (_, _, form_string) in self.extract_form_strings():
+                self.vba_code_all_modules += form_string.decode('utf-8', 'ignore') + '\n'
+            # Analyze the whole code at once:
+            scanner = VBA_Scanner(self.vba_code_all_modules)
+            self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
+            autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary()
+            self.nb_autoexec += autoexec
+            self.nb_suspicious += suspicious
+            self.nb_iocs += iocs
+            self.nb_hexstrings += hexstrings
+            self.nb_base64strings += base64strings
+            self.nb_dridexstrings += dridex
+            self.nb_vbastrings += vbastrings
+
+        return self.analysis_results
+
+
+    def reveal(self):
+        # we only want printable strings:
+        analysis = self.analyze_macros(show_decoded_strings=False)
+        # to avoid replacing short strings contained in longer strings, we sort
+        # the analysis results based on the length of the encoded string, in reverse order:
+        analysis = sorted(analysis, key=lambda type_decoded_encoded: len(type_decoded_encoded[2]), reverse=True)
+        # normally now self.vba_code_all_modules contains source code from all modules
+        deobf_code = self.vba_code_all_modules
+        for kw_type, decoded, encoded in analysis:
+            if kw_type == 'VBA string':
+                #print '%3d occurrences: %r => %r' % (deobf_code.count(encoded), encoded, decoded)
+                # need to add double quotes around the decoded strings
+                # after escaping double-quotes as double-double-quotes for VBA:
+                decoded = decoded.replace('"', '""')
+                deobf_code = deobf_code.replace(encoded, '"%s"' % decoded)
+        return deobf_code
+        #TODO: run the analysis again if hex or base64 strings are revealed
+
+
+    def find_vba_forms(self):
+        """
+        Finds all the VBA forms stored in an OLE file.
+
+        Return None if the file is not OLE but OpenXML.
+        Return a list of form storages (each a list of storage names) for each
+        VBA form found.
+
+        If this function returns an empty list for one of the supported formats
+        (i.e. Word, Excel, PowerPoint), then the file does not contain VBA forms.
+
+        :return: None if OpenXML file, list of form storage paths if OLE file
+        """
+        log.debug('VBA_Parser.find_vba_forms')
+
+        # if the file is not OLE but OpenXML, return None:
+        if self.ole_file is None and self.type != TYPE_PPT:
+            return None
+
+        # if this method has already been called, return previous result:
+        # if self.vba_projects is not None:
+        #     return self.vba_projects
+
+        # According to MS-OFORMS section 2.1.2 Control Streams:
+        # - A parent control, that is, a control that can contain embedded controls,
+        #   MUST be persisted as a storage that contains multiple streams.
+        # - All parent controls MUST contain a FormControl. The FormControl
+        #   properties are persisted to a stream (1) as specified in section 2.1.1.2.
+        #   The name of this stream (1) MUST be "f".
+        # - Embedded controls that cannot themselves contain other embedded
+        #   controls are persisted sequentially as FormEmbeddedActiveXControls
+        #   to a stream (1) contained in the same storage as the parent control.
+        #   The name of this stream (1) MUST be "o".
+        # - all names are case-insensitive
+
+        if self.type == TYPE_PPT:
+            # TODO: so far, this function is never called for PPT files, but
+            # if that happens, the information about which ole file contains
+            # which storage is lost!
+            ole_files = self.ole_subfiles
+            log.warning('Returned info is not complete for PPT types!')
+        else:
+            ole_files = [self.ole_file, ]
+
+        # start with an empty list:
+        self.vba_forms = []
+
+        # Loop over ole streams
+        for ole in ole_files:
+            # Look for any storage containing those storage/streams:
+            for storage in ole.listdir(streams=False, storages=True):
+                log.debug('Checking storage %r' % storage)
+                # Look for two streams named 'o' and 'f':
+                o_stream = storage + ['o']
+                f_stream = storage + ['f']
+                log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream))
+                if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \
+                and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM:
+                    form_path = '/'.join(storage)
+                    log.debug('Found VBA Form: %r' % form_path)
+                    self.vba_forms.append(storage)
+        return self.vba_forms
+
+    def extract_form_strings(self):
+        """
+        Extract printable strings from each VBA Form found in the file
+
+        Iterator: yields (filename, stream_path, form_string) for each printable
+        string found in a form
+        If the file is OLE, filename is the path of the file.
+        If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros
+        within the zip archive, e.g. word/vbaProject.bin.
+        If the file is PPT, results are as for OpenXML, but the filename is not meaningful
+        """
+        if self.ole_file is None:
+            # This may be either an OpenXML/PPT or a text file:
+            if self.type == TYPE_TEXT:
+                # This is a text file, return no results:
+                return
+            else:
+                # OpenXML/PPT: recursively yield results from each OLE subfile:
+                for ole_subfile in self.ole_subfiles:
+                    for results in ole_subfile.extract_form_strings():
+                        yield results
+        else:
+            # This is an OLE file:
+            self.find_vba_forms()
+            ole = self.ole_file
+            for form_storage in self.vba_forms:
+                o_stream = form_storage + ['o']
+                log.debug('Opening form object stream %r' % '/'.join(o_stream))
+                form_data = ole.openstream(o_stream).read()
+                # Extract printable strings from the form object stream "o":
+                for m in re_printable_string.finditer(form_data):
+                    log.debug('Printable string found in form: %r' % m.group())
+                    yield (self.filename, '/'.join(o_stream), m.group())
+
+
+    def close(self):
+        """
+        Close all the open files. This method must be called after usage, if
+        the application is opening many files.
+        """
+        if self.ole_file is None:
+            if self.ole_subfiles is not None:
+                for ole_subfile in self.ole_subfiles:
+                    ole_subfile.close()
+        else:
+            self.ole_file.close()
+
+
+
+class VBA_Parser_CLI(VBA_Parser):
+    """
+    VBA parser and analyzer, adding methods for the command line interface
+    of olevba. (see VBA_Parser)
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Constructor for VBA_Parser_CLI.
+        Calls __init__ from VBA_Parser with all arguments --> see doc there
+        """
+        super(VBA_Parser_CLI, self).__init__(*args, **kwargs)
+
+
+    def print_analysis(self, show_decoded_strings=False, deobfuscate=False):
+        """
+        Analyze the VBA code extracted from the file, and print the results in a table
+
+        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
+        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
+        :return: None
+        """
+        # print a waiting message only if the output is not redirected to a file:
+        if sys.stdout.isatty():
+            print('Analysis...\r')
+            sys.stdout.flush()
+        results = self.analyze_macros(show_decoded_strings, deobfuscate)
+        if results:
+            t = prettytable.PrettyTable(('Type', 'Keyword', 'Description'))
+            t.align = 'l'
+            t.max_width['Type'] = 10
+            t.max_width['Keyword'] = 20
+            t.max_width['Description'] = 39
+            for kw_type, keyword, description in results:
+                # handle non printable strings:
+                if not is_printable(keyword):
+                    keyword = repr(keyword)
+                if not is_printable(description):
+                    description = repr(description)
+                t.add_row((kw_type, keyword, description))
+            print(t)
+        else:
+            print('No suspicious keyword or IOC found.')
+
+    def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False):
+        """
+        Analyze the VBA code extracted from the file, and return the results in JSON format
+
+        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
+        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
+
+        :return: list of dicts (one per analysis result)
+        """
+        # print a waiting message only if the output is not redirected to a file:
+        if sys.stdout.isatty():
+            print('Analysis...\r')
+            sys.stdout.flush()
+        return [dict(type=kw_type, keyword=keyword, description=description)
+                for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)]
+
+    def process_file(self, show_decoded_strings=False,
+                     display_code=True, hide_attributes=True,
+                     vba_code_only=False, show_deobfuscated_code=False,
+                     deobfuscate=False):
+        """
+        Process a single file
+
+        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
+        :param display_code: bool, if False VBA source code is not displayed (default True)
+        :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default)
+        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
+        """
+        #TODO: replace print by writing to a provided output file (sys.stdout by default)
+        # fix conflicting parameters:
+        if vba_code_only and not display_code:
+            display_code = True
+        if self.container:
+            display_filename = '%s in %s' % (self.filename, self.container)
+        else:
+            display_filename = self.filename
+        print('=' * 79)
+        print('FILE:', display_filename)
+        try:
+            #TODO: handle olefile errors, when an OLE file is malformed
+            print('Type: %s' % self.type)
+            if self.detect_vba_macros():
+                #print 'Contains VBA Macros:'
+                for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros():
+                    if hide_attributes:
+                        # hide attribute lines:
+                        if isinstance(vba_code, bytes):
+                            vba_code = vba_code.decode('utf-8', 'replace')
+                        vba_code_filtered = filter_vba(vba_code)
+                    else:
+                        vba_code_filtered = vba_code
+                    print('-' * 79)
+                    print('VBA MACRO %s ' % vba_filename)
+                    print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path)))
+                    if display_code:
+                        print('- ' * 39)
+                        # detect empty macros:
+                        if vba_code_filtered.strip() == '':
+                            print('(empty macro)')
+                        else:
+                            print(vba_code_filtered)
+                for (subfilename, stream_path, form_string) in self.extract_form_strings():
+                    print('-' * 79)
+                    print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path))
+                    print('- ' * 39)
+                    print(form_string.decode('utf-8', 'ignore'))
+                if not vba_code_only:
+                    # analyse the code from all modules at once:
+                    
self.print_analysis(show_decoded_strings, deobfuscate)
+                if show_deobfuscated_code:
+                    print('MACRO SOURCE CODE WITH DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n')
+                    print(self.reveal())
+            else:
+                print('No VBA macros found.')
+        except OlevbaBaseException:
+            raise
+        except Exception as exc:
+            # display the exception with full stack trace for debugging
+            log.info('Error processing file %s (%s)' % (self.filename, exc))
+            log.debug('Traceback:', exc_info=True)
+            raise ProcessingError(self.filename, exc)
+        print('')
+
+
+    def process_file_json(self, show_decoded_strings=False,
+                          display_code=True, hide_attributes=True,
+                          vba_code_only=False, show_deobfuscated_code=False,
+                          deobfuscate=False):
+        """
+        Process a single file
+
+        every "show" or "print" option here adds to the JSON result instead of printing
+
+        :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content.
+        :param display_code: bool, if False VBA source code is not displayed (default True)
+        :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default)
+        :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow)
+        """
+        #TODO: fix conflicting parameters (?)
+ + if vba_code_only and not display_code: + display_code = True + + result = {} + + if self.container: + result['container'] = self.container + else: + result['container'] = None + result['file'] = self.filename + result['json_conversion_successful'] = False + result['analysis'] = None + result['code_deobfuscated'] = None + result['do_deobfuscate'] = deobfuscate + + try: + #TODO: handle olefile errors, when an OLE file is malformed + result['type'] = self.type + macros = [] + if self.detect_vba_macros(): + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): + curr_macro = {} + if hide_attributes: + # hide attribute lines: + vba_code_filtered = filter_vba(vba_code.decode('utf-8','replace')) + else: + vba_code_filtered = vba_code + + curr_macro['vba_filename'] = vba_filename + curr_macro['subfilename'] = subfilename + curr_macro['ole_stream'] = stream_path + if display_code: + curr_macro['code'] = vba_code_filtered.strip() + else: + curr_macro['code'] = None + macros.append(curr_macro) + if not vba_code_only: + # analyse the code from all modules at once: + result['analysis'] = self.print_analysis_json(show_decoded_strings, + deobfuscate) + if show_deobfuscated_code: + result['code_deobfuscated'] = self.reveal() + result['macros'] = macros + result['json_conversion_successful'] = True + except Exception as exc: + # display the exception with full stack trace for debugging + log.info('Error processing file %s (%s)' % (self.filename, exc)) + log.debug('Traceback:', exc_info=True) + raise ProcessingError(self.filename, exc) + + return result + + + def process_file_triage(self, show_decoded_strings=False, deobfuscate=False): + """ + Process a file in triage mode, showing only summary results on one line. 
+ """ + #TODO: replace print by writing to a provided output file (sys.stdout by default) + try: + #TODO: handle olefile errors, when an OLE file is malformed + if self.detect_vba_macros(): + # print a waiting message only if the output is not redirected to a file: + if sys.stdout.isatty(): + print('Analysis...\r') + sys.stdout.flush() + self.analyze_macros(show_decoded_strings=show_decoded_strings, + deobfuscate=deobfuscate) + flags = TYPE2TAG[self.type] + macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-' + if self.contains_macros: macros = 'M' + if self.nb_autoexec: autoexec = 'A' + if self.nb_suspicious: suspicious = 'S' + if self.nb_iocs: iocs = 'I' + if self.nb_hexstrings: hexstrings = 'H' + if self.nb_base64strings: base64obf = 'B' + if self.nb_dridexstrings: dridex = 'D' + if self.nb_vbastrings: vba_obf = 'V' + flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, + base64obf, dridex, vba_obf) + + line = '%-12s %s' % (flags, self.filename) + print(line) + + # old table display: + # macros = autoexec = suspicious = iocs = hexstrings = 'no' + # if nb_macros: macros = 'YES:%d' % nb_macros + # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec + # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious + # if nb_iocs: iocs = 'YES:%d' % nb_iocs + # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings + # # 2nd line = info + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings) + except Exception as exc: + # display the exception with full stack trace for debugging only + log.debug('Error processing file %s (%s)' % (self.filename, exc), + exc_info=True) + raise ProcessingError(self.filename, exc) + + + # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), + # header=False, border=False) + # t.align = 'l' + # t.max_width['filename'] = 30 + # t.max_width['type'] = 10 + # t.max_width['macros'] = 6 + # 
t.max_width['autoexec'] = 6 + # t.max_width['suspicious'] = 6 + # t.max_width['ioc'] = 6 + # t.max_width['hexstrings'] = 6 + # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) + # print t + + +#=== MAIN ===================================================================== + +def main(): + """ + Main function, called when olevba is run from the command line + """ + DEFAULT_LOG_LEVEL = "warning" # Default log level + LOG_LEVELS = { + 'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL + } + + usage = 'usage: %prog [options] [filename2 ...]' + parser = optparse.OptionParser(usage=usage) + # parser.add_option('-o', '--outfile', dest='outfile', + # help='output file') + # parser.add_option('-c', '--csv', dest='csv', + # help='export results to a CSV file') + parser.add_option("-r", action="store_true", dest="recursive", + help='find files recursively in subdirectories.') + parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, + help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') + parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', + help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') + # output mode; could make this even simpler with add_option(type='choice') but that would make + # cmd line interface incompatible... 
+ modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)') + modes.add_option("-t", '--triage', action="store_const", dest="output_mode", + const='triage', default='unspecified', + help='triage mode, display results as a summary table (default for multiple files)') + modes.add_option("-d", '--detailed', action="store_const", dest="output_mode", + const='detailed', default='unspecified', + help='detailed mode, display full results (default for single file)') + modes.add_option("-j", '--json', action="store_const", dest="output_mode", + const='json', default='unspecified', + help='json mode, detailed in json format (never default)') + parser.add_option_group(modes) + parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True, + help='display only analysis results, not the macro source code') + parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False, + help='display only VBA source code, do not analyze it') + parser.add_option("--decode", action="store_true", dest="show_decoded_strings", + help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).') + parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True, + help='display the attribute lines at the beginning of VBA source code') + parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code", + help='display the macro source code after replacing all the obfuscated strings by their decoded content.') + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical (default=%default)") + parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False, + help="Attempt to deobfuscate VBA expressions (slow)") + parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False, + help="Do not raise errors if 
opening of substream fails") + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + print(__doc__) + parser.print_help() + sys.exit(RETURN_WRONG_ARGS) + + # provide info about tool and its version + if options.output_mode == 'json': + # prints opening [ + print_json(script_name='olevba', version=__version__, + url='http://decalage.info/python/oletools', + type='MetaInformation') + else: + print('olevba %s - http://decalage.info/python/oletools' % __version__) + + logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') + # enable logging in the modules: + log.setLevel(logging.NOTSET) + + # Old display with number of items detected: + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr') + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7) + + # with the option --reveal, make sure --deobf is also enabled: + if options.show_deobfuscated_code and not options.deobfuscate: + log.info('set --deobf because --reveal was set') + options.deobfuscate = True + if options.output_mode == 'triage' and options.show_deobfuscated_code: + log.info('ignoring option --reveal in triage output mode') + + # Column headers (do not know how many files there will be yet, so if no output_mode + # was specified, we will print triage for first file --> need these headers) + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %-65s' % ('Flags', 'Filename')) + print('%-12s %-65s' % ('-' * 11, '-' * 65)) + + previous_container = None + count = 0 + container = filename = data = None + vba_parser = None + return_code = RETURN_OK + try: + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, + zip_password=options.zip_password, zip_fname=options.zip_fname): + # ignore directory names stored in zip files: + if container and filename.endswith('/'): + continue + + # handle errors from xglob 
+ if isinstance(data, Exception): + if isinstance(data, PathNotFoundException): + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - File not found' % ('?', filename)) + elif options.output_mode != 'json': + log.error('Given path %r does not exist!' % filename) + return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ + else RETURN_SEVERAL_ERRS + else: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container)) + elif options.output_mode != 'json': + log.error('Exception opening/reading %r from zip file %r: %s' + % (filename, container, data)) + return_code = RETURN_XGLOB_ERR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + if options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(data).__name__, message=str(data)) + continue + + try: + # Open the file + vba_parser = VBA_Parser_CLI(filename, data=data, container=container, + relaxed=options.relaxed) + + if options.output_mode == 'detailed': + # fully detailed output + vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, + display_code=options.display_code, + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, + show_deobfuscated_code=options.show_deobfuscated_code, + deobfuscate=options.deobfuscate) + elif options.output_mode in ('triage', 'unspecified'): + # print container name when it changes: + if container != previous_container: + if container is not None: + print('\nFiles in %s:' % container) + previous_container = container + # summarized output for triage: + vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, + deobfuscate=options.deobfuscate) + elif options.output_mode == 'json': + print_json( + vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, + display_code=options.display_code, + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, + 
show_deobfuscated_code=options.show_deobfuscated_code, + deobfuscate=options.deobfuscate)) + else: # (should be impossible) + raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode)) + count += 1 + + except (SubstreamOpenError, UnexpectedDataError) as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - Error opening substream or unexpected ' \ + 'content' % ('?', filename)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, message=str(exc)) + else: + log.exception('Error opening substream or unexpected ' + 'content in %s' % filename) + return_code = RETURN_OPEN_ERROR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + except FileOpenError as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - File format not supported' % ('?', filename)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, message=str(exc)) + else: + log.exception('Failed to open %s -- probably not supported!' % filename) + return_code = RETURN_OPEN_ERROR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + except ProcessingError as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, + message=str(exc.orig_exc)) + else: + log.exception('Error processing file %s (%s)!' 
+ % (filename, exc.orig_exc)) + return_code = RETURN_PARSE_ERROR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + finally: + if vba_parser is not None: + vba_parser.close() + + if options.output_mode == 'triage': + print('\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \ + 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ + 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n') + + if count == 1 and options.output_mode == 'unspecified': + # if options -t, -d and -j were not specified and it's a single file, print details: + vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, + display_code=options.display_code, + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, + show_deobfuscated_code=options.show_deobfuscated_code, + deobfuscate=options.deobfuscate) + + if options.output_mode == 'json': + # print last json entry (a last one without a comma) and closing ] + print_json(type='MetaInformation', return_code=return_code, + n_processed=count, _json_is_last=True) + + except Exception as exc: + # some unexpected error, maybe some of the types caught in except clauses + # above were not sufficient. This is very bad, so log complete trace at exception level + # and do not care about output mode + log.exception('Unhandled exception in main: %s' % exc, exc_info=True) + return_code = RETURN_UNEXPECTED # even if there were others before -- this is more important + # TODO: print msg with URL to report issues (except in JSON mode) + + # done. 
exit + log.debug('will exit now with code %s' % return_code) + sys.exit(return_code) + +if __name__ == '__main__': + main() + +# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness diff -Nru remnux-oletools-0.51a/remnux-oletools/olevba.py remnux-oletools-0.51a/remnux-oletools/olevba.py --- remnux-oletools-0.51a/remnux-oletools/olevba.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/olevba.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,3417 @@ +#!/usr/bin/env python +""" +olevba.py + +olevba is a script to parse OLE and OpenXML files such as MS Office documents +(e.g. Word, Excel), to extract VBA Macro code in clear text, deobfuscate +and analyze malicious macros. + +Supported formats: +- Word 97-2003 (.doc, .dot), Word 2007+ (.docm, .dotm) +- Excel 97-2003 (.xls), Excel 2007+ (.xlsm, .xlsb) +- PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) +- Word 2003 XML (.xml) +- Word/Excel Single File Web Page / MHTML (.mht) +- Publisher (.pub) + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation + +olevba is part of the python-oletools package: +http://www.decalage.info/python/oletools + +olevba is based on source code from officeparser by John William Davison +https://github.com/unixfreak0037/officeparser +""" + +# === LICENSE ================================================================== + +# olevba is copyright (c) 2014-2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# olevba contains modified source code from the officeparser project, published +# under the following MIT License (MIT): +# +# officeparser is copyright (c) 2014 John William Davison +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import print_function + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2014-08-05 v0.01 PL: - first version based on officeparser code +# 2014-08-14 v0.02 PL: - fixed bugs in code, added license from officeparser +# 2014-08-15 PL: - fixed incorrect value check in projecthelpfilepath Record +# 2014-08-15 v0.03 PL: - refactored extract_macros to support OpenXML formats +# and to find the VBA project root anywhere in the file +# 2014-11-29 v0.04 PL: - use olefile instead of OleFileIO_PL +# 2014-12-05 v0.05 PL: - refactored most functions into a class, new API +# - added detect_vba_macros +# 2014-12-10 v0.06 PL: - hide first lines with VB attributes +# - detect auto-executable macros +# - ignore empty macros +# 2014-12-14 v0.07 PL: - detect_autoexec() is now case-insensitive +# 2014-12-15 v0.08 PL: - improved display for empty macros +# - added pattern extraction +# 2014-12-25 v0.09 PL: - added suspicious keywords detection +# 2014-12-27 v0.10 PL: - added OptionParser, main and process_file +# - uses xglob to scan several files with wildcards +# - option -r to recurse subdirectories +# - option -z to scan files in password-protected zips +# 2015-01-02 v0.11 PL: - improved filter_vba to detect colons +# 2015-01-03 v0.12 PL: - fixed detect_patterns to detect all patterns +# - process_file: improved display, shows container file +# - improved list of executable file extensions +# 2015-01-04 v0.13 PL: - added several suspicious keywords, improved display +# 2015-01-08 v0.14 PL: - added hex strings detection and decoding +# - fixed issue #2, decoding VBA stream names using +# specified codepage and unicode stream names +# 2015-01-11 v0.15 
PL: - added new triage mode, options -t and -d +# 2015-01-16 v0.16 PL: - fix for issue #3 (exception when module name="text") +# - added several suspicious keywords +# - added option -i to analyze VBA source code directly +# 2015-01-17 v0.17 PL: - removed .com from the list of executable extensions +# - added scan_vba to run all detection algorithms +# - decoded hex strings are now also scanned + reversed +# 2015-01-23 v0.18 PL: - fixed issue #3, case-insensitive search in code_modules +# 2015-01-24 v0.19 PL: - improved the detection of IOCs obfuscated with hex +# strings and StrReverse +# 2015-01-26 v0.20 PL: - added option --hex to show all hex strings decoded +# 2015-01-29 v0.21 PL: - added Dridex obfuscation decoding +# - improved display, shows obfuscation name +# 2015-02-01 v0.22 PL: - fixed issue #4: regex for URL, e-mail and exe filename +# - added Base64 obfuscation decoding (contribution from +# @JamesHabben) +# 2015-02-03 v0.23 PL: - triage now uses VBA_Scanner results, shows Base64 and +# Dridex strings +# - exception handling in detect_base64_strings +# 2015-02-07 v0.24 PL: - renamed option --hex to --decode, fixed display +# - display exceptions with stack trace +# - added several suspicious keywords +# - improved Base64 detection and decoding +# - fixed triage mode not to scan attrib lines +# 2015-03-04 v0.25 PL: - added support for Word 2003 XML +# 2015-03-22 v0.26 PL: - added suspicious keywords for sandboxing and +# virtualisation detection +# 2015-05-06 v0.27 PL: - added support for MHTML files with VBA macros +# (issue #10 reported by Greg from SpamStopsHere) +# 2015-05-24 v0.28 PL: - improved support for MHTML files with modified header +# (issue #11 reported by Thomas Chopitea) +# 2015-05-26 v0.29 PL: - improved MSO files parsing, taking into account +# various data offsets (issue #12) +# - improved detection of MSO files, avoiding incorrect +# parsing errors (issue #7) +# 2015-05-29 v0.30 PL: - added suspicious keywords suggested by 
@ozhermit, +# Davy Douhine (issue #9), issue #13 +# 2015-06-16 v0.31 PL: - added generic VBA expression deobfuscation (chr,asc,etc) +# 2015-06-19 PL: - added options -a, -c, --each, --attr +# 2015-06-21 v0.32 PL: - always display decoded strings which are printable +# - fix VBA_Scanner.scan to return raw strings, not repr() +# 2015-07-09 v0.40 PL: - removed usage of sys.stderr which causes issues +# 2015-07-12 PL: - added Hex function decoding to VBA Parser +# 2015-07-13 PL: - added Base64 function decoding to VBA Parser +# 2015-09-06 PL: - improved VBA_Parser, refactored the main functions +# 2015-09-13 PL: - moved main functions to a class VBA_Parser_CLI +# - fixed issue when analysis was done twice +# 2015-09-15 PL: - remove duplicate IOCs from results +# 2015-09-16 PL: - join long VBA lines ending with underscore before scan +# - disabled unused option --each +# 2015-09-22 v0.41 PL: - added new option --reveal +# - added suspicious strings for PowerShell.exe options +# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method +# 2015-10-10 PL: - added support for text files with VBA source code +# 2015-11-17 PL: - fixed bug with --decode option +# 2015-12-16 PL: - fixed bug in main (no options input anymore) +# - improved logging, added -l option +# 2016-01-31 PL: - fixed issue #31 in VBA_Parser.open_mht +# - fixed issue #32 by monkeypatching email.feedparser +# 2016-02-07 PL: - KeyboardInterrupt is now raised properly +# 2016-02-20 v0.43 PL: - fixed issue #34 in the VBA parser and vba_chr +# 2016-02-29 PL: - added Workbook_Activate to suspicious keywords +# 2016-03-08 v0.44 PL: - added VBA Form strings extraction and analysis +# 2016-03-04 v0.45 CH: - added JSON output (by Christian Herdtweck) +# 2016-03-16 CH: - added option --no-deobfuscate (temporary) +# 2016-04-19 v0.46 PL: - new option --deobf instead of --no-deobfuscate +# - updated suspicious keywords +# 2016-05-04 v0.47 PL: - look for VBA code in any stream including orphans +# 
2016-04-28 CH: - return an exit code depending on the results +# - improved error and exception handling +# - improved JSON output +# 2016-05-12 CH: - added support for PowerPoint 97-2003 files +# 2016-06-06 CH: - improved handling of unicode VBA module names +# 2016-06-07 CH: - added option --relaxed, stricter parsing by default +# 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code +# 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6 +# 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding) +# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted +# - detect_autoexec now returns the exact keyword found +# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub) +# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6 +# 2016-09-12 PL: - enabled packrat to improve pyparsing performance +# 2016-10-25 PL: - fixed raise and print statements for Python 3 +# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW + +__version__ = '0.51a' + +#------------------------------------------------------------------------------ +# TODO: +# + setup logging (common with other oletools) +# + add xor bruteforcing like bbharvest +# + options -a and -c should imply -d + +# TODO later: +# + performance improvement: instead of searching each keyword separately, +# first split vba code into a list of words (per line), then check each +# word against a dict. (or put vba words into a set/dict?) +# + for regex, maybe combine them into a single re with named groups? +# + add Yara support, include sample rules? plugins like balbuzard? +# + add balbuzard support +# + output to file (replace print by file.write, sys.stdout by default) +# + look for VBA in embedded documents (e.g. 
Excel in Word) +# + support SRP streams (see Lenny's article + links and sample) +# - python 3.x support +# - check VBA macros in Visio, Access, Project, etc +# - extract_macros: convert to a class, split long function into smaller methods +# - extract_macros: read bytes from stream file objects instead of strings +# - extract_macros: use combined struct.unpack instead of many calls +# - all except clauses should target specific exceptions + +#------------------------------------------------------------------------------ +# REFERENCES: +# - [MS-OVBA]: Microsoft Office VBA File Format Structure +# http://msdn.microsoft.com/en-us/library/office/cc313094%28v=office.12%29.aspx +# - officeparser: https://github.com/unixfreak0037/officeparser + + +#--- IMPORTS ------------------------------------------------------------------ + +import sys, logging +import struct +import cStringIO +import math +import zipfile +import re +import optparse +import binascii +import base64 +import zlib +import email # for MHTML parsing +import string # for printable +import json # for json output mode (argument --json) + +# import lxml or ElementTree for XML parsing: +try: + # lxml: best performance for XML processing + import lxml.etree as ET +except ImportError: + try: + # Python 2.5+: batteries included + import xml.etree.cElementTree as ET + except ImportError: + try: + # Python <2.5: standalone ElementTree install + import elementtree.cElementTree as ET + except ImportError: + raise ImportError("lxml or ElementTree are not installed, " \ + + "see http://codespeak.net/lxml " \ + + "or http://effbot.org/zone/element-index.htm") + +import thirdparty.olefile as olefile +from thirdparty.prettytable import prettytable +from thirdparty.xglob import xglob, PathNotFoundException +from thirdparty.pyparsing.pyparsing import \ + CaselessKeyword, CaselessLiteral, Combine, Forward, Literal, \ + Optional, QuotedString,Regex, Suppress, Word, WordStart, \ + alphanums, alphas, hexnums,nums, opAssoc, 
srange, \ + infixNotation, ParserElement +import ppt_parser + +# monkeypatch email to fix issue #32: +# allow header lines without ":" +import email.feedparser +email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') + +if sys.version_info[0] <= 2: + # Python 2.x + if sys.version_info[1] <= 6: + # Python 2.6 + # use is_zipfile backported from Python 2.7: + from thirdparty.zipfile27 import is_zipfile + else: + # Python 2.7 + from zipfile import is_zipfile +else: + # Python 3.x+ + from zipfile import is_zipfile + +# === LOGGING ================================================================= + +class NullHandler(logging.Handler): + """ + Log Handler without output, to avoid printing messages if logging is not + configured by the main application. + Python 2.7 has logging.NullHandler, but this is necessary for 2.6: + see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library + """ + def emit(self, record): + pass + +def get_logger(name, level=logging.CRITICAL+1): + """ + Create a suitable logger object for this module. + The goal is not to change settings of the root logger, to avoid getting + other modules' logs on the screen. + If a logger exists with same name, reuse it. (Else it would have duplicate + handlers and messages would be doubled.) + The level is set to CRITICAL+1 by default, to avoid any logging. + """ + # First, test if there is already a logger with the same name, else it + # will generate duplicate messages (due to duplicate handlers): + if name in logging.Logger.manager.loggerDict: + #NOTE: another less intrusive but more "hackish" solution would be to + # use getLogger then test if its effective level is not default. 
+ logger = logging.getLogger(name) + # make sure level is OK: + logger.setLevel(level) + return logger + # get a new logger: + logger = logging.getLogger(name) + # only add a NullHandler for this logger, it is up to the application + # to configure its own logging: + logger.addHandler(NullHandler()) + logger.setLevel(level) + return logger + +# a global logger object used for debugging: +log = get_logger('olevba') + + +#=== EXCEPTIONS ============================================================== + +class OlevbaBaseException(Exception): + """ Base class for exceptions produced here for simpler except clauses """ + def __init__(self, msg, filename=None, orig_exc=None, **kwargs): + if orig_exc: + super(OlevbaBaseException, self).__init__(msg + + ' ({0})'.format(orig_exc), + **kwargs) + else: + super(OlevbaBaseException, self).__init__(msg, **kwargs) + self.msg = msg + self.filename = filename + self.orig_exc = orig_exc + + +class FileOpenError(OlevbaBaseException): + """ raised by VBA_Parser constructor if all open_... 
attempts failed + + probably means the file type is not supported + """ + + def __init__(self, filename, orig_exc=None): + super(FileOpenError, self).__init__( + 'Failed to open file %s' % filename, filename, orig_exc) + + +class ProcessingError(OlevbaBaseException): + """ raised by VBA_Parser.process_file* functions """ + + def __init__(self, filename, orig_exc): + super(ProcessingError, self).__init__( + 'Error processing file %s' % filename, filename, orig_exc) + + +class MsoExtractionError(RuntimeError, OlevbaBaseException): + """ raised by mso_file_extract if parsing MSO/ActiveMIME data failed """ + + def __init__(self, msg): + RuntimeError.__init__(self, msg) + OlevbaBaseException.__init__(self, msg) + + +class SubstreamOpenError(FileOpenError): + """ special kind of FileOpenError: file is a substream of original file """ + + def __init__(self, filename, subfilename, orig_exc=None): + super(SubstreamOpenError, self).__init__( + str(filename) + '/' + str(subfilename), orig_exc) + self.filename = filename # overwrite setting in OlevbaBaseException + self.subfilename = subfilename + + +class UnexpectedDataError(OlevbaBaseException): + """ raised when parsing is strict (=not relaxed) and data is unexpected """ + + def __init__(self, stream_path, variable, expected, value): + super(UnexpectedDataError, self).__init__( + 'Unexpected value in {0} for variable {1}: ' + 'expected {2:04X} but found {3:04X}!' 
+ .format(stream_path, variable, expected, value)) + self.stream_path = stream_path + self.variable = variable + self.expected = expected + self.value = value + +#--- CONSTANTS ---------------------------------------------------------------- + +# return codes +RETURN_OK = 0 +RETURN_WARNINGS = 1 # (reserved, not used yet) +RETURN_WRONG_ARGS = 2 # (fixed, built into optparse) +RETURN_FILE_NOT_FOUND = 3 +RETURN_XGLOB_ERR = 4 +RETURN_OPEN_ERROR = 5 +RETURN_PARSE_ERROR = 6 +RETURN_SEVERAL_ERRS = 7 +RETURN_UNEXPECTED = 8 + +# MAC codepages (from http://stackoverflow.com/questions/1592925/decoding-mac-os-text-in-python) +MAC_CODEPAGES = { + 10000: 'mac-roman', + 10001: 'shiftjis', # not found: 'mac-shift-jis', + 10003: 'ascii', # nothing appropriate found: 'mac-hangul', + 10008: 'gb2312', # not found: 'mac-gb2312', + 10002: 'big5', # not found: 'mac-big5', + 10005: 'hebrew', # not found: 'mac-hebrew', + 10004: 'mac-arabic', + 10006: 'mac-greek', + 10081: 'mac-turkish', + 10021: 'thai', # not found: 'mac-thai', + 10029: 'maccentraleurope', # not found: 'mac-east europe', + 10007: 'ascii', # nothing appropriate found: 'mac-russian', +} + +# URL and message to report issues: +URL_OLEVBA_ISSUES = 'https://github.com/decalage2/oletools/issues' +MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES + +# Container types: +TYPE_OLE = 'OLE' +TYPE_OpenXML = 'OpenXML' +TYPE_Word2003_XML = 'Word2003_XML' +TYPE_MHTML = 'MHTML' +TYPE_TEXT = 'Text' +TYPE_PPT = 'PPT' + +# short tag to display file types in triage mode: +TYPE2TAG = { + TYPE_OLE: 'OLE:', + TYPE_OpenXML: 'OpX:', + TYPE_Word2003_XML: 'XML:', + TYPE_MHTML: 'MHT:', + TYPE_TEXT: 'TXT:', + TYPE_PPT: 'PPT', +} + + +# MSO files ActiveMime header magic +MSO_ACTIVEMIME_HEADER = 'ActiveMime' + +MODULE_EXTENSION = "bas" +CLASS_EXTENSION = "cls" +FORM_EXTENSION = "frm" + +# Namespaces and tags for Word2003 XML parsing: +NS_W = '{http://schemas.microsoft.com/office/word/2003/wordml}' +# the tag contains the VBA macro 
code: +TAG_BINDATA = NS_W + 'binData' +ATTR_NAME = NS_W + 'name' + +# Keywords to detect auto-executable macros +AUTOEXEC_KEYWORDS = { + # MS Word: + 'Runs when the Word document is opened': + ('AutoExec', 'AutoOpen', 'DocumentOpen'), + 'Runs when the Word document is closed': + ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'), + 'Runs when the Word document is modified': + ('DocumentChange',), + 'Runs when a new Word document is created': + ('AutoNew', 'Document_New', 'NewDocument'), + + # MS Word and Publisher: + 'Runs when the Word or Publisher document is opened': + ('Document_Open',), + 'Runs when the Publisher document is closed': + ('Document_BeforeClose',), + + # MS Excel: + 'Runs when the Excel Workbook is opened': + ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'), + 'Runs when the Excel Workbook is closed': + ('Auto_Close', 'Workbook_Close'), + + # any MS Office application: + 'Runs when the file is opened (using InkPicture ActiveX object)': + # ref:https://twitter.com/joe4security/status/770691099988025345 + (r'\w+_Painted',), + 'Runs when the file is opened and ActiveX objects trigger events': + (r'\w+_(?:GotFocus|LostFocus|MouseHover)',), +} + +# Suspicious Keywords that may be used by malware +# See VBA language reference: http://msdn.microsoft.com/en-us/library/office/jj692818%28v=office.15%29.aspx +SUSPICIOUS_KEYWORDS = { + #TODO: use regex to support variable whitespaces + 'May read system environment variables': + ('Environ',), + 'May open a file': + ('Open',), + 'May write to a file (if combined with Open)': + #TODO: regex to find Open+Write on same line + ('Write', 'Put', 'Output', 'Print #'), + 'May read or write a binary file (if combined with Open)': + #TODO: regex to find Open+Binary on same line + ('Binary',), + 'May copy a file': + ('FileCopy', 'CopyFile'), + #FileCopy: http://msdn.microsoft.com/en-us/library/office/gg264390%28v=office.15%29.aspx + #CopyFile: 
http://msdn.microsoft.com/en-us/library/office/gg264089%28v=office.15%29.aspx + 'May delete a file': + ('Kill',), + 'May create a text file': + ('CreateTextFile', 'ADODB.Stream', 'WriteText', 'SaveToFile'), + #CreateTextFile: http://msdn.microsoft.com/en-us/library/office/gg264617%28v=office.15%29.aspx + #ADODB.Stream sample: http://pastebin.com/Z4TMyuq6 + 'May run an executable file or a system command': + ('Shell', 'vbNormal', 'vbNormalFocus', 'vbHide', 'vbMinimizedFocus', 'vbMaximizedFocus', 'vbNormalNoFocus', + 'vbMinimizedNoFocus', 'WScript.Shell', 'Run', 'ShellExecute'), + #Shell: http://msdn.microsoft.com/en-us/library/office/gg278437%28v=office.15%29.aspx + #WScript.Shell+Run sample: http://pastebin.com/Z4TMyuq6 + 'May run PowerShell commands': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + #also: https://bitbucket.org/decalage/oletools/issues/14/olevba-library-update-ioc + # ref: https://blog.netspi.com/15-ways-to-bypass-the-powershell-execution-policy/ + # TODO: add support for keywords starting with a non-alpha character, such as "-noexit" + # TODO: '-command', '-EncodedCommand', '-scriptblock' + ('PowerShell', 'noexit', 'ExecutionPolicy', 'noprofile', 'command', 'EncodedCommand', + 'invoke-command', 'scriptblock', 'Invoke-Expression', 'AuthorizationManager'), + 'May run an executable file or a system command using PowerShell': + ('Start-Process',), + 'May hide the application': + ('Application.Visible', 'ShowWindow', 'SW_HIDE'), + 'May create a directory': + ('MkDir',), + 'May save the current workbook': + ('ActiveWorkbook.SaveAs',), + 'May change which directory contains files to open at startup': + #TODO: confirm the actual effect + ('Application.AltStartupPath',), + 'May create an OLE object': + ('CreateObject',), + 'May create an OLE object using PowerShell': + ('New-Object',), + 'May run an application (if combined with CreateObject)': + ('Shell.Application',), + 'May enumerate application windows (if combined 
with Shell.Application object)': + ('Windows', 'FindWindow'), + 'May run code from a DLL': + #TODO: regex to find declare+lib on same line + ('Lib',), + 'May inject code into another process': + ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload + 'VirtualAllocEx', 'RtlMoveMemory', + ), + 'May run a shellcode in memory': + ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016 + 'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx + 'May download files from the Internet': + #TODO: regex to find urlmon+URLDownloadToFileA on same line + ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP', + 'MSXML2.ServerXMLHTTP', # suggested in issue #13 + 'User-Agent', # sample from @ozhermit: http://pastebin.com/MPc3iV6z + ), + 'May download files from the Internet using PowerShell': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + ('Net.WebClient', 'DownloadFile', 'DownloadString'), + 'May control another application by simulating user keystrokes': + ('SendKeys', 'AppActivate'), + #SendKeys: http://msdn.microsoft.com/en-us/library/office/gg278655%28v=office.15%29.aspx + 'May attempt to obfuscate malicious function calls': + ('CallByName',), + #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx + 'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)': + #TODO: regex to find several Chr*, not just one + ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'), + #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx + 'May read or write registry keys': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + ('RegOpenKeyExA', 'RegOpenKeyEx', 'RegCloseKey'), + 'May read registry keys': + #sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + ('RegQueryValueExA', 'RegQueryValueEx', + 
'RegRead', #with Wscript.Shell + ), + 'May detect virtualization': + # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + (r'SYSTEM\ControlSet001\Services\Disk\Enum', 'VIRTUAL', 'VMWARE', 'VBOX'), + 'May detect Anubis Sandbox': + # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + # NOTES: this sample also checks App.EXEName but that seems to be a bug, it works in VB6 but not in VBA + # ref: http://www.syssec-project.eu/m/page-media/3/disarm-raid11.pdf + ('GetVolumeInformationA', 'GetVolumeInformation', # with kernel32.dll + '1824245000', r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\ProductId', + '76487-337-8429955-22614', 'andy', 'sample', r'C:\exec\exec.exe', 'popupkiller' + ), + 'May detect Sandboxie': + # sample: https://malwr.com/analysis/M2NjZWNmMjA0YjVjNGVhYmJlZmFhNWY4NmQxZDllZTY/ + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('SbieDll.dll', 'SandboxieControlWndClass'), + 'May detect Sunbelt Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + (r'C:\file.exe',), + 'May detect Norman Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('currentuser',), + 'May detect CW Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('Schmidti',), + 'May detect WinJail Sandbox': + # ref: http://www.cplusplus.com/forum/windows/96874/ + ('Afx:400000:0',), +} + +# Regular Expression for a URL: +# http://en.wikipedia.org/wiki/Uniform_resource_locator +# http://www.w3.org/Addressing/URL/uri-spec.html +#TODO: also support username:password@server +#TODO: other protocols (file, gopher, wais, ...?) +SCHEME = r'\b(?:http|ftp)s?' +# see http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains +TLD = r'(?:xn--[a-zA-Z0-9]{4,20}|[a-zA-Z]{2,20})' +DNS_NAME = r'(?:[a-zA-Z0-9\-\.]+\.' + TLD + ')' +#TODO: IPv6 - see https://www.debuggex.com/ +# A literal numeric IPv6 address may be given, but must be enclosed in [ ] e.g. 
[db8:0cec::99:123a] +NUMBER_0_255 = r'(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])' +IPv4 = r'(?:' + NUMBER_0_255 + r'\.){3}' + NUMBER_0_255 +# IPv4 must come before the DNS name because it is more specific +SERVER = r'(?:' + IPv4 + '|' + DNS_NAME + ')' +PORT = r'(?:\:[0-9]{1,5})?' +SERVER_PORT = SERVER + PORT +URL_PATH = r'(?:/[a-zA-Z0-9\-\._\?\,\'/\\\+&%\$#\=~]*)?' # [^\.\,\)\(\s"] +URL_RE = SCHEME + r'\://' + SERVER_PORT + URL_PATH +re_url = re.compile(URL_RE) + + +# Patterns to be extracted (IP addresses, URLs, etc) +# From patterns.py in balbuzard +RE_PATTERNS = ( + ('URL', re.compile(URL_RE)), + ('IPv4 address', re.compile(IPv4)), + # TODO: add IPv6 + ('E-mail address', re.compile(r'(?i)\b[A-Z0-9._%+-]+@' + SERVER + '\b')), + # ('Domain name', re.compile(r'(?=^.{1,254}$)(^(?:(?!\d+\.|-)[a-zA-Z0-9_\-]{1,63}(? char +vba_chr = Suppress( + Combine(WordStart(vba_identifier_chars) + CaselessLiteral('Chr') + + Optional(CaselessLiteral('B') | CaselessLiteral('W')) + Optional('$')) + + '(') + vba_expr_int + Suppress(')') + +def vba_chr_tostr(t): + try: + i = t[0] + # normal, non-unicode character: + if i>=0 and i<=255: + return VbaExpressionString(chr(i)) + else: + return VbaExpressionString(unichr(i).encode('utf-8', 'backslashreplace')) + except ValueError: + log.exception('ERROR: incorrect parameter value for chr(): %r' % i) + return VbaExpressionString('Chr(%r)' % i) + +vba_chr.setParseAction(vba_chr_tostr) + + +# --- ASC -------------------------------------------------------------------- + +# Asc(char) => int +#TODO: see MS-VBAL 6.1.2.11.1.1 page 240 => AscB, AscW +vba_asc = Suppress(CaselessKeyword('Asc') + '(') + vba_expr_str + Suppress(')') +vba_asc.setParseAction(lambda t: ord(t[0])) + + +# --- VAL -------------------------------------------------------------------- + +# Val(string) => int +# TODO: make sure the behavior of VBA's val is fully covered +vba_val = Suppress(CaselessKeyword('Val') + '(') + vba_expr_str + Suppress(')') 
+vba_val.setParseAction(lambda t: int(t[0].strip())) + + +# --- StrReverse() -------------------------------------------------------------------- + +# StrReverse(string) => string +strReverse = Suppress(CaselessKeyword('StrReverse') + '(') + vba_expr_str + Suppress(')') +strReverse.setParseAction(lambda t: VbaExpressionString(str(t[0])[::-1])) + + +# --- ENVIRON() -------------------------------------------------------------------- + +# Environ("name") => just translated to "%name%", that is enough for malware analysis +environ = Suppress(CaselessKeyword('Environ') + '(') + vba_expr_str + Suppress(')') +environ.setParseAction(lambda t: VbaExpressionString('%%%s%%' % t[0])) + + +# --- IDENTIFIER ------------------------------------------------------------- + +#TODO: see MS-VBAL 3.3.5 page 33 +# 3.3.5 Identifier Tokens +# Latin-identifier = first-Latin-identifier-character *subsequent-Latin-identifier-character +# first-Latin-identifier-character = (%x0041-005A / %x0061-007A) ; A-Z / a-z +# subsequent-Latin-identifier-character = first-Latin-identifier-character / DIGIT / %x5F ; underscore +latin_identifier = Word(initChars=alphas, bodyChars=alphanums + '_') + +# --- HEX FUNCTION ----------------------------------------------------------- + +# match any custom function name with a hex string as argument: +# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime + +# quoted string of at least two hexadecimal numbers of two digits: +quoted_hex_string = Suppress('"') + Combine(Word(hexnums, exact=2) * (2, None)) + Suppress('"') +quoted_hex_string.setParseAction(lambda t: str(t[0])) + +hex_function_call = Suppress(latin_identifier) + Suppress('(') + \ + quoted_hex_string('hex_string') + Suppress(')') +hex_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_hex(t.hex_string))) + + +# --- BASE64 FUNCTION ----------------------------------------------------------- + +# match any custom function name with a Base64 
string as argument: +# TODO: accept vba_expr_str_item as argument, check if it is a hex or base64 string at runtime + +# quoted string of at least two hexadecimal numbers of two digits: +quoted_base64_string = Suppress('"') + Regex(BASE64_RE) + Suppress('"') +quoted_base64_string.setParseAction(lambda t: str(t[0])) + +base64_function_call = Suppress(latin_identifier) + Suppress('(') + \ + quoted_base64_string('base64_string') + Suppress(')') +base64_function_call.setParseAction(lambda t: VbaExpressionString(binascii.a2b_base64(t.base64_string))) + + +# ---STRING EXPRESSION ------------------------------------------------------- + +def concat_strings_list(tokens): + """ + parse action to concatenate strings in a VBA expression with operators '+' or '&' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of strings such as [a,'&',b,'&',c,...] + strings = tokens[0][::2] + return VbaExpressionString(''.join(strings)) + + +vba_expr_str_item = (vba_chr | strReverse | environ | quoted_string | hex_function_call | base64_function_call) + +vba_expr_str <<= infixNotation(vba_expr_str_item, + [ + ("+", 2, opAssoc.LEFT, concat_strings_list), + ("&", 2, opAssoc.LEFT, concat_strings_list), + ]) + + +# --- INTEGER EXPRESSION ------------------------------------------------------- + +def sum_ints_list(tokens): + """ + parse action to sum integers in a VBA expression with operator '+' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] + integers = tokens[0][::2] + return sum(integers) + + +def subtract_ints_list(tokens): + """ + parse action to subtract integers in a VBA expression with operator '-' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] 
+ integers = tokens[0][::2] + return reduce(lambda x,y:x-y, integers) + + +def multiply_ints_list(tokens): + """ + parse action to multiply integers in a VBA expression with operator '*' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] + integers = tokens[0][::2] + return reduce(lambda x,y:x*y, integers) + + +def divide_ints_list(tokens): + """ + parse action to divide integers in a VBA expression with operator '/' + """ + # extract argument from the tokens: + # expected to be a tuple containing a list of integers such as [a,'&',b,'&',c,...] + integers = tokens[0][::2] + return reduce(lambda x,y:x/y, integers) + + +vba_expr_int_item = (vba_asc | vba_val | integer) + +# operators associativity: +# https://en.wikipedia.org/wiki/Operator_associativity + +vba_expr_int <<= infixNotation(vba_expr_int_item, + [ + ("*", 2, opAssoc.LEFT, multiply_ints_list), + ("/", 2, opAssoc.LEFT, divide_ints_list), + ("-", 2, opAssoc.LEFT, subtract_ints_list), + ("+", 2, opAssoc.LEFT, sum_ints_list), + ]) + + +# see detect_vba_strings for the deobfuscation code using this grammar + +# === MSO/ActiveMime files parsing =========================================== + +def is_mso_file(data): + """ + Check if the provided data is the content of a MSO/ActiveMime file, such as + the ones created by Outlook in some cases, or Word/Excel when saving a + file with the MHTML format or the Word 2003 XML format. + This function only checks the ActiveMime magic at the beginning of data. 
+    :param data: bytes string, MSO/ActiveMime file content
+    :return: bool, True if the file is MSO, False otherwise
+    """
+    return data.startswith(MSO_ACTIVEMIME_HEADER)
+
+
+# regex to find zlib block headers, starting with byte 0x78 = 'x'
+re_zlib_header = re.compile(r'x')
+
+
+def mso_file_extract(data):
+    """
+    Extract the data stored into a MSO/ActiveMime file, such as
+    the ones created by Outlook in some cases, or Word/Excel when saving a
+    file with the MHTML format or the Word 2003 XML format.
+
+    :param data: bytes string, MSO/ActiveMime file content
+    :return: bytes string, extracted data (uncompressed)
+
+    raise a MsoExtractionError if the data cannot be extracted
+    """
+    # check the magic:
+    assert is_mso_file(data)
+
+    # In all the samples seen so far, Word always uses an offset of 0x32,
+    # and Excel 0x22A. But we read the offset from the header to be more
+    # generic.
+    offsets = [0x32, 0x22A]
+
+    # First, attempt to get the compressed data offset from the header
+    # According to my tests, it should be an unsigned 16 bits integer,
+    # at offset 0x1E (little endian) + add 46:
+    try:
+        offset = struct.unpack_from('<H', data, offset=0x1E)[0] + 46
+        offsets.insert(0, offset)  # try the offset from the header first
+    except struct.error:
+        raise MsoExtractionError('Unable to parse MSO/ActiveMime file header')
+    for start in offsets:
+        try:
+            extracted_data = zlib.decompress(data[start:])
+            return extracted_data
+        except zlib.error as exc:
+            log.info('zlib decompression failed for offset %s (%s)' % (start, exc))
+    # None of the known offsets worked, look for potential zlib block headers:
+    for match in re_zlib_header.finditer(data):
+        start = match.start()
+        try:
+            extracted_data = zlib.decompress(data[start:])
+            return extracted_data
+        except zlib.error as exc:
+            log.info('zlib decompression failed for offset %s (%s)' % (start, exc))
+    raise MsoExtractionError('Unable to decompress data from a MSO/ActiveMime file')
+
+
+def copytoken_help(decompressed_current, decompressed_chunk_start):
+    """
+    compute bit masks to decompress a CopyToken according to
+    MS-OVBA 2.4.1.3.19.1 CopyToken Help
+
+    return length_mask, offset_mask, bit_count, maximum_length
+    """
+    difference = decompressed_current - decompressed_chunk_start
+    bit_count = int(math.ceil(math.log(difference, 2)))
+    bit_count = max([bit_count, 4])
+    length_mask = 0xFFFF >> bit_count
+    offset_mask = ~length_mask
+    maximum_length = (0xFFFF >> bit_count) + 3
+    return length_mask, offset_mask, bit_count, maximum_length
+
+
+def decompress_stream(compressed_container):
+    """
+    Decompress a stream according to MS-OVBA section 2.4.1
+
+    compressed_container: string compressed according to the MS-OVBA 2.4.1.3.6 Compression algorithm
+    return the decompressed container as a string (bytes)
+    """
+    # 2.4.1.2 State Variables
+
+    # The following state is maintained for the CompressedContainer (section 2.4.1.1.1):
+    # CompressedRecordEnd: The location of the byte after the last byte in the CompressedContainer (section 2.4.1.1.1).
+    # CompressedCurrent: The location of the next byte in the CompressedContainer (section 2.4.1.1.1) to be read by
+    #                    decompression or to be written by compression.
+
+    # The following state is maintained for the current CompressedChunk (section 2.4.1.1.4):
+    # CompressedChunkStart: The location of the first byte of the CompressedChunk (section 2.4.1.1.4) within the
+    #                       CompressedContainer (section 2.4.1.1.1).
+
+    # The following state is maintained for a DecompressedBuffer (section 2.4.1.1.2):
+    # DecompressedCurrent: The location of the next byte in the DecompressedBuffer (section 2.4.1.1.2) to be written by
+    #                      decompression or to be read by compression.
+    # DecompressedBufferEnd: The location of the byte after the last byte in the DecompressedBuffer (section 2.4.1.1.2).
+
+    # The following state is maintained for the current DecompressedChunk (section 2.4.1.1.3):
+    # DecompressedChunkStart: The location of the first byte of the DecompressedChunk (section 2.4.1.1.3) within the
+    #                         DecompressedBuffer (section 2.4.1.1.2).
+
+    decompressed_container = ''  # result
+    compressed_current = 0
+
+    sig_byte = ord(compressed_container[compressed_current])
+    if sig_byte != 0x01:
+        raise ValueError('invalid signature byte {0:02X}'.format(sig_byte))
+
+    compressed_current += 1
+
+    #NOTE: the definition of CompressedRecordEnd is ambiguous.
Here we assume that
+    # CompressedRecordEnd = len(compressed_container)
+    while compressed_current < len(compressed_container):
+        # 2.4.1.1.5
+        compressed_chunk_start = compressed_current
+        # chunk header = first 16 bits
+        compressed_chunk_header = \
+            struct.unpack("<H", compressed_container[compressed_chunk_start:compressed_chunk_start + 2])[0]
+        # chunk size = 12 first bits of header + 3
+        chunk_size = (compressed_chunk_header & 0x0FFF) + 3
+        # chunk signature = 3 next bits - should always be 0b011
+        chunk_signature = (compressed_chunk_header >> 12) & 0x07
+        if chunk_signature != 0b011:
+            raise ValueError('Invalid CompressedChunkSignature in VBA compressed stream')
+        # chunk flag = next bit - 1 == compressed, 0 == uncompressed
+        chunk_flag = (compressed_chunk_header >> 15) & 0x01
+        log.debug("chunk size = {0}, compressed flag = {1}".format(chunk_size, chunk_flag))
+
+        #MS-OVBA 2.4.1.3.12: the maximum size of a chunk including its header is 4098 bytes (header 2 + data 4096)
+        # The minimum size is 3 bytes
+        # NOTE: there seems to be a typo in MS-OVBA, the check should be with 4098, not 4095 (which is the max value
+        # in chunk header before adding 3).
+        # Also the first test is not useful since a 12 bits value cannot be larger than 4095.
+        if chunk_flag == 1 and chunk_size > 4098:
+            raise ValueError('CompressedChunkSize > 4098 but CompressedChunkFlag == 1')
+        if chunk_flag == 0 and chunk_size != 4098:
+            raise ValueError('CompressedChunkSize != 4098 but CompressedChunkFlag == 0')
+
+        # check if chunk_size goes beyond the compressed data, instead of silently cutting it:
+        #TODO: raise an exception?
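The CopyToken decoding used below depends on bit masks that shift as the decompressed chunk grows (MS-OVBA 2.4.1.3.19.1 CopyToken Help): the further into the chunk, the more bits the offset field needs, leaving fewer for the length. A self-contained sketch of that mask computation (`copytoken_masks` is a hypothetical name; the 0xFFFF masking of the offset is added here to keep the value an unsigned 16-bit integer):

```python
import math

def copytoken_masks(difference):
    """Compute CopyToken masks per MS-OVBA 2.4.1.3.19.1, where
    `difference` is DecompressedCurrent - DecompressedChunkStart."""
    bit_count = max(int(math.ceil(math.log(difference, 2))), 4)
    length_mask = 0xFFFF >> bit_count       # low bits hold the copy length
    offset_mask = (~length_mask) & 0xFFFF   # high bits hold the back-offset
    maximum_length = (0xFFFF >> bit_count) + 3
    return length_mask, offset_mask, bit_count, maximum_length

# early in a chunk (16 bytes decompressed): offset uses only 4 bits
print(copytoken_masks(16))    # (4095, 61440, 4, 4098)
```

Splitting the 16-bit token this way lets short back-references near the start of a chunk spend more bits on length, which is why the masks must be recomputed for every CopyToken.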
+        if compressed_chunk_start + chunk_size > len(compressed_container):
+            log.warning('Chunk size is larger than remaining compressed data')
+        compressed_end = min([len(compressed_container), compressed_chunk_start + chunk_size])
+        # read after chunk header:
+        compressed_current = compressed_chunk_start + 2
+
+        if chunk_flag == 0:
+            # MS-OVBA 2.4.1.3.3 Decompressing a RawChunk
+            # uncompressed chunk: read the next 4096 bytes as-is
+            #TODO: check if there are at least 4096 bytes left
+            decompressed_container += compressed_container[compressed_current:compressed_current + 4096]
+            compressed_current += 4096
+        else:
+            # MS-OVBA 2.4.1.3.2 Decompressing a CompressedChunk
+            # compressed chunk
+            decompressed_chunk_start = len(decompressed_container)
+            while compressed_current < compressed_end:
+                # MS-OVBA 2.4.1.3.4 Decompressing a TokenSequence
+                # log.debug('compressed_current = %d / compressed_end = %d' % (compressed_current, compressed_end))
+                # FlagByte: 8 bits indicating if the following 8 tokens are either literal (1 byte of plain text) or
+                # copy tokens (reference to a previous literal token)
+                flag_byte = ord(compressed_container[compressed_current])
+                compressed_current += 1
+                for bit_index in xrange(0, 8):
+                    # log.debug('bit_index=%d / compressed_current=%d / compressed_end=%d' % (bit_index, compressed_current, compressed_end))
+                    if compressed_current >= compressed_end:
+                        break
+                    # MS-OVBA 2.4.1.3.5 Decompressing a Token
+                    # MS-OVBA 2.4.1.3.17 Extract FlagBit
+                    flag_bit = (flag_byte >> bit_index) & 1
+                    #log.debug('bit_index=%d: flag_bit=%d' % (bit_index, flag_bit))
+                    if flag_bit == 0:  # LiteralToken
+                        # copy one byte directly to output
+                        decompressed_container += compressed_container[compressed_current]
+                        compressed_current += 1
+                    else:  # CopyToken
+                        # MS-OVBA 2.4.1.3.19.2 Unpack CopyToken
+                        copy_token = \
+                            struct.unpack("<H", compressed_container[compressed_current:compressed_current + 2])[0]
+                        length_mask, offset_mask, bit_count, _ = copytoken_help(
+                            len(decompressed_container), decompressed_chunk_start)
+                        length = (copy_token & length_mask) + 3
+                        temp1 = copy_token & offset_mask
+                        temp2 = 16 - bit_count
+                        offset = (temp1 >> temp2) + 1
+                        #log.debug('offset=%d length=%d' % (offset, length))
+                        copy_source = len(decompressed_container) - offset
+                        for index in
xrange(copy_source, copy_source + length): + decompressed_container += decompressed_container[index] + compressed_current += 2 + return decompressed_container + + +def _extract_vba(ole, vba_root, project_path, dir_path, relaxed=False): + """ + Extract VBA macros from an OleFileIO object. + Internal function, do not call directly. + + vba_root: path to the VBA root storage, containing the VBA storage and the PROJECT stream + vba_project: path to the PROJECT stream + :param relaxed: If True, only create info/debug log entry if data is not as expected + (e.g. opening substream fails); if False, raise an error in this case + This is a generator, yielding (stream path, VBA filename, VBA source code) for each VBA code stream + """ + # Open the PROJECT stream: + project = ole.openstream(project_path) + log.debug('relaxed is %s' % relaxed) + + # sample content of the PROJECT stream: + + ## ID="{5312AC8A-349D-4950-BDD0-49BE3C4DD0F0}" + ## Document=ThisDocument/&H00000000 + ## Module=NewMacros + ## Name="Project" + ## HelpContextID="0" + ## VersionCompatible32="393222000" + ## CMG="F1F301E705E705E705E705" + ## DPB="8F8D7FE3831F2020202020" + ## GC="2D2FDD81E51EE61EE6E1" + ## + ## [Host Extender Info] + ## &H00000001={3832D640-CF90-11CF-8E43-00A0C911005A};VBE;&H00000000 + ## &H00000002={000209F2-0000-0000-C000-000000000046};Word8.0;&H00000000 + ## + ## [Workspace] + ## ThisDocument=22, 29, 339, 477, Z + ## NewMacros=-4, 42, 832, 510, C + + code_modules = {} + + for line in project: + line = line.strip() + if '=' in line: + # split line at the 1st equal sign: + name, value = line.split('=', 1) + # looking for code modules + # add the code module as a key in the dictionary + # the value will be the extension needed later + # The value is converted to lowercase, to allow case-insensitive matching (issue #3) + value = value.lower() + if name == 'Document': + # split value at the 1st slash, keep 1st part: + value = value.split('/', 1)[0] + code_modules[value] = CLASS_EXTENSION + 
elif name == 'Module': + code_modules[value] = MODULE_EXTENSION + elif name == 'Class': + code_modules[value] = CLASS_EXTENSION + elif name == 'BaseClass': + code_modules[value] = FORM_EXTENSION + + # read data from dir stream (compressed) + dir_compressed = ole.openstream(dir_path).read() + + def check_value(name, expected, value): + if expected != value: + if relaxed: + log.error("invalid value for {0} expected {1:04X} got {2:04X}" + .format(name, expected, value)) + else: + raise UnexpectedDataError(dir_path, name, expected, value) + + dir_stream = cStringIO.StringIO(decompress_stream(dir_compressed)) + + # PROJECTSYSKIND Record + projectsyskind_id = struct.unpack(" 128: + log.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(projectname_sizeof_projectname)) + projectname_projectname = dir_stream.read(projectname_sizeof_projectname) + unused = projectname_projectname + + # PROJECTDOCSTRING Record + projectdocstring_id = struct.unpack(" 2000: + log.error( + "PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(projectdocstring_sizeof_docstring)) + projectdocstring_docstring = dir_stream.read(projectdocstring_sizeof_docstring) + projectdocstring_reserved = struct.unpack(" 260: + log.error( + "PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(projecthelpfilepath_sizeof_helpfile1)) + projecthelpfilepath_helpfile1 = dir_stream.read(projecthelpfilepath_sizeof_helpfile1) + projecthelpfilepath_reserved = struct.unpack(" 1015: + log.error( + "PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(projectconstants_sizeof_constants)) + projectconstants_constants = dir_stream.read(projectconstants_sizeof_constants) + projectconstants_reserved = struct.unpack(" 0: + code_data = decompress_stream(code_data) + # case-insensitive search in the code_modules dict to find the file extension: + filext = code_modules.get(modulename_modulename.lower(), 'bin') + filename = '{0}.{1}'.format(modulename_modulename, filext) + 
#TODO: also yield the codepage so that callers can decode it properly
+                yield (code_path, filename, code_data)
+                # print '-'*79
+                # print filename
+                # print ''
+                # print code_data
+                # print ''
+                log.debug('extracted file {0}'.format(filename))
+            else:
+                log.warning("module stream {0} has code data length 0".format(modulestreamname_streamname))
+        except (UnexpectedDataError, SubstreamOpenError):
+            raise
+        except Exception as exc:
+            log.info('Error parsing module {0} of {1} in _extract_vba:'
+                     .format(projectmodule_index, projectmodules_count),
+                     exc_info=True)
+            if not relaxed:
+                raise
+    _ = unused  # make pylint happy: now variable "unused" is being used ;-)
+    return
+
+
+def vba_collapse_long_lines(vba_code):
+    """
+    Parse a VBA module code to detect continuation line characters (underscore) and
+    collapse split lines. Continuation line characters are replaced by spaces.
+
+    :param vba_code: str, VBA module code
+    :return: str, VBA module code with long lines collapsed
+    """
+    # TODO: use a regex instead, to allow whitespaces after the underscore?
+    vba_code = vba_code.replace(' _\r\n', ' ')
+    vba_code = vba_code.replace(' _\r', ' ')
+    vba_code = vba_code.replace(' _\n', ' ')
+    return vba_code
+
+
+def filter_vba(vba_code):
+    """
+    Filter VBA source code to remove the first lines starting with "Attribute VB_",
+    which are automatically added by MS Office and not displayed in the VBA Editor.
+    This should only be used when displaying source code for human analysis.
+
+    Note: lines are not filtered if they contain a colon, because it could be
+    used to hide malicious instructions.
+
+    :param vba_code: str, VBA source code
+    :return: str, filtered VBA source code
+    """
+    vba_lines = vba_code.splitlines()
+    start = 0
+    for line in vba_lines:
+        if line.startswith("Attribute VB_") and ':' not in line:
+            start += 1
+        else:
+            break
+    #TODO: also remove empty lines?
+ vba = '\n'.join(vba_lines[start:]) + return vba + + +def detect_autoexec(vba_code, obfuscation=None): + """ + Detect if the VBA code contains keywords corresponding to macros running + automatically when triggered by specific actions (e.g. when a document is + opened or closed). + + :param vba_code: str, VBA source code + :param obfuscation: None or str, name of obfuscation to be added to description + :return: list of str tuples (keyword, description) + """ + #TODO: merge code with detect_suspicious + # case-insensitive search + #vba_code = vba_code.lower() + results = [] + obf_text = '' + if obfuscation: + obf_text = ' (obfuscation: %s)' % obfuscation + for description, keywords in AUTOEXEC_KEYWORDS.items(): + for keyword in keywords: + #TODO: if keyword is already a compiled regex, use it as-is + # search using regex to detect word boundaries: + match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) + if match: + #if keyword.lower() in vba_code: + found_keyword = match.group() + results.append((found_keyword, description + obf_text)) + return results + + +def detect_suspicious(vba_code, obfuscation=None): + """ + Detect if the VBA code contains suspicious keywords corresponding to + potential malware behaviour. 
+ + :param vba_code: str, VBA source code + :param obfuscation: None or str, name of obfuscation to be added to description + :return: list of str tuples (keyword, description) + """ + # case-insensitive search + #vba_code = vba_code.lower() + results = [] + obf_text = '' + if obfuscation: + obf_text = ' (obfuscation: %s)' % obfuscation + for description, keywords in SUSPICIOUS_KEYWORDS.items(): + for keyword in keywords: + # search using regex to detect word boundaries: + match = re.search(r'(?i)\b' + keyword + r'\b', vba_code) + if match: + #if keyword.lower() in vba_code: + found_keyword = match.group() + results.append((found_keyword, description + obf_text)) + return results + + +def detect_patterns(vba_code, obfuscation=None): + """ + Detect if the VBA code contains specific patterns such as IP addresses, + URLs, e-mail addresses, executable file names, etc. + + :param vba_code: str, VBA source code + :return: list of str tuples (pattern type, value) + """ + results = [] + found = set() + obf_text = '' + if obfuscation: + obf_text = ' (obfuscation: %s)' % obfuscation + for pattern_type, pattern_re in RE_PATTERNS: + for match in pattern_re.finditer(vba_code): + value = match.group() + if value not in found: + results.append((pattern_type + obf_text, value)) + found.add(value) + return results + + +def detect_hex_strings(vba_code): + """ + Detect if the VBA code contains strings encoded in hexadecimal. + + :param vba_code: str, VBA source code + :return: list of str tuples (encoded string, decoded string) + """ + results = [] + found = set() + for match in re_hex_string.finditer(vba_code): + value = match.group() + if value not in found: + decoded = binascii.unhexlify(value) + results.append((value, decoded)) + found.add(value) + return results + + +def detect_base64_strings(vba_code): + """ + Detect if the VBA code contains strings encoded in base64. 
+ + :param vba_code: str, VBA source code + :return: list of str tuples (encoded string, decoded string) + """ + #TODO: avoid matching simple hex strings as base64? + results = [] + found = set() + for match in re_base64_string.finditer(vba_code): + # extract the base64 string without quotes: + value = match.group().strip('"') + # check it is not just a hex string: + if not re_nothex_check.search(value): + continue + # only keep new values and not in the whitelist: + if value not in found and value.lower() not in BASE64_WHITELIST: + try: + decoded = base64.b64decode(value) + results.append((value, decoded)) + found.add(value) + except (TypeError, ValueError) as exc: + log.debug('Failed to base64-decode (%s)' % exc) + # if an exception occurs, it is likely not a base64-encoded string + return results + + +def detect_dridex_strings(vba_code): + """ + Detect if the VBA code contains strings obfuscated with a specific algorithm found in Dridex samples. + + :param vba_code: str, VBA source code + :return: list of str tuples (encoded string, decoded string) + """ + from thirdparty.DridexUrlDecoder.DridexUrlDecoder import DridexUrlDecode + + results = [] + found = set() + for match in re_dridex_string.finditer(vba_code): + value = match.group()[1:-1] + # check it is not just a hex string: + if not re_nothex_check.search(value): + continue + if value not in found: + try: + decoded = DridexUrlDecode(value) + results.append((value, decoded)) + found.add(value) + except Exception as exc: + log.debug('Failed to Dridex-decode (%s)' % exc) + # if an exception occurs, it is likely not a dridex-encoded string + return results + + +def detect_vba_strings(vba_code): + """ + Detect if the VBA code contains strings obfuscated with VBA expressions + using keywords such as Chr, Asc, Val, StrReverse, etc. 
+ + :param vba_code: str, VBA source code + :return: list of str tuples (encoded string, decoded string) + """ + # TODO: handle exceptions + results = [] + found = set() + # IMPORTANT: to extract the actual VBA expressions found in the code, + # we must expand tabs to have the same string as pyparsing. + # Otherwise, start and end offsets are incorrect. + vba_code = vba_code.expandtabs() + for tokens, start, end in vba_expr_str.scanString(vba_code): + encoded = vba_code[start:end] + decoded = tokens[0] + if isinstance(decoded, VbaExpressionString): + # This is a VBA expression, not a simple string + # print 'VBA EXPRESSION: encoded=%r => decoded=%r' % (encoded, decoded) + # remove parentheses and quotes from original string: + # if encoded.startswith('(') and encoded.endswith(')'): + # encoded = encoded[1:-1] + # if encoded.startswith('"') and encoded.endswith('"'): + # encoded = encoded[1:-1] + # avoid duplicates and simple strings: + if encoded not in found and decoded != encoded: + results.append((encoded, decoded)) + found.add(encoded) + # else: + # print 'VBA STRING: encoded=%r => decoded=%r' % (encoded, decoded) + return results + + +def json2ascii(json_obj, encoding='utf8', errors='replace'): + """ ensure there is no unicode in json and all strings are safe to decode + + works recursively, decodes and re-encodes every string to/from unicode + to ensure there will be no trouble in loading the dumped json output + """ + if json_obj is None: + pass + elif isinstance(json_obj, (bool, int, float)): + pass + elif isinstance(json_obj, str): + # de-code and re-encode + dencoded = json_obj.decode(encoding, errors).encode(encoding, errors) + if dencoded != json_obj: + log.debug('json2ascii: replaced: {0} (len {1})' + .format(json_obj, len(json_obj))) + log.debug('json2ascii: with: {0} (len {1})' + .format(dencoded, len(dencoded))) + return dencoded + elif isinstance(json_obj, unicode): + log.debug('json2ascii: encode unicode: {0}' + .format(json_obj.encode(encoding, 
errors))) + # cannot put original into logger + # print 'original: ' json_obj + return json_obj.encode(encoding, errors) + elif isinstance(json_obj, dict): + for key in json_obj: + json_obj[key] = json2ascii(json_obj[key]) + elif isinstance(json_obj, (list,tuple)): + for item in json_obj: + item = json2ascii(item) + else: + log.debug('unexpected type in json2ascii: {0} -- leave as is' + .format(type(json_obj))) + return json_obj + + +_have_printed_json_start = False + +def print_json(json_dict=None, _json_is_last=False, **json_parts): + """ line-wise print of json.dumps(json2ascii(..)) with options and indent+1 + + can use in two ways: + (1) print_json(some_dict) + (2) print_json(key1=value1, key2=value2, ...) + + :param bool _json_is_last: set to True only for very last entry to complete + the top-level json-list + """ + global _have_printed_json_start + + if json_dict and json_parts: + raise ValueError('Invalid json argument: want either single dict or ' + 'key=value parts but got both)') + elif (json_dict is not None) and (not isinstance(json_dict, dict)): + raise ValueError('Invalid json argument: want either single dict or ' + 'key=value parts but got {0} instead of dict)' + .format(type(json_dict))) + if json_parts: + json_dict = json_parts + + if not _have_printed_json_start: + print('[') + _have_printed_json_start = True + + lines = json.dumps(json2ascii(json_dict), check_circular=False, + indent=4, ensure_ascii=False).splitlines() + for line in lines[:-1]: + print(' {0}'.format(line)) + if _json_is_last: + print(' {0}'.format(lines[-1])) # print last line without comma + print(']') + else: + print(' {0},'.format(lines[-1])) # print last line with comma + + +class VBA_Scanner(object): + """ + Class to scan the source code of a VBA module to find obfuscated strings, + suspicious keywords, IOCs, auto-executable macros, etc. 
+ """ + + def __init__(self, vba_code): + """ + VBA_Scanner constructor + + :param vba_code: str, VBA source code to be analyzed + """ + # join long lines ending with " _": + self.code = vba_collapse_long_lines(vba_code) + self.code_hex = '' + self.code_hex_rev = '' + self.code_rev_hex = '' + self.code_base64 = '' + self.code_dridex = '' + self.code_vba = '' + self.strReverse = None + # results = None before scanning, then a list of tuples after scanning + self.results = None + self.autoexec_keywords = None + self.suspicious_keywords = None + self.iocs = None + self.hex_strings = None + self.base64_strings = None + self.dridex_strings = None + self.vba_strings = None + + + def scan(self, include_decoded_strings=False, deobfuscate=False): + """ + Analyze the provided VBA code to detect suspicious keywords, + auto-executable macros, IOC patterns, obfuscation patterns + such as hex-encoded strings. + + :param include_decoded_strings: bool, if True, all encoded strings will be included with their decoded content. 
+ :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + :return: list of tuples (type, keyword, description) + (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') + """ + # First, detect and extract hex-encoded strings: + self.hex_strings = detect_hex_strings(self.code) + # detect if the code contains StrReverse: + self.strReverse = False + if 'strreverse' in self.code.lower(): self.strReverse = True + # Then append the decoded strings to the VBA code, to detect obfuscated IOCs and keywords: + for encoded, decoded in self.hex_strings: + self.code_hex += '\n' + decoded + # if the code contains "StrReverse", also append the hex strings in reverse order: + if self.strReverse: + # StrReverse after hex decoding: + self.code_hex_rev += '\n' + decoded[::-1] + # StrReverse before hex decoding: + self.code_rev_hex += '\n' + binascii.unhexlify(encoded[::-1]) + #example: https://malwr.com/analysis/NmFlMGI4YTY1YzYyNDkwNTg1ZTBiZmY5OGI3YjlhYzU/ + #TODO: also append the full code reversed if StrReverse? (risk of false positives?) 
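Since StrReverse can be applied by malware either before or after hex encoding, the scanner tries both orders: reversing the decoded bytes, and reversing the hex string itself before decoding. A standalone sketch of the two transformations (the sample strings are invented for illustration):

```python
import binascii

encoded = '636d64'                      # hex for "cmd"
decoded = binascii.unhexlify(encoded)   # -> b'cmd'

# StrReverse applied AFTER hex decoding: reverse the decoded bytes
rev_after = decoded[::-1]               # -> b'dmc'

# StrReverse applied BEFORE hex decoding: the document would contain
# the reversed hex string '46d636'; undo it by reversing, then decoding
rev_before = binascii.unhexlify('46d636'[::-1])  # -> b'cmd'

print(rev_after, rev_before)
```

Appending both variants to the text that gets scanned means keyword and IOC detection still fires even when the sample hides strings behind either ordering.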
+ # Detect Base64-encoded strings + self.base64_strings = detect_base64_strings(self.code) + for encoded, decoded in self.base64_strings: + self.code_base64 += '\n' + decoded + # Detect Dridex-encoded strings + self.dridex_strings = detect_dridex_strings(self.code) + for encoded, decoded in self.dridex_strings: + self.code_dridex += '\n' + decoded + # Detect obfuscated strings in VBA expressions + if deobfuscate: + self.vba_strings = detect_vba_strings(self.code) + else: + self.vba_strings = [] + for encoded, decoded in self.vba_strings: + self.code_vba += '\n' + decoded + results = [] + self.autoexec_keywords = [] + self.suspicious_keywords = [] + self.iocs = [] + + for code, obfuscation in ( + (self.code, None), + (self.code_hex, 'Hex'), + (self.code_hex_rev, 'Hex+StrReverse'), + (self.code_rev_hex, 'StrReverse+Hex'), + (self.code_base64, 'Base64'), + (self.code_dridex, 'Dridex'), + (self.code_vba, 'VBA expression'), + ): + self.autoexec_keywords += detect_autoexec(code, obfuscation) + self.suspicious_keywords += detect_suspicious(code, obfuscation) + self.iocs += detect_patterns(code, obfuscation) + + # If hex-encoded strings were discovered, add an item to suspicious keywords: + if self.hex_strings: + self.suspicious_keywords.append(('Hex Strings', + 'Hex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) + if self.base64_strings: + self.suspicious_keywords.append(('Base64 Strings', + 'Base64-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) + if self.dridex_strings: + self.suspicious_keywords.append(('Dridex Strings', + 'Dridex-encoded strings were detected, may be used to obfuscate strings (option --decode to see all)')) + if self.vba_strings: + self.suspicious_keywords.append(('VBA obfuscated Strings', + 'VBA string expressions were detected, may be used to obfuscate strings (option --decode to see all)')) + # use a set to avoid duplicate keywords + keyword_set = set() 
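Because the same keyword can be reported by several deobfuscation layers (plain, hex, Base64, ...), each category is de-duplicated with a set before being emitted as (type, keyword, description) tuples. A minimal sketch of that pattern (`dedupe_results` and the sample data are invented for illustration):

```python
def dedupe_results(category, hits):
    """Keep only the first (keyword, description) pair per keyword,
    mirroring the set-based filtering used when building scan results."""
    seen = set()
    results = []
    for keyword, description in hits:
        if keyword not in seen:
            results.append((category, keyword, description))
            seen.add(keyword)
    return results

hits = [('Shell', 'May run an executable'),
        ('Shell', 'May run an executable (obfuscation: Hex)'),
        ('CreateObject', 'May create an OLE object')]
print(dedupe_results('Suspicious', hits))
```

Keying the set on the keyword alone (not the description) is what collapses the per-obfuscation duplicates into a single report line.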
+ for keyword, description in self.autoexec_keywords: + if keyword not in keyword_set: + results.append(('AutoExec', keyword, description)) + keyword_set.add(keyword) + keyword_set = set() + for keyword, description in self.suspicious_keywords: + if keyword not in keyword_set: + results.append(('Suspicious', keyword, description)) + keyword_set.add(keyword) + keyword_set = set() + for pattern_type, value in self.iocs: + if value not in keyword_set: + results.append(('IOC', value, pattern_type)) + keyword_set.add(value) + + # include decoded strings only if they are printable or if --decode option: + for encoded, decoded in self.hex_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('Hex String', decoded, encoded)) + for encoded, decoded in self.base64_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('Base64 String', decoded, encoded)) + for encoded, decoded in self.dridex_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('Dridex string', decoded, encoded)) + for encoded, decoded in self.vba_strings: + if include_decoded_strings or is_printable(decoded): + results.append(('VBA string', decoded, encoded)) + self.results = results + return results + + def scan_summary(self): + """ + Analyze the provided VBA code to detect suspicious keywords, + auto-executable macros, IOC patterns, obfuscation patterns + such as hex-encoded strings. 
+ + :return: tuple with the number of items found for each category: + (autoexec, suspicious, IOCs, hex, base64, dridex, vba) + """ + # avoid scanning the same code twice: + if self.results is None: + self.scan() + return (len(self.autoexec_keywords), len(self.suspicious_keywords), + len(self.iocs), len(self.hex_strings), len(self.base64_strings), + len(self.dridex_strings), len(self.vba_strings)) + + +def scan_vba(vba_code, include_decoded_strings, deobfuscate=False): + """ + Analyze the provided VBA code to detect suspicious keywords, + auto-executable macros, IOC patterns, obfuscation patterns + such as hex-encoded strings. + (shortcut for VBA_Scanner(vba_code).scan()) + + :param vba_code: str, VBA source code to be analyzed + :param include_decoded_strings: bool, if True all encoded strings will be included with their decoded content. + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + :return: list of tuples (type, keyword, description) + (type = 'AutoExec', 'Suspicious', 'IOC', 'Hex String', 'Base64 String' or 'Dridex String') + """ + return VBA_Scanner(vba_code).scan(include_decoded_strings, deobfuscate) + + +#=== CLASSES ================================================================= + +class VBA_Parser(object): + """ + Class to parse MS Office files, to detect VBA macros and extract VBA source code + Supported file formats: + - Word 97-2003 (.doc, .dot) + - Word 2007+ (.docm, .dotm) + - Word 2003 XML (.xml) + - Word MHT - Single File Web Page / MHTML (.mht) + - Excel 97-2003 (.xls) + - Excel 2007+ (.xlsm, .xlsb) + - PowerPoint 97-2003 (.ppt) + - PowerPoint 2007+ (.pptm, .ppsm) + """ + + def __init__(self, filename, data=None, container=None, relaxed=False): + """ + Constructor for VBA_Parser + + :param filename: filename or path of file to parse, or file-like object + + :param data: None or bytes str, if None the file will be read from disk (or from the file-like object). 
+        If data is provided as a bytes string, it will be parsed as the content of the file in memory,
+        and not read from disk. Note: files must be read in binary mode, i.e. open(f, 'rb').
+
+        :param container: str, path and filename of container if the file is within
+        a zip archive, None otherwise.
+
+        :param relaxed: if True, treat mal-formed documents and missing streams more like MS office:
+        do nothing; if False (default), raise errors in these cases
+
+        raises a FileOpenError if all attempts to interpret the data header failed
+        """
+        #TODO: filename should only be a string, data should be used for the file-like object
+        #TODO: filename should be mandatory, optional data is a string or file-like object
+        #TODO: also support olefile and zipfile as input
+        if data is None:
+            # open file from disk:
+            _file = filename
+        else:
+            # file already read in memory, make it a file-like object for zipfile:
+            _file = cStringIO.StringIO(data)
+        #self.file = _file
+        self.ole_file = None
+        self.ole_subfiles = []
+        self.filename = filename
+        self.container = container
+        self.relaxed = relaxed
+        self.type = None
+        self.vba_projects = None
+        self.vba_forms = None
+        self.contains_macros = None  # will be set to True or False by detect_macros
+        self.vba_code_all_modules = None  # to store the source code of all modules
+        # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code)
+        self.modules = None
+        # Analysis results: list of tuples (type, keyword, description) - See VBA_Scanner
+        self.analysis_results = None
+        # statistics for the scan summary and flags
+        self.nb_macros = 0
+        self.nb_autoexec = 0
+        self.nb_suspicious = 0
+        self.nb_iocs = 0
+        self.nb_hexstrings = 0
+        self.nb_base64strings = 0
+        self.nb_dridexstrings = 0
+        self.nb_vbastrings = 0
+
+        # if filename is None:
+        #     if isinstance(_file, basestring):
+        #         if len(_file) < olefile.MINIMAL_OLEFILE_SIZE:
+        #             self.filename = _file
+        #         else:
+        #             self.filename = ''
+        #     else:
+        #         self.filename = ''
+        if olefile.isOleFile(_file):
+            # This looks like an OLE file
+            self.open_ole(_file)
+
+            # if this worked, try whether it is a ppt file (special ole file)
+            self.open_ppt()
+        if self.type is None and is_zipfile(_file):
+            # Zip file, which may be an OpenXML document
+            self.open_openxml(_file)
+        if self.type is None:
+            # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
+            # or a plain text file containing VBA code
+            if data is None:
+                data = open(filename, 'rb').read()
+            # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
+            if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
+                self.open_word2003xml(data)
+            # store a lowercase version for the next tests:
+            data_lowercase = data.lower()
+            # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
+            # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
+            # BUT Word accepts a blank line or other MIME headers inserted before,
+            # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
+            # And the line is case insensitive.
+            # so we'll just check the presence of mime, version and multipart anywhere:
+            if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase \
+                and 'multipart' in data_lowercase:
+                self.open_mht(data)
+            #TODO: handle exceptions
+            #TODO: Excel 2003 XML
+            # Check if this is a plain text VBA or VBScript file:
+            # To avoid scanning binary files, we simply check for some control chars:
+            if self.type is None and '\x00' not in data:
+                self.open_text(data)
+        if self.type is None:
+            # At this stage, could not match a known format:
+            msg = '%s is not a supported file type, cannot extract VBA Macros.' % self.filename
+            log.info(msg)
+            raise FileOpenError(msg)
+
+    def open_ole(self, _file):
+        """
+        Open an OLE file
+        :param _file: filename or file contents in a file object
+        :return: nothing
+        """
+        log.info('Opening OLE file %s' % self.filename)
+        try:
+            # Open and parse the OLE file, using unicode for path names:
+            self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
+            # set type only if parsing succeeds
+            self.type = TYPE_OLE
+        except (IOError, TypeError, ValueError) as exc:
+            # TODO: handle OLE parsing exceptions
+            log.info('Failed OLE parsing for file %r (%s)' % (self.filename, exc))
+            log.debug('Trace:', exc_info=True)
+
+
+    def open_openxml(self, _file):
+        """
+        Open an OpenXML file
+        :param _file: filename or file contents in a file object
+        :return: nothing
+        """
+        # This looks like a zip file, need to look for vbaProject.bin inside
+        # It can be any OLE file inside the archive
+        #...because vbaProject.bin can be renamed:
+        # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
+        log.info('Opening ZIP/OpenXML file %s' % self.filename)
+        try:
+            z = zipfile.ZipFile(_file)
+            #TODO: check if this is actually an OpenXML file
+            #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
+            # check each file within the zip if it is an OLE file, by reading its magic:
+            for subfile in z.namelist():
+                magic = z.open(subfile).read(len(olefile.MAGIC))
+                if magic == olefile.MAGIC:
+                    log.debug('Opening OLE file %s within zip' % subfile)
+                    ole_data = z.open(subfile).read()
+                    try:
+                        self.ole_subfiles.append(
+                            VBA_Parser(filename=subfile, data=ole_data,
+                                       relaxed=self.relaxed))
+                    except OlevbaBaseException as exc:
+                        if self.relaxed:
+                            log.info('%s is not a valid OLE file (%s)' % (subfile, exc))
+                            log.debug('Trace:', exc_info=True)
+                            continue
+                        else:
+                            raise SubstreamOpenError(self.filename, subfile,
+                                                     exc)
+            z.close()
+            # set type only if parsing succeeds
+            self.type = TYPE_OpenXML
+ except OlevbaBaseException as exc: + if self.relaxed: + log.info('Error {0} caught in Zip/OpenXML parsing for file {1}' + .format(exc, self.filename)) + log.debug('Trace:', exc_info=True) + else: + raise + except (RuntimeError, zipfile.BadZipfile, zipfile.LargeZipFile, IOError) as exc: + # TODO: handle parsing exceptions + log.info('Failed Zip/OpenXML parsing for file %r (%s)' + % (self.filename, exc)) + log.debug('Trace:', exc_info=True) + + def open_word2003xml(self, data): + """ + Open a Word 2003 XML file + :param data: file contents in a string or bytes + :return: nothing + """ + log.info('Opening Word 2003 XML file %s' % self.filename) + try: + # parse the XML content + # TODO: handle XML parsing exceptions + et = ET.fromstring(data) + # find all the binData elements: + for bindata in et.getiterator(TAG_BINDATA): + # the binData content is an OLE container for the VBA project, compressed + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. + # get the filename: + fname = bindata.get(ATTR_NAME, 'noname.mso') + # decode the base64 activemime + mso_data = binascii.a2b_base64(bindata.text) + if is_mso_file(mso_data): + # decompress the zlib data stored in the MSO file, which is the OLE container: + # TODO: handle different offsets => separate function + try: + ole_data = mso_file_extract(mso_data) + self.ole_subfiles.append( + VBA_Parser(filename=fname, data=ole_data, + relaxed=self.relaxed)) + except OlevbaBaseException as exc: + if self.relaxed: + log.info('Error parsing subfile {0}: {1}' + .format(fname, exc)) + log.debug('Trace:', exc_info=True) + else: + raise SubstreamOpenError(self.filename, fname, exc) + else: + log.info('%s is not a valid MSO file' % fname) + # set type only if parsing succeeds + self.type = TYPE_Word2003_XML + except OlevbaBaseException as exc: + if self.relaxed: + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) + log.debug('Trace:', exc_info=True) + else: + raise + except Exception as exc: 
+ # TODO: differentiate exceptions for each parsing stage + # (but ET is different libs, no good exception description in API) + # found: XMLSyntaxError + log.info('Failed XML parsing for file %r (%s)' % (self.filename, exc)) + log.debug('Trace:', exc_info=True) + + def open_mht(self, data): + """ + Open a MHTML file + :param data: file contents in a string or bytes + :return: nothing + """ + log.info('Opening MHTML file %s' % self.filename) + try: + # parse the MIME content + # remove any leading whitespace or newline (workaround for issue in email package) + stripped_data = data.lstrip('\r\n\t ') + # strip any junk from the beginning of the file + # (issue #31 fix by Greg C - gdigreg) + # TODO: improve keywords to avoid false positives + mime_offset = stripped_data.find('MIME') + content_offset = stripped_data.find('Content') + # if "MIME" is found, and located before "Content": + if -1 < mime_offset <= content_offset: + stripped_data = stripped_data[mime_offset:] + # else if "Content" is found, and before "MIME" + # TODO: can it work without "MIME" at all? + elif content_offset > -1: + stripped_data = stripped_data[content_offset:] + # TODO: quick and dirty fix: insert a standard line with MIME-Version header? + mhtml = email.message_from_string(stripped_data) + # find all the attached files: + for part in mhtml.walk(): + content_type = part.get_content_type() # always returns a value + fname = part.get_filename(None) # returns None if it fails + # TODO: get content-location if no filename + log.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type)) + part_data = part.get_payload(decode=True) + # VBA macros are stored in a binary file named "editdata.mso". + # the data content is an OLE container for the VBA project, compressed + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded. 
+ # decompress the zlib data starting at offset 0x32, which is the OLE container: + # check ActiveMime header: + if isinstance(part_data, str) and is_mso_file(part_data): + log.debug('Found ActiveMime header, decompressing MSO container') + try: + ole_data = mso_file_extract(part_data) + + # TODO: check if it is actually an OLE file + # TODO: get the MSO filename from content_location? + self.ole_subfiles.append( + VBA_Parser(filename=fname, data=ole_data, + relaxed=self.relaxed)) + except OlevbaBaseException as exc: + if self.relaxed: + log.info('%s does not contain a valid OLE file (%s)' + % (fname, exc)) + log.debug('Trace:', exc_info=True) + # TODO: bug here - need to split in smaller functions/classes? + else: + raise SubstreamOpenError(self.filename, fname, exc) + else: + log.debug('type(part_data) = %s' % type(part_data)) + try: + log.debug('part_data[0:20] = %r' % part_data[0:20]) + except TypeError as err: + log.debug('part_data has no __getitem__') + # set type only if parsing succeeds + self.type = TYPE_MHTML + except OlevbaBaseException: + raise + except Exception: + log.info('Failed MIME parsing for file %r - %s' + % (self.filename, MSG_OLEVBA_ISSUES)) + log.debug('Trace:', exc_info=True) + + def open_ppt(self): + """ try to interpret self.ole_file as PowerPoint 97-2003 using PptParser + + Although self.ole_file is a valid olefile.OleFileIO, we set + self.ole_file = None in here and instead set self.ole_subfiles to the + VBA ole streams found within the main ole file. 
That makes most of the + code below treat this like an OpenXML file and only look at the + ole_subfiles (except find_vba_* which needs to explicitly check for + self.type) + """ + + log.info('Check whether OLE file is PPT') + ppt_parser.enable_logging() + try: + ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) + for vba_data in ppt.iter_vba_data(): + self.ole_subfiles.append(VBA_Parser(None, vba_data, + container='PptParser')) + log.info('File is PPT') + self.ole_file.close() # just in case + self.ole_file = None # required to make other methods look at ole_subfiles + self.type = TYPE_PPT + except Exception as exc: + if self.container == 'PptParser': + # this is a subfile of a ppt --> to be expected that is no ppt + log.debug('PPT subfile is not a PPT file') + else: + log.debug("File appears not to be a ppt file (%s)" % exc) + + + def open_text(self, data): + """ + Open a text file containing VBA or VBScript source code + :param data: file contents in a string or bytes + :return: nothing + """ + log.info('Opening text file %s' % self.filename) + # directly store the source code: + self.vba_code_all_modules = data + self.contains_macros = True + # set type only if parsing succeeds + self.type = TYPE_TEXT + + + def find_vba_projects(self): + """ + Finds all the VBA projects stored in an OLE file. + + Return None if the file is not OLE but OpenXML. + Return a list of tuples (vba_root, project_path, dir_path) for each VBA project. + vba_root is the path of the root OLE storage containing the VBA project, + including a trailing slash unless it is the root of the OLE file. + project_path is the path of the OLE stream named "PROJECT" within the VBA project. + dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. + + If this function returns an empty list for one of the supported formats + (i.e. Word, Excel, Powerpoint), then the file does not contain VBA macros. 
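As a side note, the vba_root path convention described above (a trailing slash unless the VBA storage sits at the OLE root) can be sketched with a small stdlib-only helper; the function name `vba_stream_path` is invented here for illustration and is not part of olevba:

```python
# Illustrative sketch (not part of the patch): how the vba_root convention
# from find_vba_projects composes with the PROJECT and VBA/dir stream names.
# vba_root keeps a trailing slash unless it is the root of the OLE file,
# so plain string concatenation yields valid stream paths.
def vba_stream_path(storage_parts):
    """Given the storage path of a 'VBA' storage as a list of parts,
    return (vba_root, project_path, dir_path) as find_vba_projects does."""
    vba_root = '/'.join(storage_parts[:-1])
    if vba_root != '':
        vba_root += '/'
    return (vba_root, vba_root + 'PROJECT', vba_root + 'VBA/dir')

# For a Word 97-2003 file, the VBA storage is under 'Macros':
print(vba_stream_path(['Macros', 'VBA']))  # ('Macros/', 'Macros/PROJECT', 'Macros/VBA/dir')
# For an OLE file whose VBA storage is at the root:
print(vba_stream_path(['VBA']))  # ('', 'PROJECT', 'VBA/dir')
```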
+ + :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) + for each VBA project found if OLE file + """ + log.debug('VBA_Parser.find_vba_projects') + + # if the file is not OLE but OpenXML, return None: + if self.ole_file is None and self.type != TYPE_PPT: + return None + + # if this method has already been called, return previous result: + if self.vba_projects is not None: + return self.vba_projects + + # if this is a ppt file (PowerPoint 97-2003): + # self.ole_file is None but the ole_subfiles do contain vba_projects + # (like for OpenXML files). + if self.type == TYPE_PPT: + # TODO: so far, this function is never called for PPT files, but + # if that happens, the information is lost which ole file contains + # which storage! + log.warning('Returned info is not complete for PPT types!') + self.vba_projects = [] + for subfile in self.ole_subfiles: + self.vba_projects.extend(subfile.find_vba_projects()) + return self.vba_projects + + # Find the VBA project root (different in MS Word, Excel, etc): + # - Word 97-2003: Macros + # - Excel 97-2003: _VBA_PROJECT_CUR + # - PowerPoint 97-2003: PptParser has identified ole_subfiles + # - Word 2007+: word/vbaProject.bin in zip archive, then the VBA project is the root of vbaProject.bin. 
+ # - Excel 2007+: xl/vbaProject.bin in zip archive, then same as Word + # - PowerPoint 2007+: ppt/vbaProject.bin in zip archive, then same as Word + # - Visio 2007: not supported yet (different file structure) + + # According to MS-OVBA section 2.2.1: + # - the VBA project root storage MUST contain a VBA storage and a PROJECT stream + # - The root/VBA storage MUST contain a _VBA_PROJECT stream and a dir stream + # - all names are case-insensitive + + def check_vba_stream(ole, vba_root, stream_path): + full_path = vba_root + stream_path + if ole.exists(full_path) and ole.get_type(full_path) == olefile.STGTY_STREAM: + log.debug('Found %s stream: %s' % (stream_path, full_path)) + return full_path + else: + log.debug('Missing %s stream, this is not a valid VBA project structure' % stream_path) + return False + + # start with an empty list: + self.vba_projects = [] + # Look for any storage containing those storage/streams: + ole = self.ole_file + for storage in ole.listdir(streams=False, storages=True): + log.debug('Checking storage %r' % storage) + # Look for a storage ending with "VBA": + if storage[-1].upper() == 'VBA': + log.debug('Found VBA storage: %s' % ('/'.join(storage))) + vba_root = '/'.join(storage[:-1]) + # Add a trailing slash to vba_root, unless it is the root of the OLE file: + # (used later to append all the child streams/storages) + if vba_root != '': + vba_root += '/' + log.debug('Checking vba_root="%s"' % vba_root) + + # Check if the VBA root storage also contains a PROJECT stream: + project_path = check_vba_stream(ole, vba_root, 'PROJECT') + if not project_path: continue + # Check if the VBA root storage also contains a VBA/_VBA_PROJECT stream: + vba_project_path = check_vba_stream(ole, vba_root, 'VBA/_VBA_PROJECT') + if not vba_project_path: continue + # Check if the VBA root storage also contains a VBA/dir stream: + dir_path = check_vba_stream(ole, vba_root, 'VBA/dir') + if not dir_path: continue + # Now we are pretty sure it is a VBA project 
structure + log.debug('VBA root storage: "%s"' % vba_root) + # append the results to the list as a tuple for later use: + self.vba_projects.append((vba_root, project_path, dir_path)) + return self.vba_projects + + def detect_vba_macros(self): + """ + Detect the potential presence of VBA macros in the file, by checking + if it contains VBA projects. Both OLE and OpenXML files are supported. + + Important: for now, results are accurate only for Word, Excel and PowerPoint + + Note: this method does NOT attempt to check the actual presence or validity + of VBA macro source code, so there might be false positives. + It may also detect VBA macros in files embedded within the main file, + for example an Excel workbook with macros embedded into a Word + document without macros may be detected, without distinction. + + :return: bool, True if at least one VBA project has been found, False otherwise + """ + #TODO: return None or raise exception if format not supported + #TODO: return the number of VBA projects found instead of True/False? 
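A minimal sketch of the fallback marker test detect_vba_macros applies to raw stream bytes: a stream is flagged as likely holding compressed VBA code if it contains the substring 'Attribut' (which appears in compressed VBA modules). The helper name below is hypothetical, for illustration only:

```python
# Hypothetical helper mirroring the orphan-stream heuristic used in
# detect_vba_macros: compressed VBA modules embed the keyword 'Attribut',
# so its presence in raw stream data is treated as a macro indicator.
def looks_like_compressed_vba(stream_data):
    return b'Attribut' in stream_data

print(looks_like_compressed_vba(b'\x00Attribute VB_Name = "Module1"'))
print(looks_like_compressed_vba(b'\x00\x01\x02 random bytes'))
```

Like the original code, this is a heuristic and can produce false positives on streams that merely contain the word.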
+        # if this method was already called, return the previous result:
+        if self.contains_macros is not None:
+            return self.contains_macros
+        # if OpenXML/PPT, check all the OLE subfiles:
+        if self.ole_file is None:
+            for ole_subfile in self.ole_subfiles:
+                if ole_subfile.detect_vba_macros():
+                    self.contains_macros = True
+                    return True
+            # otherwise, no macro found:
+            self.contains_macros = False
+            return False
+        # otherwise it's an OLE file, find VBA projects:
+        vba_projects = self.find_vba_projects()
+        if len(vba_projects) == 0:
+            self.contains_macros = False
+        else:
+            self.contains_macros = True
+        # Also look for VBA code in any stream including orphans
+        # (happens in some malformed files)
+        ole = self.ole_file
+        for sid in xrange(len(ole.direntries)):
+            # check if id is already done above:
+            log.debug('Checking DirEntry #%d' % sid)
+            d = ole.direntries[sid]
+            if d is None:
+                # this direntry is not part of the tree: either unused or an orphan
+                d = ole._load_direntry(sid)
+                log.debug('This DirEntry is an orphan or unused')
+            if d.entry_type == olefile.STGTY_STREAM:
+                # read data
+                log.debug('Reading data from stream %r - size: %d bytes' % (d.name, d.size))
+                try:
+                    data = ole._open(d.isectStart, d.size).read()
+                    log.debug('Read %d bytes' % len(data))
+                    if len(data) > 200:
+                        log.debug('%r...[much more data]...%r' % (data[:100], data[-50:]))
+                    else:
+                        log.debug(repr(data))
+                    if 'Attribut' in data:
+                        log.debug('Found VBA compressed code')
+                        self.contains_macros = True
+                except IOError as exc:
+                    if self.relaxed:
+                        log.info('Error when reading OLE Stream %r' % d.name)
+                        log.debug('Trace:', exc_info=True)
+                    else:
+                        raise SubstreamOpenError(self.filename, d.name, exc)
+        return self.contains_macros
+
+    def extract_macros(self):
+        """
+        Extract and decompress source code for each VBA macro found in the file
+
+        Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found
+        If the file is OLE, filename is the path of the file.
+ If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros + within the zip archive, e.g. word/vbaProject.bin. + If the file is PPT, result is as for OpenXML but filename is useless + """ + log.debug('extract_macros:') + if self.ole_file is None: + # This may be either an OpenXML/PPT or a text file: + if self.type == TYPE_TEXT: + # This is a text file, yield the full code: + yield (self.filename, '', self.filename, self.vba_code_all_modules) + else: + # OpenXML/PPT: recursively yield results from each OLE subfile: + for ole_subfile in self.ole_subfiles: + for results in ole_subfile.extract_macros(): + yield results + else: + # This is an OLE file: + self.find_vba_projects() + # set of stream ids + vba_stream_ids = set() + for vba_root, project_path, dir_path in self.vba_projects: + # extract all VBA macros from that VBA root storage: + for stream_path, vba_filename, vba_code in \ + _extract_vba(self.ole_file, vba_root, project_path, + dir_path, self.relaxed): + # store direntry ids in a set: + vba_stream_ids.add(self.ole_file._find(stream_path)) + yield (self.filename, stream_path, vba_filename, vba_code) + # Also look for VBA code in any stream including orphans + # (happens in some malformed files) + ole = self.ole_file + for sid in xrange(len(ole.direntries)): + # check if id is already done above: + log.debug('Checking DirEntry #%d' % sid) + if sid in vba_stream_ids: + log.debug('Already extracted') + continue + d = ole.direntries[sid] + if d is None: + # this direntry is not part of the tree: either unused or an orphan + d = ole._load_direntry(sid) + log.debug('This DirEntry is an orphan or unused') + if d.entry_type == olefile.STGTY_STREAM: + # read data + log.debug('Reading data from stream %r' % d.name) + data = ole._open(d.isectStart, d.size).read() + for match in re.finditer(r'\x00Attribut[^e]', data, flags=re.IGNORECASE): + start = match.start() - 3 + log.debug('Found VBA compressed code at index %X' % start) + 
compressed_code = data[start:]
+                    try:
+                        vba_code = decompress_stream(compressed_code)
+                        yield (self.filename, d.name, d.name, vba_code)
+                    except Exception as exc:
+                        # display the exception with full stack trace for debugging
+                        log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc))
+                        log.debug('Traceback:', exc_info=True)
+                        # do not raise the error, as it is unlikely to be a compressed macro stream
+
+    def extract_all_macros(self):
+        """
+        Extract and decompress source code for each VBA macro found in the file
+        by calling extract_macros(), store the results as a list of tuples
+        (filename, stream_path, vba_filename, vba_code) in self.modules.
+        See extract_macros for details.
+        """
+        if self.modules is None:
+            self.modules = []
+            for (subfilename, stream_path, vba_filename, vba_code) in self.extract_macros():
+                self.modules.append((subfilename, stream_path, vba_filename, vba_code))
+        self.nb_macros = len(self.modules)
+        return self.modules
+
+
+
+    def analyze_macros(self, show_decoded_strings=False, deobfuscate=False):
+        """
+        runs extract_macros and analyzes the source code of all VBA macros
+        found in the file.
+        """
+        if self.detect_vba_macros():
+            # if the analysis was already done, avoid doing it twice:
+            if self.analysis_results is not None:
+                return self.analysis_results
+            # variable to merge source code from all modules:
+            if self.vba_code_all_modules is None:
+                self.vba_code_all_modules = ''
+                for (_, _, _, vba_code) in self.extract_all_macros():
+                    #TODO: filter code? (each module)
+                    self.vba_code_all_modules += vba_code + '\n'
+                for (_, _, form_string) in self.extract_form_strings():
+                    self.vba_code_all_modules += form_string + '\n'
+            # Analyze the whole code at once:
+            scanner = VBA_Scanner(self.vba_code_all_modules)
+            self.analysis_results = scanner.scan(show_decoded_strings, deobfuscate)
+            autoexec, suspicious, iocs, hexstrings, base64strings, dridex, vbastrings = scanner.scan_summary()
+            self.nb_autoexec += autoexec
+            self.nb_suspicious += suspicious
+            self.nb_iocs += iocs
+            self.nb_hexstrings += hexstrings
+            self.nb_base64strings += base64strings
+            self.nb_dridexstrings += dridex
+            self.nb_vbastrings += vbastrings
+
+        return self.analysis_results
+
+
+    def reveal(self):
+        # we only want printable strings:
+        analysis = self.analyze_macros(show_decoded_strings=False)
+        # to avoid replacing short strings contained into longer strings, we sort the analysis results
+        # based on the length of the encoded string, in reverse order:
+        analysis = sorted(analysis, key=lambda type_decoded_encoded: len(type_decoded_encoded[2]), reverse=True)
+        # normally now self.vba_code_all_modules contains source code from all modules
+        deobf_code = self.vba_code_all_modules
+        for kw_type, decoded, encoded in analysis:
+            if kw_type == 'VBA string':
+                #print '%3d occurrences: %r => %r' % (deobf_code.count(encoded), encoded, decoded)
+                # need to add double quotes around the decoded strings
+                # after escaping double-quotes as double-double-quotes for VBA:
+                decoded = decoded.replace('"', '""')
+                deobf_code = deobf_code.replace(encoded, '"%s"' % decoded)
+        return deobf_code
+        #TODO: re-run the analysis several times if hex or base64 strings are revealed
+
+
+    def find_vba_forms(self):
+        """
+        Finds all the VBA forms stored in an OLE file.
+
+        Return None if the file is not OLE but OpenXML.
+        Return a list of tuples (vba_root, project_path, dir_path) for each VBA project.
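The replacement-order trick used by reveal() can be shown in isolation: sorting results by the length of the encoded string, longest first, prevents a short encoded string from clobbering part of a longer one. The sample analysis tuples below are made up for illustration:

```python
# Sketch of the substring-safe replacement order used in reveal():
# (type, decoded, encoded) results are sorted by len(encoded), descending,
# before substitution into the source code.
analysis = [('VBA string', 'Hello', 'Chr(72)&"ello"'),
            ('VBA string', 'H', 'Chr(72)')]
analysis = sorted(analysis, key=lambda t: len(t[2]), reverse=True)

code = 'a = Chr(72)&"ello": b = Chr(72)'
for kw_type, decoded, encoded in analysis:
    if kw_type == 'VBA string':
        # escape double quotes as doubled quotes, VBA-style:
        decoded = decoded.replace('"', '""')
        code = code.replace(encoded, '"%s"' % decoded)
print(code)  # a = "Hello": b = "H"
```

Had the shorter `Chr(72)` been replaced first, it would have corrupted the longer expression.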
+ vba_root is the path of the root OLE storage containing the VBA project, + including a trailing slash unless it is the root of the OLE file. + project_path is the path of the OLE stream named "PROJECT" within the VBA project. + dir_path is the path of the OLE stream named "VBA/dir" within the VBA project. + + If this function returns an empty list for one of the supported formats + (i.e. Word, Excel, Powerpoint), then the file does not contain VBA forms. + + :return: None if OpenXML file, list of tuples (vba_root, project_path, dir_path) + for each VBA project found if OLE file + """ + log.debug('VBA_Parser.find_vba_forms') + + # if the file is not OLE but OpenXML, return None: + if self.ole_file is None and self.type != TYPE_PPT: + return None + + # if this method has already been called, return previous result: + # if self.vba_projects is not None: + # return self.vba_projects + + # According to MS-OFORMS section 2.1.2 Control Streams: + # - A parent control, that is, a control that can contain embedded controls, + # MUST be persisted as a storage that contains multiple streams. + # - All parent controls MUST contain a FormControl. The FormControl + # properties are persisted to a stream (1) as specified in section 2.1.1.2. + # The name of this stream (1) MUST be "f". + # - Embedded controls that cannot themselves contain other embedded + # controls are persisted sequentially as FormEmbeddedActiveXControls + # to a stream (1) contained in the same storage as the parent control. + # The name of this stream (1) MUST be "o". + # - all names are case-insensitive + + if self.type == TYPE_PPT: + # TODO: so far, this function is never called for PPT files, but + # if that happens, the information is lost which ole file contains + # which storage! 
+ ole_files = self.ole_subfiles + log.warning('Returned info is not complete for PPT types!') + else: + ole_files = [self.ole_file, ] + + # start with an empty list: + self.vba_forms = [] + + # Loop over ole streams + for ole in ole_files: + # Look for any storage containing those storage/streams: + for storage in ole.listdir(streams=False, storages=True): + log.debug('Checking storage %r' % storage) + # Look for two streams named 'o' and 'f': + o_stream = storage + ['o'] + f_stream = storage + ['f'] + log.debug('Checking if streams %r and %r exist' % (f_stream, o_stream)) + if ole.exists(o_stream) and ole.get_type(o_stream) == olefile.STGTY_STREAM \ + and ole.exists(f_stream) and ole.get_type(f_stream) == olefile.STGTY_STREAM: + form_path = '/'.join(storage) + log.debug('Found VBA Form: %r' % form_path) + self.vba_forms.append(storage) + return self.vba_forms + + def extract_form_strings(self): + """ + Extract printable strings from each VBA Form found in the file + + Iterator: yields (filename, stream_path, vba_filename, vba_code) for each VBA macro found + If the file is OLE, filename is the path of the file. + If the file is OpenXML, filename is the path of the OLE subfile containing VBA macros + within the zip archive, e.g. word/vbaProject.bin. 
+ If the file is PPT, result is as for OpenXML but filename is useless + """ + if self.ole_file is None: + # This may be either an OpenXML/PPT or a text file: + if self.type == TYPE_TEXT: + # This is a text file, return no results: + return + else: + # OpenXML/PPT: recursively yield results from each OLE subfile: + for ole_subfile in self.ole_subfiles: + for results in ole_subfile.extract_form_strings(): + yield results + else: + # This is an OLE file: + self.find_vba_forms() + ole = self.ole_file + for form_storage in self.vba_forms: + o_stream = form_storage + ['o'] + log.debug('Opening form object stream %r' % '/'.join(o_stream)) + form_data = ole.openstream(o_stream).read() + # Extract printable strings from the form object stream "o": + for m in re_printable_string.finditer(form_data): + log.debug('Printable string found in form: %r' % m.group()) + yield (self.filename, '/'.join(o_stream), m.group()) + + + def close(self): + """ + Close all the open files. This method must be called after usage, if + the application is opening many files. + """ + if self.ole_file is None: + if self.ole_subfiles is not None: + for ole_subfile in self.ole_subfiles: + ole_subfile.close() + else: + self.ole_file.close() + + + +class VBA_Parser_CLI(VBA_Parser): + """ + VBA parser and analyzer, adding methods for the command line interface + of olevba. (see VBA_Parser) + """ + + def __init__(self, *args, **kwargs): + """ + Constructor for VBA_Parser_CLI. + Calls __init__ from VBA_Parser with all arguments --> see doc there + """ + super(VBA_Parser_CLI, self).__init__(*args, **kwargs) + + + def print_analysis(self, show_decoded_strings=False, deobfuscate=False): + """ + Analyze the provided VBA code, and print the results in a table + + :param vba_code: str, VBA source code to be analyzed + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. 
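The printable-string pass that extract_form_strings runs over the form's "o" stream can be approximated with a simple regex; the pattern below is a stand-in for olevba's re_printable_string, not its exact definition:

```python
import re

# Stand-in for olevba's re_printable_string: runs of 4+ printable ASCII bytes.
re_printable = re.compile(b'[\x20-\x7e]{4,}')

# Made-up example of raw form object stream data:
form_stream = b'\x00\x01Label1\x00\x02http://example.com/payload\x00'
strings = [m.group().decode('ascii') for m in re_printable.finditer(form_stream)]
print(strings)  # ['Label1', 'http://example.com/payload']
```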
+ :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + :return: None + """ + # print a waiting message only if the output is not redirected to a file: + if sys.stdout.isatty(): + print('Analysis...\r', end='') + sys.stdout.flush() + results = self.analyze_macros(show_decoded_strings, deobfuscate) + if results: + t = prettytable.PrettyTable(('Type', 'Keyword', 'Description')) + t.align = 'l' + t.max_width['Type'] = 10 + t.max_width['Keyword'] = 20 + t.max_width['Description'] = 39 + for kw_type, keyword, description in results: + # handle non printable strings: + if not is_printable(keyword): + keyword = repr(keyword) + if not is_printable(description): + description = repr(description) + t.add_row((kw_type, keyword, description)) + print(t) + else: + print('No suspicious keyword or IOC found.') + + def print_analysis_json(self, show_decoded_strings=False, deobfuscate=False): + """ + Analyze the provided VBA code, and return the results in json format + + :param vba_code: str, VBA source code to be analyzed + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + + :return: dict + """ + # print a waiting message only if the output is not redirected to a file: + if sys.stdout.isatty(): + print('Analysis...\r', end='') + sys.stdout.flush() + return [dict(type=kw_type, keyword=keyword, description=description) + for kw_type, keyword, description in self.analyze_macros(show_decoded_strings, deobfuscate)] + + def process_file(self, show_decoded_strings=False, + display_code=True, hide_attributes=True, + vba_code_only=False, show_deobfuscated_code=False, + deobfuscate=False): + """ + Process a single file + + :param filename: str, path and filename of file on disk, or within the container. + :param data: bytes, content of the file if it is in a container, None if it is a file on disk. 
+ :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. + :param display_code: bool, if False VBA source code is not displayed (default True) + :param global_analysis: bool, if True all modules are merged for a single analysis (default), + otherwise each module is analyzed separately (old behaviour) + :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + """ + #TODO: replace print by writing to a provided output file (sys.stdout by default) + # fix conflicting parameters: + if vba_code_only and not display_code: + display_code = True + if self.container: + display_filename = '%s in %s' % (self.filename, self.container) + else: + display_filename = self.filename + print('=' * 79) + print('FILE: %s' % display_filename) + try: + #TODO: handle olefile errors, when an OLE file is malformed + print('Type: %s'% self.type) + if self.detect_vba_macros(): + #print 'Contains VBA Macros:' + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): + if hide_attributes: + # hide attribute lines: + vba_code_filtered = filter_vba(vba_code) + else: + vba_code_filtered = vba_code + print('-' * 79) + print('VBA MACRO %s ' % vba_filename) + print('in file: %s - OLE stream: %s' % (subfilename, repr(stream_path))) + if display_code: + print('- ' * 39) + # detect empty macros: + if vba_code_filtered.strip() == '': + print('(empty macro)') + else: + print(vba_code_filtered) + for (subfilename, stream_path, form_string) in self.extract_form_strings(): + print('-' * 79) + print('VBA FORM STRING IN %r - OLE stream: %r' % (subfilename, stream_path)) + print('- ' * 39) + print(form_string) + if not vba_code_only: + # analyse the code from all modules at once: + self.print_analysis(show_decoded_strings, deobfuscate) + if show_deobfuscated_code: + print('MACRO SOURCE CODE WITH 
DEOBFUSCATED VBA STRINGS (EXPERIMENTAL):\n\n') + print(self.reveal()) + else: + print('No VBA macros found.') + except OlevbaBaseException: + raise + except Exception as exc: + # display the exception with full stack trace for debugging + log.info('Error processing file %s (%s)' % (self.filename, exc)) + log.debug('Traceback:', exc_info=True) + raise ProcessingError(self.filename, exc) + print('') + + + def process_file_json(self, show_decoded_strings=False, + display_code=True, hide_attributes=True, + vba_code_only=False, show_deobfuscated_code=False, + deobfuscate=False): + """ + Process a single file + + every "show" or "print" here is to be translated as "add to json" + + :param filename: str, path and filename of file on disk, or within the container. + :param data: bytes, content of the file if it is in a container, None if it is a file on disk. + :param show_decoded_strings: bool, if True hex-encoded strings will be displayed with their decoded content. + :param display_code: bool, if False VBA source code is not displayed (default True) + :param global_analysis: bool, if True all modules are merged for a single analysis (default), + otherwise each module is analyzed separately (old behaviour) + :param hide_attributes: bool, if True the first lines starting with "Attribute VB" are hidden (default) + :param deobfuscate: bool, if True attempt to deobfuscate VBA expressions (slow) + """ + #TODO: fix conflicting parameters (?) 
+ + if vba_code_only and not display_code: + display_code = True + + result = {} + + if self.container: + result['container'] = self.container + else: + result['container'] = None + result['file'] = self.filename + result['json_conversion_successful'] = False + result['analysis'] = None + result['code_deobfuscated'] = None + result['do_deobfuscate'] = deobfuscate + + try: + #TODO: handle olefile errors, when an OLE file is malformed + result['type'] = self.type + macros = [] + if self.detect_vba_macros(): + for (subfilename, stream_path, vba_filename, vba_code) in self.extract_all_macros(): + curr_macro = {} + if hide_attributes: + # hide attribute lines: + vba_code_filtered = filter_vba(vba_code) + else: + vba_code_filtered = vba_code + + curr_macro['vba_filename'] = vba_filename + curr_macro['subfilename'] = subfilename + curr_macro['ole_stream'] = stream_path + if display_code: + curr_macro['code'] = vba_code_filtered.strip() + else: + curr_macro['code'] = None + macros.append(curr_macro) + if not vba_code_only: + # analyse the code from all modules at once: + result['analysis'] = self.print_analysis_json(show_decoded_strings, + deobfuscate) + if show_deobfuscated_code: + result['code_deobfuscated'] = self.reveal() + result['macros'] = macros + result['json_conversion_successful'] = True + except Exception as exc: + # display the exception with full stack trace for debugging + log.info('Error processing file %s (%s)' % (self.filename, exc)) + log.debug('Traceback:', exc_info=True) + raise ProcessingError(self.filename, exc) + + return result + + + def process_file_triage(self, show_decoded_strings=False, deobfuscate=False): + """ + Process a file in triage mode, showing only summary results on one line. 
+ """ + #TODO: replace print by writing to a provided output file (sys.stdout by default) + try: + #TODO: handle olefile errors, when an OLE file is malformed + if self.detect_vba_macros(): + # print a waiting message only if the output is not redirected to a file: + if sys.stdout.isatty(): + print('Analysis...\r', end='') + sys.stdout.flush() + self.analyze_macros(show_decoded_strings=show_decoded_strings, + deobfuscate=deobfuscate) + flags = TYPE2TAG[self.type] + macros = autoexec = suspicious = iocs = hexstrings = base64obf = dridex = vba_obf = '-' + if self.contains_macros: macros = 'M' + if self.nb_autoexec: autoexec = 'A' + if self.nb_suspicious: suspicious = 'S' + if self.nb_iocs: iocs = 'I' + if self.nb_hexstrings: hexstrings = 'H' + if self.nb_base64strings: base64obf = 'B' + if self.nb_dridexstrings: dridex = 'D' + if self.nb_vbastrings: vba_obf = 'V' + flags += '%s%s%s%s%s%s%s%s' % (macros, autoexec, suspicious, iocs, hexstrings, + base64obf, dridex, vba_obf) + + line = '%-12s %s' % (flags, self.filename) + print(line) + + # old table display: + # macros = autoexec = suspicious = iocs = hexstrings = 'no' + # if nb_macros: macros = 'YES:%d' % nb_macros + # if nb_autoexec: autoexec = 'YES:%d' % nb_autoexec + # if nb_suspicious: suspicious = 'YES:%d' % nb_suspicious + # if nb_iocs: iocs = 'YES:%d' % nb_iocs + # if nb_hexstrings: hexstrings = 'YES:%d' % nb_hexstrings + # # 2nd line = info + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % (self.type, macros, autoexec, suspicious, iocs, hexstrings) + except Exception as exc: + # display the exception with full stack trace for debugging only + log.debug('Error processing file %s (%s)' % (self.filename, exc), + exc_info=True) + raise ProcessingError(self.filename, exc) + + + # t = prettytable.PrettyTable(('filename', 'type', 'macros', 'autoexec', 'suspicious', 'ioc', 'hexstrings'), + # header=False, border=False) + # t.align = 'l' + # t.max_width['filename'] = 30 + # t.max_width['type'] = 10 + # t.max_width['macros'] 
= 6 + # t.max_width['autoexec'] = 6 + # t.max_width['suspicious'] = 6 + # t.max_width['ioc'] = 6 + # t.max_width['hexstrings'] = 6 + # t.add_row((filename, ftype, macros, autoexec, suspicious, iocs, hexstrings)) + # print t + + +#=== MAIN ===================================================================== + +def main(): + """ + Main function, called when olevba is run from the command line + """ + DEFAULT_LOG_LEVEL = "warning" # Default log level + LOG_LEVELS = { + 'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL + } + + usage = 'usage: %prog [options] <filename> [filename2 ...]' + parser = optparse.OptionParser(usage=usage) + # parser.add_option('-o', '--outfile', dest='outfile', + # help='output file') + # parser.add_option('-c', '--csv', dest='csv', + # help='export results to a CSV file') + parser.add_option("-r", action="store_true", dest="recursive", + help='find files recursively in subdirectories.') + parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, + help='if the file is a zip archive, open all files from it, using the provided password (requires Python 2.6+)') + parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', + help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') + # output mode; could make this even simpler with add_option(type='choice') but that would make + # cmd line interface incompatible...
+ modes = optparse.OptionGroup(parser, title='Output mode (mutually exclusive)') + modes.add_option("-t", '--triage', action="store_const", dest="output_mode", + const='triage', default='unspecified', + help='triage mode, display results as a summary table (default for multiple files)') + modes.add_option("-d", '--detailed', action="store_const", dest="output_mode", + const='detailed', default='unspecified', + help='detailed mode, display full results (default for single file)') + modes.add_option("-j", '--json', action="store_const", dest="output_mode", + const='json', default='unspecified', + help='json mode, detailed in json format (never default)') + parser.add_option_group(modes) + parser.add_option("-a", '--analysis', action="store_false", dest="display_code", default=True, + help='display only analysis results, not the macro source code') + parser.add_option("-c", '--code', action="store_true", dest="vba_code_only", default=False, + help='display only VBA source code, do not analyze it') + parser.add_option("--decode", action="store_true", dest="show_decoded_strings", + help='display all the obfuscated strings with their decoded content (Hex, Base64, StrReverse, Dridex, VBA).') + parser.add_option("--attr", action="store_false", dest="hide_attributes", default=True, + help='display the attribute lines at the beginning of VBA source code') + parser.add_option("--reveal", action="store_true", dest="show_deobfuscated_code", + help='display the macro source code after replacing all the obfuscated strings by their decoded content.') + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical (default=%default)") + parser.add_option('--deobf', dest="deobfuscate", action="store_true", default=False, + help="Attempt to deobfuscate VBA expressions (slow)") + parser.add_option('--relaxed', dest="relaxed", action="store_true", default=False, + help="Do not raise errors if 
opening of substream fails") + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + print(__doc__) + parser.print_help() + sys.exit(RETURN_WRONG_ARGS) + + # provide info about tool and its version + if options.output_mode == 'json': + # prints opening [ + print_json(script_name='olevba', version=__version__, + url='http://decalage.info/python/oletools', + type='MetaInformation') + else: + print('olevba %s - http://decalage.info/python/oletools' % __version__) + + logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') + # enable logging in the modules: + log.setLevel(logging.NOTSET) + + # Old display with number of items detected: + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('Type', 'Macros', 'AutoEx', 'Susp.', 'IOCs', 'HexStr') + # print '%-8s %-7s %-7s %-7s %-7s %-7s' % ('-'*8, '-'*7, '-'*7, '-'*7, '-'*7, '-'*7) + + # with the option --reveal, make sure --deobf is also enabled: + if options.show_deobfuscated_code and not options.deobfuscate: + log.info('set --deobf because --reveal was set') + options.deobfuscate = True + if options.output_mode == 'triage' and options.show_deobfuscated_code: + log.info('ignoring option --reveal in triage output mode') + + # Column headers (do not know how many files there will be yet, so if no output_mode + # was specified, we will print triage for first file --> need these headers) + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %-65s' % ('Flags', 'Filename')) + print('%-12s %-65s' % ('-' * 11, '-' * 65)) + + previous_container = None + count = 0 + container = filename = data = None + vba_parser = None + return_code = RETURN_OK + try: + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, + zip_password=options.zip_password, zip_fname=options.zip_fname): + # ignore directory names stored in zip files: + if container and filename.endswith('/'): + continue + + # handle errors from xglob 
+ if isinstance(data, Exception): + if isinstance(data, PathNotFoundException): + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - File not found' % ('?', filename)) + elif options.output_mode != 'json': + log.error('Given path %r does not exist!' % filename) + return_code = RETURN_FILE_NOT_FOUND if return_code == 0 \ + else RETURN_SEVERAL_ERRS + else: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - Failed to read from zip file %s' % ('?', filename, container)) + elif options.output_mode != 'json': + log.error('Exception opening/reading %r from zip file %r: %s' + % (filename, container, data)) + return_code = RETURN_XGLOB_ERR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + if options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(data).__name__, message=str(data)) + continue + + try: + # Open the file + vba_parser = VBA_Parser_CLI(filename, data=data, container=container, + relaxed=options.relaxed) + + if options.output_mode == 'detailed': + # fully detailed output + vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, + display_code=options.display_code, + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, + show_deobfuscated_code=options.show_deobfuscated_code, + deobfuscate=options.deobfuscate) + elif options.output_mode in ('triage', 'unspecified'): + # print container name when it changes: + if container != previous_container: + if container is not None: + print('\nFiles in %s:' % container) + previous_container = container + # summarized output for triage: + vba_parser.process_file_triage(show_decoded_strings=options.show_decoded_strings, + deobfuscate=options.deobfuscate) + elif options.output_mode == 'json': + print_json( + vba_parser.process_file_json(show_decoded_strings=options.show_decoded_strings, + display_code=options.display_code, + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, + 
show_deobfuscated_code=options.show_deobfuscated_code, + deobfuscate=options.deobfuscate)) + else: # (should be impossible) + raise ValueError('unexpected output mode: "{0}"!'.format(options.output_mode)) + count += 1 + + except (SubstreamOpenError, UnexpectedDataError) as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - Error opening substream or unexpected ' \ + 'content' % ('?', filename)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, message=str(exc)) + else: + log.exception('Error opening substream or unexpected ' + 'content in %s' % filename) + return_code = RETURN_OPEN_ERROR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + except FileOpenError as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - File format not supported' % ('?', filename)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, message=str(exc)) + else: + log.exception('Failed to open %s -- probably not supported!' % filename) + return_code = RETURN_OPEN_ERROR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + except ProcessingError as exc: + if options.output_mode in ('triage', 'unspecified'): + print('%-12s %s - %s' % ('!ERROR', filename, exc.orig_exc)) + elif options.output_mode == 'json': + print_json(file=filename, type='error', + error=type(exc).__name__, + message=str(exc.orig_exc)) + else: + log.exception('Error processing file %s (%s)!'
+ % (filename, exc.orig_exc)) + return_code = RETURN_PARSE_ERROR if return_code == 0 \ + else RETURN_SEVERAL_ERRS + finally: + if vba_parser is not None: + vba_parser.close() + + if options.output_mode == 'triage': + print('\n(Flags: OpX=OpenXML, XML=Word2003XML, MHT=MHTML, TXT=Text, M=Macros, ' \ + 'A=Auto-executable, S=Suspicious keywords, I=IOCs, H=Hex strings, ' \ + 'B=Base64 strings, D=Dridex strings, V=VBA strings, ?=Unknown)\n') + + if count == 1 and options.output_mode == 'unspecified': + # if options -t, -d and -j were not specified and it's a single file, print details: + vba_parser.process_file(show_decoded_strings=options.show_decoded_strings, + display_code=options.display_code, + hide_attributes=options.hide_attributes, vba_code_only=options.vba_code_only, + show_deobfuscated_code=options.show_deobfuscated_code, + deobfuscate=options.deobfuscate) + + if options.output_mode == 'json': + # print last json entry (a last one without a comma) and closing ] + print_json(type='MetaInformation', return_code=return_code, + n_processed=count, _json_is_last=True) + + except Exception as exc: + # some unexpected error, maybe some of the types caught in except clauses + # above were not sufficient. This is very bad, so log complete trace at exception level + # and do not care about output mode + log.exception('Unhandled exception in main: %s' % exc, exc_info=True) + return_code = RETURN_UNEXPECTED # even if there were others before -- this is more important + # TODO: print msg with URL to report issues (except in JSON mode) + + # done. 
exit + log.debug('will exit now with code %s' % return_code) + sys.exit(return_code) + +if __name__ == '__main__': + main() + +# This was coded while listening to "Dust" from I Love You But I've Chosen Darkness diff -Nru remnux-oletools-0.51a/remnux-oletools/ppt_parser.py remnux-oletools-0.51a/remnux-oletools/ppt_parser.py --- remnux-oletools-0.51a/remnux-oletools/ppt_parser.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/ppt_parser.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,1583 @@ +""" Parse a ppt (MS PowerPoint 97-2003) file + +Based on olefile, parse the ppt-specific info + +Code much influenced by olevba._extract_vba but much more object-oriented +(possibly slightly excessively so) + +Currently quite narrowly focused on extracting VBA from ppt files, no slides or +stuff, but built to be extended to parsing more/all of the file + +References: +* https://msdn.microsoft.com/en-us/library/dd921564%28v=office.12%29.aspx + and links there-in +""" + +# === LICENSE ================================================================= +# TODO +#------------------------------------------------------------------------------ +# TODO: +# - make stream optional in PptUnexpectedData +# - can speed-up by using less bigger struct.parse calls? +# - license +# - make buffered stream from output of iterative_decompress +# - maybe can merge the 2 decorators into 1? 
(with_opened_main_stream) +# +# CHANGELOG: +# 2016-05-04 v0.01 CH: - start parsing "Current User" stream +# 2016-07-20 v0.50 SL: - added Python 3 support +# 2016-09-13 PL: - fixed olefile import for Python 2+3 +# - fixed format strings for Python 2.6 (issue #75) + +__version__ = '0.50' + + +# --- IMPORTS ------------------------------------------------------------------ + +import sys +import logging +import struct +import traceback +import os + +try: + # absolute import when oletools is installed + import oletools.thirdparty.olefile as olefile +except: + # relative import otherwise + import thirdparty.olefile as olefile + +import zlib + + +# a global logger object used for debugging: +log = olefile.get_logger('ppt') + + +def enable_logging(): + """ + Enable logging for this module (disabled by default). + This will set the module-specific logger level to NOTSET, which + means the main application controls the actual logging level. + """ + log.setLevel(logging.NOTSET) + + +#--- CONSTANTS ---------------------------------------------------------------- + +# name of main stream +MAIN_STREAM_NAME = 'PowerPoint Document' + +# URL and message to report issues: +URL_OLEVBA_ISSUES = 'https://bitbucket.org/decalage/oletools/issues' +MSG_OLEVBA_ISSUES = 'Please report this issue on %s' % URL_OLEVBA_ISSUES + + +# === EXCEPTIONS ============================================================== + + +class PptUnexpectedData(Exception): + """ raise by PptParser if some field's value is not as expected """ + def __init__(self, stream, field_name, found_value, expected_value): + self.msg = \ + 'In stream "{0}" for field "{1}" found value "{2}" but expected {3}!' 
\ + .format(stream, field_name, found_value, expected_value) + super(PptUnexpectedData, self).__init__(self.msg) + + +# === HELPERS ================================================================= + +def read_1(stream): + """ read 1 byte from stream """ + return struct.unpack('= expect_upper: + is_err = True + + if is_err: + clz_name = self.__class__.__name__ + if expect_lower is None: + expect_str = '< {0:04X}'.format(expect_upper) + elif expect_upper is None: + expect_str = '> {0:04X}'.format(expect_lower) + else: + expect_str = 'within ({0:04X}, {1:04X})'.format(expect_lower, + expect_upper) + return [PptUnexpectedData(self.stream_name, clz_name + '.' + name, + '{0:04X}'.format(value), expect_str), ] + else: + return [] + + def check_rec_head(self, length=None): + """ to be called by check_validity to check the self.rec_head + + uses self.RECORD_... constants, (not quite that constant for DummyType) + """ + + errs = [] + errs.extend(self.check_value('rec_head.recVer', self.rec_head.rec_ver, + self.RECORD_VERSION)) + errs.extend(self.check_value('rec_head.recInstance', + self.rec_head.rec_instance, + self.RECORD_INSTANCE)) + if self.RECORD_TYPE is None: + raise NotImplementedError('RECORD_TYPE not specified!') + errs.extend(self.check_value('rec_head.recType', + self.rec_head.rec_type, + self.RECORD_TYPE)) + if length is not None: + errs.extend(self.check_value('rec_head.recLen', + self.rec_head.rec_len, length)) + return errs + + @classmethod + def generate_pattern(clz, rec_len=None): + """ call RecordHeader.generate with values for this type """ + return RecordHeader.generate(clz.RECORD_TYPE, rec_len, + clz.RECORD_INSTANCE, clz.RECORD_VERSION) + + +class CurrentUserAtom(PptType): + """ An atom record that specifies information about the last user to modify + the file and where the most recent user edit is located. This is the only + record in the Current User Stream (section 2.1.1). 
+ + https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx + """ + + # allowed values for header_token + HEADER_TOKEN_ENCRYPT = 0xF3D1C4DF + HEADER_TOKEN_NOCRYPT = 0xE391C05F + + # allowed values for rel_version + REL_VERSION_CAN_USE = 0x00000008 + REL_VERSION_NO_USE = 0x00000009 + + # required values + RECORD_TYPE = 0x0FF6 + SIZE = 0x14 + DOC_FILE_VERSION = 0x03F4 + MAJOR_VERSION = 0x03 + MINOR_VERSION = 0x00 + + def __init__(self): + super(CurrentUserAtom, self).__init__(stream_name='Current User') + self.rec_head = None + self.size = None + self.header_token = None + self.offset_to_current_edit = None + self.len_user_name = None + self.doc_file_version = None + self.major_version = None + self.minor_version = None + self.ansi_user_name = None + self.unicode_user_name = None + self.rel_version = None + + def is_encrypted(self): + return self.header_token == self.HEADER_TOKEN_ENCRYPT + + @classmethod + def extract_from(clz, stream): + """ create instance with info from stream """ + + obj = clz() + + # parse record header + obj.rec_head = RecordHeader.extract_from(stream) + + obj.size, = struct.unpack('= offset: + errs.append(PptUnexpectedData( + 'PowerPoint Document', 'UserEditAtom.offsetLastEdit', + self.offset_last_edit, '< {0}'.format(offset))) + if self.offset_persist_directory >= offset or \ + self.offset_persist_directory <= self.offset_last_edit: + errs.append(PptUnexpectedData( + 'PowerPoint Document', + 'UserEditAtom.offsetPersistDirectory', + self.offset_last_edit, + 'in ({0}, {1})'.format(self.offset_last_edit, offset))) + errs.extend(self.check_value('docPersistIdRef', + self.doc_persist_id_ref, 1)) + return errs + + # TODO: offer to check persist_id_seed given PersistDirectoryAtom) + + +class DummyType(PptType): + """ a type that is found in ppt documents we are not interested in + + instead of parsing many uninteresting types, we just read their + RecordHeader and set the RECORD_... 
values on an instance- (instead of + class-) level + + used to skip over uninteresting types in e.g. DocumentContainer + """ + + def __init__(self, type_name, record_type, rec_ver=0, rec_instance=0, + rec_len=None): + super(DummyType, self).__init__() + self.type_name = type_name + self.RECORD_TYPE = record_type + self.RECORD_VERSION = rec_ver + self.RECORD_INSTANCE = rec_instance + self.record_length = rec_len + + def extract_from(self, stream): + """ extract record header and just skip as many bytes as header says + + Since this requires RECORD_... values set in constructor, this is NOT + a classmethod like all the other extract_from! + + Otherwise this tries to be compatible with other extract_from methods + (e.g. returns self) + """ + self.read_rec_head(stream) + log.debug('skipping over {0} Byte for type {1}' + .format(self.rec_head.rec_len, self.type_name)) + log.debug('start at pos {0}'.format(stream.tell())) + stream.seek(self.rec_head.rec_len, os.SEEK_CUR) + log.debug('now at pos {0}'.format(stream.tell())) + return self + + def check_validity(self): + return self.check_rec_head(self.record_length) + + +class PersistDirectoryAtom(PptType): + """ one part of a persist object directory with unique persist object id + + contains PersistDirectoryEntry objects + + https://msdn.microsoft.com/en-us/library/dd952680%28v=office.12%29.aspx + """ + + RECORD_TYPE = 0x1772 + + def __init__(self): + super(PersistDirectoryAtom, self).__init__() + self.rg_persist_dir_entry = None # actually, this will be an array + self.stream_offset = None + + @classmethod + def extract_from(clz, stream): + """ create and return object with data from given stream """ + + log.debug("Extracting a PersistDirectoryAtom from stream") + obj = clz() + + # remember own offset for checking validity + obj.stream_offset = stream.tell() + + # parse record header + obj.read_rec_head(stream) + + # read directory entries from list until reach size for this object + curr_pos = stream.tell() + stop_pos = 
curr_pos + obj.rec_head.rec_len + log.debug('start reading at pos {0}, read until {1}' + .format(curr_pos, stop_pos)) + obj.rg_persist_dir_entry = [] + + while curr_pos < stop_pos: + new_entry = PersistDirectoryEntry.extract_from(stream) + obj.rg_persist_dir_entry.append(new_entry) + curr_pos = stream.tell() + log.debug('at pos {0}'.format(curr_pos)) + return obj + + def check_validity(self, user_edit_last_offset=None): + errs = self.check_rec_head() + for entry in self.rg_persist_dir_entry: + errs.extend(entry.check_validity(user_edit_last_offset, + self.stream_offset)) + return errs + + +class PersistDirectoryEntry(object): + """ an entry contained in a PersistDirectoryAtom.rg_persist_dir_entry + + A structure that specifies a compressed table of sequential persist object + identifiers and stream offsets to associated persist objects. + + NOT a subclass of PptType because has no RecordHeader + + https://msdn.microsoft.com/en-us/library/dd947347%28v=office.12%29.aspx + """ + + def __init__(self): + self.persist_id = None + self.c_persist = None + self.rg_persist_offset = None + + @classmethod + def extract_from(clz, stream): + # take a 4-byte (=32bit) number, divide into 20bit and 12 bit) + log.debug("Extracting a PersistDirectoryEntry from stream") + obj = clz() + + # persistId (20 bits): An unsigned integer that specifies a starting + # persist object identifier. It MUST be less than or equal to 0xFFFFE. + # The first entry in rgPersistOffset is associated with persistId. The + # next entry, if present, is associated with persistId plus 1. Each + # entry in rgPersistOffset is associated with a persist object + # identifier in this manner, with the final entry associated with + # persistId + cPersist - 1. + + # cPersist (12 bits): An unsigned integer that specifies the count of + # items in rgPersistOffset. It MUST be greater than or equal to 0x001. 
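The persistId/cPersist comments above describe a single little-endian 32-bit value that packs persistId into its low 20 bits and cPersist into its high 12 bits. A minimal standalone sketch of that unpacking (the helper name and the sample values are illustrative, not part of this module):

```python
import struct

def split_persist_dir_entry(raw4):
    """Split the packed 32-bit PersistDirectoryEntry prefix into
    persistId (low 20 bits) and cPersist (high 12 bits)."""
    temp, = struct.unpack('<I', raw4)      # little-endian unsigned 32-bit
    persist_id = temp & 0xFFFFF            # low 20 bits
    c_persist = (temp >> 20) & 0xFFF       # high 12 bits
    return persist_id, c_persist

# persistId=1, cPersist=3 packed as (3 << 20) | 1
print(split_persist_dir_entry(struct.pack('<I', (3 << 20) | 1)))  # → (1, 3)
```

cPersist then gives the number of 4-byte PersistOffsetEntry values that follow in rgPersistOffset.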
+ temp, = struct.unpack('<I', stream.read(4)) + obj.persist_id = temp & 0xFFFFF # low 20 bits + obj.c_persist = (temp >> 20) & 0xFFF # high 12 bits + log.debug('read {0:08X} --> id is {1}, reading {2} offsets' + .format(temp, obj.persist_id, obj.c_persist)) + + # rgPersistOffset (variable): An array of PersistOffsetEntry (section + # 2.3.6) that specifies stream offsets to persist objects. The count of + # items in the array is specified by cPersist. The value of each item + # MUST be greater than or equal to offsetLastEdit in the corresponding + # user edit and MUST be less than the offset, in bytes, of the + # corresponding persist object directory. + # PersistOffsetEntry: An unsigned 4-byte integer that specifies an + # offset, in bytes, from the beginning of the PowerPoint Document + # Stream (section 2.1.2) to a persist object. + obj.rg_persist_offset = [struct.unpack('<I', stream.read(4))[0] + for _ in range(obj.c_persist)] + return obj + + def check_validity(self, user_edit_last_offset=None, + persist_obj_dir_offset=None): + errs = [] + if self.persist_id > 0xFFFFE: # (--> == 0xFFFFF since 20bit) + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.persist_id', + self.persist_id, '< 0xFFFFE (dec: {0})'.format(0xFFFFE))) + if self.c_persist == 0: + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.c_persist', + self.c_persist, '> 0')) + if user_edit_last_offset is not None \ + and min(self.rg_persist_offset) < user_edit_last_offset: + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset', + min(self.rg_persist_offset), + '> UserEdit.offsetLastEdit ({0})' + .format(user_edit_last_offset))) + if persist_obj_dir_offset is not None \ + and max(self.rg_persist_offset) > persist_obj_dir_offset: + errs.append(PptUnexpectedData( + MAIN_STREAM_NAME, 'PersistDirectoryEntry.rg_persist_offset', + max(self.rg_persist_offset), + '> PersistObjectDirectory offset ({0})' + .format(persist_obj_dir_offset))) + return errs + + +class DocInfoListSubContainerOrAtom(PptType): + """ one of various types found in a DocInfoListContainer + + https://msdn.microsoft.com/en-us/library/dd921705%28v=office.12%29.aspx + + actual type of this object is defined by the recType field in its Record + Head + + Similar to DummyType, RECORD_TYPE
varies from instance to instance for this
+    type
+    """
+
+    # RECORD_TYPE varies, is specified only in extract_from
+    VALID_RECORD_TYPES = [0x1388,  # self.RECORD_TYPE_PROG_TAGS
+                          0x0414,  # self.RECORD_TYPE_NORMAL_VIEW_SET_INFO_9
+                          0x0413,  # self.RECORD_TYPE_NOTES_TEXT_VIEW_INFO_9
+                          0x0407,  # self.RECORD_TYPE_OUTLINE_VIEW_INFO
+                          0x03FA,  # self.RECORD_TYPE_SLIDE_VIEW_INFO
+                          0x0408]  # self.RECORD_TYPE_SORTER_VIEW_INFO
+
+    def __init__(self):
+        super(DocInfoListSubContainerOrAtom, self).__init__()
+
+    @classmethod
+    def extract_from(clz, stream):
+        """ build instance with info read from stream """
+
+        log.debug('Parsing DocInfoListSubContainerOrAtom from stream')
+
+        obj = clz()
+        obj.read_rec_head(stream)
+        if obj.rec_head.rec_type == VBAInfoContainer.RECORD_TYPE:
+            obj = VBAInfoContainer.extract_from(stream, obj.rec_head)
+        else:
+            log.debug('skipping over {0} Byte in DocInfoListSubContainerOrAtom'
+                      .format(obj.rec_head.rec_len))
+            log.debug('start at pos {0}'.format(stream.tell()))
+            stream.seek(obj.rec_head.rec_len, os.SEEK_CUR)
+            log.debug('now at pos {0}'.format(stream.tell()))
+        return obj
+
+    def check_validity(self):
+        """ can be any of multiple types """
+        # check_value returns a list of errors; return it to the caller
+        return self.check_value('rh.recType', self.rec_head.rec_type,
+                                self.VALID_RECORD_TYPES)
+
+
+class DocInfoListContainer(PptType):
+    """ information about the document and document display settings
+
+    https://msdn.microsoft.com/en-us/library/dd926767%28v=office.12%29.aspx
+    """
+
+    RECORD_VERSION = 0xF
+    RECORD_TYPE = 0x07D0
+
+    def __init__(self):
+        super(DocInfoListContainer, self).__init__()
+        self.rg_child_rec = None
+
+    @classmethod
+    def extract_from(clz, stream):
+        """ build instance with info read from stream """
+
+        log.debug('Parsing DocInfoListContainer from stream')
+        obj = clz()
+        obj.read_rec_head(stream)
+
+        # rgChildRec (variable): An array of DocInfoListSubContainerOrAtom
+        # records (section 2.4.5) that specifies information about the document
+        # or how the document is displayed. The size, in bytes, of the array is
+        # specified by rh.recLen
+        curr_pos = stream.tell()
+        end_pos = curr_pos + obj.rec_head.rec_len
+        log.debug('start reading at pos {0}, will read until {1}'
+                  .format(curr_pos, end_pos))
+        obj.rg_child_rec = []
+
+        while curr_pos < end_pos:
+            # extract_from is a classmethod, so call it on the class itself
+            new_obj = DocInfoListSubContainerOrAtom.extract_from(stream)
+            obj.rg_child_rec.append(new_obj)
+            curr_pos = stream.tell()
+            log.debug('now at pos {0}'.format(curr_pos))
+
+        log.debug('reached end pos {0} ({1}). stop reading DocInfoListContainer'
+                  .format(end_pos, curr_pos))
+        return obj
+
+    def check_validity(self):
+        errs = self.check_rec_head()
+        for obj in self.rg_child_rec:
+            errs.extend(obj.check_validity())
+        return errs
+
+
+class DocumentContainer(PptType):
+    """ a DocumentContainer record
+
+    https://msdn.microsoft.com/en-us/library/dd947357%28v=office.12%29.aspx
+    """
+
+    RECORD_TYPE = 0x03E8
+
+    def __init__(self):
+        super(DocumentContainer, self).__init__()
+        self.document_atom = None
+        self.ex_obj_list = None
+        self.document_text_info = None
+        self.sound_collection = None
+        self.drawing_group = None
+        self.master_list = None
+        self.doc_info_list = None
+        self.slide_hf = None
+        self.notes_hf = None
+        self.slide_list = None
+        self.notes_list = None
+        self.slide_show_doc_info = None
+        self.named_shows = None
+        self.summary = None
+        self.doc_routing_slip = None
+        self.print_options = None
+        self.rt_custom_table_styles_1 = None
+        self.end_document = None
+        self.rt_custom_table_styles_2 = None
+
+    @classmethod
+    def extract_from(clz, stream):
+        """ create object with values from given stream
+
+        stream is assumed to be positioned correctly
+
+        this container contains lots of data we are not interested in.
+ """ + + log.debug('Parsing DocumentContainer from stream') + obj = clz() + + # parse record header + obj.read_rec_head(stream) + log.info('validity: {0} errs'.format(len(obj.check_rec_head()))) + + # documentAtom (48 bytes): A DocumentAtom record (section 2.4.2) that + # specifies size information for presentation slides and notes slides. + obj.document_atom = DummyType('DocumentAtom', 0x03E9, rec_ver=0x1, + rec_len=0x28).extract_from(stream) + log.info('validity: {0} errs' + .format(len(obj.document_atom.check_validity()))) + + # exObjList (variable): An optional ExObjListContainer record (section + # 2.10.1) that specifies the list of external objects in the document. + obj.ex_obj_list = DummyType('ExObjListContainer', 0x0409, rec_ver=0xF)\ + .extract_from(stream) + log.info('validity: {0} errs' + .format(len(obj.ex_obj_list.check_validity()))) + + # documentTextInfo (variable): A DocumentTextInfoContainer record + # (section 2.9.1) that specifies the default text styles for the + # document. + obj.document_text_info = DummyType('DocumentTextInfoContainer', 0x03F2, + rec_ver=0xF).extract_from(stream) + log.info('validity: {0} errs' + .format(len(obj.document_text_info.check_validity()))) + + # soundCollection (variable): An optional SoundCollectionContainer + # record (section 2.4.16.1) that specifies the list of sounds in the + # file. + obj.sound_collection = DummyType('SoundCollectionContainer', 0x07E4, + rec_ver=0xF, rec_instance=0x005)\ + .extract_from(stream) + log.info('validity: {0} errs' + .format(len(obj.sound_collection.check_validity()))) + + # drawingGroup (variable): A DrawingGroupContainer record (section + # 2.4.3) that specifies drawing information for the document. 
+ obj.drawing_group = DummyType('DrawingGroupContainer', 0x040B, + rec_ver=0xF).extract_from(stream) + log.info('validity: {0} errs' + .format(len(obj.drawing_group.check_validity()))) + + # masterList (variable): A MasterListWithTextContainer record (section + # 2.4.14.1) that specifies the list of main master slides and title + # master slides. + obj.master_list = DummyType('MasterListWithContainer', 0x0FF0, + rec_ver=0xF).extract_from(stream) + log.info('validity: {0} errs' + .format(len(obj.master_list.check_validity()))) + + # docInfoList (variable): An optional DocInfoListContainer record + # (section 2.4.4) that specifies additional document information. + # this is the variable we are interested in! + obj.doc_info_list = DocInfoListContainer.extract_from(stream) + + # slideHF (variable): An optional SlideHeadersFootersContainer record + # (section 2.4.15.1) that specifies the default header and footer + # information for presentation slides. + #obj.slide_hf = None + + # notesHF (variable): An optional NotesHeadersFootersContainer record + # (section 2.4.15.6) that specifies the default header and footer + # information for notes slides. + #obj.notes_hf = None + + # slideList (variable): An optional SlideListWithTextContainer record + # (section 2.4.14.3) that specifies the list of presentation slides. + #obj.slide_list = None + + # notesList (variable): An optional NotesListWithTextContainer record + # (section 2.4.14.6) that specifies the list of notes slides. + #obj.notes_list = None + + # slideShowDocInfoAtom (88 bytes): An optional SlideShowDocInfoAtom + # record (section 2.6.1) that specifies slide show information for the + # document. + #obj.slide_show_doc_info = None + + # namedShows (variable): An optional NamedShowsContainer record + # (section 2.6.2) that specifies named shows in the document. + #obj.named_shows = None + + # summary (variable): An optional SummaryContainer record (section + # 2.4.22.3) that specifies bookmarks for the document. 
+ #obj.summary = None + + # docRoutingSlipAtom (variable): An optional DocRoutingSlipAtom record + # (section 2.11.1) that specifies document routing information. + #obj.doc_routing_slip = None + + # printOptionsAtom (13 bytes): An optional PrintOptionsAtom record + # (section 2.4.12) that specifies default print options. + #obj.print_options = None + + # rtCustomTableStylesAtom1 (variable): An optional + # RoundTripCustomTableStyles12Atom record (section 2.11.13) that + # specifies round-trip information for custom table styles. + #obj.rt_custom_table_styles_1 = None + + # endDocumentAtom (8 bytes): An EndDocumentAtom record (section 2.4.13) + # that specifies the end of the information for the document. + #obj.end_document = None + + # rtCustomTableStylesAtom2 (variable): An optional + # RoundTripCustomTableStyles12Atom record that specifies round-trip + # information for custom table styles. It MUST NOT exist if + # rtCustomTableStylesAtom1 exists. + #obj.rt_custom_table_styles_2 = None + + return obj + + + def check_validity(self): + """ check all values in object for valid values """ + errs = self.check_rec_head() + errs.extend(self.document_atom.check_validity()) + errs.extend(self.ex_obj_list.check_validity()) + errs.extend(self.document_text_info.check_validity()) + errs.extend(self.sound_collection.check_validity()) + errs.extend(self.drawing_group.check_validity()) + errs.extend(self.master_list.check_validity()) + errs.extend(self.doc_info_list.check_validity()) + return errs + + +class VBAInfoContainer(PptType): + """ A container record that specifies VBA information for the document. 
+ + https://msdn.microsoft.com/en-us/library/dd952168%28v=office.12%29.aspx + """ + + RECORD_TYPE = 0x03FF + RECORD_VERSION = 0xF + RECORD_INSTANCE = 0x001 + RECORD_LENGTH = 0x14 + + def __init__(self): + super(VBAInfoContainer, self).__init__() + self.vba_info_atom = None + + @classmethod + def extract_from(clz, stream, rec_head=None): + """ since can determine this type only after reading header, it is arg + """ + log.debug('parsing VBAInfoContainer') + obj = clz() + if rec_head is None: + obj.read_rec_head(stream) + else: + log.debug('skip parsing of RecordHeader') + obj.rec_head = rec_head + obj.vba_info_atom = VBAInfoAtom.extract_from(stream) + return obj + + def check_validity(self): + errs = self.check_rec_head(length=self.RECORD_LENGTH) + errs.extend(self.vba_info_atom.check_validity()) + return errs + + +class VBAInfoAtom(PptType): + """ An atom record that specifies a reference to the VBA project storage. + + https://msdn.microsoft.com/en-us/library/dd948874%28v=office.12%29.aspx + """ + + RECORD_TYPE = 0x0400 + RECORD_VERSION = 0x2 + RECORD_LENGTH = 0x0C + + def __init__(self): + super(VBAInfoAtom, self).__init__() + self.persist_id_ref = None + self.f_has_macros = None + self.version = None + + @classmethod + def extract_from(clz, stream): + log.debug('parsing VBAInfoAtom') + obj = clz() + obj.read_rec_head(stream) + + # persistIdRef (4 bytes): A PersistIdRef (section 2.2.21) that + # specifies the value to look up in the persist object directory to + # find the offset of a VbaProjectStg record (section 2.10.40). + obj.persist_id_ref = read_4(stream) + + # fHasMacros (4 bytes): An unsigned integer that specifies whether the + # VBA project storage contains data. It MUST be 0 (empty vba storage) + # or 1 (vba storage contains data) + obj.f_has_macros = read_4(stream) + + # version (4 bytes): An unsigned integer that specifies the VBA runtime + # version that generated the VBA project storage. It MUST be + # 0x00000002. 
+ obj.version = read_4(stream) + + return obj + + def check_validity(self): + + errs = self.check_rec_head(length=self.RECORD_LENGTH) + + # must be 0 or 1: + errs.extend(self.check_range('fHasMacros', self.f_has_macros, None, 2)) + errs.extend(self.check_value('version', self.version, 2)) + return errs + + +class ExternalObjectStorage(PptType): + """ storage for compressed/uncompressed OLE/VBA/ActiveX control data + + Matches types ExOleObjStgCompressedAtom, ExOleObjStgUncompressedAtom, + VbaProjectStgCompressedAtom, VbaProjectStgUncompressedAtom, + ExControlStgUncompressedAtom, ExControlStgCompressedAtom + + Difference between compressed and uncompressed: RecordHeader.rec_instance + is 0 or 1, first variable after RecordHeader is decompressed_size + + Data is not read at first, only its offset in the stream and size is saved + + e.g. + https://msdn.microsoft.com/en-us/library/dd952169%28v=office.12%29.aspx + """ + + RECORD_TYPE = 0x1011 + RECORD_INSTANCE_COMPRESSED = 1 + RECORD_INSTANCE_UNCOMPRESSED = 0 + + def __init__(self, is_compressed=None): + super(ExternalObjectStorage, self).__init__() + if is_compressed is None: + self.RECORD_INSTANCE = None # otherwise defaults to 0 + elif is_compressed: + self.RECORD_INSTANCE = self.RECORD_INSTANCE_COMPRESSED + self.is_compressed = True + else: + self.RECORD_INSTANCE = self.RECORD_INSTANCE_UNCOMPRESSED + self.is_compressed = False + self.uncompressed_size = None + self.data_offset = None + self.data_size = None + + def extract_from(self, stream): + """ not a classmethod because of is_compressed attrib + + see also: DummyType + """ + log.debug('Parsing ExternalObjectStorage (compressed={0}) from stream' + .format(self.is_compressed)) + self.read_rec_head(stream) + self.data_size = self.rec_head.rec_len + if self.is_compressed: + self.uncompressed_size = read_4(stream) + self.data_size -= 4 + self.data_offset = stream.tell() + + def check_validity(self): + return self.check_rec_head() + + +class 
ExternalObjectStorageUncompressed(ExternalObjectStorage): + """ subclass of ExternalObjectStorage for uncompressed objects """ + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_UNCOMPRESSED + + def __init__(self): + super(ExternalObjectStorageUncompressed, self).__init__(False) + + @classmethod + def extract_from(clz, stream): + """ note the usage of super here: call instance method of super class! + """ + obj = clz() + super(ExternalObjectStorageUncompressed, obj).extract_from(stream) + return obj + + +class ExternalObjectStorageCompressed(ExternalObjectStorage): + """ subclass of ExternalObjectStorage for compressed objects """ + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_COMPRESSED + + def __init__(self): + super(ExternalObjectStorageCompressed, self).__init__(True) + + @classmethod + def extract_from(clz, stream): + """ note the usage of super here: call instance method of super class! + """ + obj = clz() + super(ExternalObjectStorageCompressed, obj).extract_from(stream) + return obj + + +# === PptParser =============================================================== + +def with_opened_main_stream(func): + """ a decorator that can open and close the default stream for func + + to be applied only to functions in PptParser that read from default stream + (:py:data:`MAIN_STREAM_NAME`) + + Decorated functions need to accept args (self, stream, ...) 
+ """ + + def wrapped(self, *args, **kwargs): + # remember who opened the stream so that function also closes it + stream_opened_by_me = False + try: + # open stream if required + if self._open_main_stream is None: + log.debug('opening stream {0!r} for {1}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) + stream_opened_by_me = True + + # run wrapped function + return func(self, self._open_main_stream, *args, **kwargs) + + # error handling + except Exception: + if self.fast_fail: + raise + else: + self._log_exception() + finally: + # ensure stream is closed by the one who opened it (even if error) + if stream_opened_by_me: + log.debug('closing stream {0!r} after {1}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream.close() + self._open_main_stream = None + return wrapped + + +def generator_with_opened_main_stream(func): + """ same as with_opened_main_stream but with yield instead of return """ + + def wrapped(self, *args, **kwargs): + # remember who opened the stream so that function also closes it + stream_opened_by_me = False + try: + # open stream if required + if self._open_main_stream is None: + log.debug('opening stream {0!r} for {1}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) + stream_opened_by_me = True + + # run actual function + for result in func(self, self._open_main_stream, *args, **kwargs): + yield result + + # error handling + except Exception: + if self.fast_fail: + raise + else: + self._log_exception() + finally: + # ensure stream is closed by the one who opened it (even if error) + if stream_opened_by_me: + log.debug('closing stream {0!r} after {1}' + .format(MAIN_STREAM_NAME, func.__name__)) + self._open_main_stream.close() + self._open_main_stream = None + return wrapped + + +class PptParser(object): + """ Parser for PowerPoint 97-2003 specific data structures + + requires an OleFileIO + """ + + def 
__init__(self, ole, fast_fail=False): + """ constructor + + :param ole: OleFileIO or anything that OleFileIO constructor accepts + :param bool fast_fail: if True, all unexpected data will raise a + PptUnexpectedData; if False will only log error + """ + if isinstance(ole, olefile.OleFileIO): + log.debug('using open OleFileIO') + self.ole = ole + else: + log.debug('Opening file ' + ole) + self.ole = olefile.OleFileIO(ole) + + self.fast_fail = fast_fail + + self.current_user_atom = None + self.newest_user_edit = None + self.document_persist_obj = None + self.persist_object_directory = None + + # basic compatibility check: root directory structure is + # [['\x05DocumentSummaryInformation'], + # ['\x05SummaryInformation'], + # ['Current User'], + # ['PowerPoint Document']] + root_streams = self.ole.listdir() + #for stream in root_streams: + # log.debug('found root stream {0!r}'.format(stream)) + if any(len(stream) != 1 for stream in root_streams): + self._fail('root', 'listdir', root_streams, 'len = 1') + root_streams = [stream[0].lower() for stream in root_streams] + if not 'current user' in root_streams: + self._fail('root', 'listdir', root_streams, 'Current User') + if not MAIN_STREAM_NAME.lower() in root_streams: + self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME) + + self._open_main_stream = None + + def _log_exception(self, msg=None): + """ log an exception instead of raising it + + call in one of 2 ways: + try: + if fail(): + self._log_exception('this is the message') + except: + self._log_exception() # only possible in except clause + """ + if msg is not None: + stack = traceback.extract_stack()[:-1] + else: + _, exc, trace = sys.exc_info() + stack = traceback.extract_tb(trace) + msg = str(exc) + log.error(msg) + + for i_entry, entry in enumerate(traceback.format_list(stack)): + for line in entry.splitlines(): + log.debug('trace {0}: {1}'.format(i_entry, line)) + + def _fail(self, *args): + """ depending on self.fast_fail raise PptUnexpectedData or 
just log err + + args as for PptUnexpectedData + """ + if self.fast_fail: + raise PptUnexpectedData(*args) + else: + self._log_exception(PptUnexpectedData(*args).msg) + + def parse_current_user(self): + """ parse the CurrentUserAtom record from stream 'Current User' + + Structure described in + https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx + """ + + if self.current_user_atom is not None: + log.warning('re-reading and overwriting ' + 'previously read current_user_atom') + + log.debug('parsing "Current User"') + + stream = None + try: + log.debug('opening stream "Current User"') + stream = self.ole.openstream('Current User') + self.current_user_atom = CurrentUserAtom.extract_from(stream) + except Exception: + if self.fast_fail: + raise + else: + self._log_exception() + finally: + if stream is not None: + log.debug('closing stream "Current User"') + stream.close() + + @with_opened_main_stream + def parse_persist_object_directory(self, stream): + """ Part 1: Construct the persist object directory """ + + if self.persist_object_directory is not None: + log.warning('re-reading and overwriting ' + 'previously read persist_object_directory') + + # Step 1: Read the CurrentUserAtom record (section 2.3.2) from the + # Current User Stream (section 2.1.1). All seek operations in the steps + # that follow this step are in the PowerPoint Document Stream. + if self.current_user_atom is None: + self.parse_current_user() + + offset = self.current_user_atom.offset_to_current_edit + is_encrypted = self.current_user_atom.is_encrypted() + self.persist_object_directory = {} + self.newest_user_edit = None + + # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. + while offset != 0: + + # Step 2: Seek, in the PowerPoint Document Stream, to the + # offset specified by the offsetToCurrentEdit field of the + # CurrentUserAtom record identified in step 1. + stream.seek(offset, os.SEEK_SET) + + # Step 3: Read the UserEditAtom record at the current offset. 
+ # Let this record be a live record. + user_edit = UserEditAtom.extract_from(stream, is_encrypted) + if self.newest_user_edit is None: + self.newest_user_edit = user_edit + + log.debug('checking validity') + errs = user_edit.check_validity() + if errs: + log.warning('check_validity found {0} issues' + .format(len(errs))) + for err in errs: + log.warning('UserEditAtom.check_validity: {0}'.format(err)) + if errs and self.fast_fail: + raise errs[0] + + # Step 4: Seek to the offset specified by the + # offsetPersistDirectory field of the UserEditAtom record + # identified in step 3. + log.debug('seeking to pos {0}' + .format(user_edit.offset_persist_directory)) + stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) + + # Step 5: Read the PersistDirectoryAtom record at the current + # offset. Let this record be a live record. + persist_dir_atom = PersistDirectoryAtom.extract_from(stream) + + log.debug('checking validity') + errs = persist_dir_atom.check_validity(offset) + if errs: + log.warning('check_validity found {0} issues' + .format(len(errs))) + for err in errs: + log.warning('PersistDirectoryAtom.check_validity: {0}' + .format(err)) + if errs and self.fast_fail: + raise errs[0] + + + # Construct the complete persist object directory for this file + # as follows: + # - For each PersistDirectoryAtom record previously identified + # in step 5, add the persist object identifier and persist + # object stream offset pairs to the persist object directory + # starting with the PersistDirectoryAtom record last + # identified, that is, the one closest to the beginning of the + # stream. + # - Continue adding these pairs to the persist object directory + # for each PersistDirectoryAtom record in the reverse order + # that they were identified in step 5; that is, the pairs from + # the PersistDirectoryAtom record closest to the end of the + # stream are added last. 
+ # - When adding a new pair to the persist object directory, if + # the persist object identifier already exists in the persist + # object directory, the persist object stream offset from the + # new pair replaces the existing persist object stream offset + # for that persist object identifier. + for entry in persist_dir_atom.rg_persist_dir_entry: + last_id = entry.persist_id+len(entry.rg_persist_offset)-1 + log.debug('for persist IDs {0}-{1}, save offsets {2}' + .format(entry.persist_id, last_id, + entry.rg_persist_offset)) + for count, offset in enumerate(entry.rg_persist_offset): + self.persist_object_directory[entry.persist_id+count] \ + = offset + + # check for more + # Step 6: Seek to the offset specified by the offsetLastEdit + # field in the UserEditAtom record identified in step 3. + offset = user_edit.offset_last_edit + + @with_opened_main_stream + def parse_document_persist_object(self, stream): + """ Part 2: Identify the document persist object """ + if self.document_persist_obj is not None: + log.warning('re-reading and overwriting ' + 'previously read document_persist_object') + + # Step 1: Read the docPersistIdRef field of the UserEditAtom record + # first identified in step 3 of Part 1, that is, the UserEditAtom + # record closest to the end of the stream. + if self.persist_object_directory is None: + self.parse_persist_object_directory() + + # Step 2: Lookup the value of the docPersistIdRef field in the persist + # object directory constructed in step 8 of Part 1 to find the stream + # offset of a persist object. + newest_ref = self.newest_user_edit.doc_persist_id_ref + offset = self.persist_object_directory[newest_ref] + log.debug('newest user edit ID is {0}, offset is {1}' + .format(newest_ref, offset)) + + # Step 3: Seek to the stream offset specified in step 2. + log.debug('seek to {0}'.format(offset)) + stream.seek(offset, os.SEEK_SET) + + # Step 4: Read the DocumentContainer record at the current offset. + # Let this record be a live record. 
+        self.document_persist_obj = DocumentContainer.extract_from(stream)
+
+        log.debug('checking validity')
+        errs = self.document_persist_obj.check_validity()
+        if errs:
+            log.warning('check_validity found {0} issues'.format(len(errs)))
+        for err in errs:
+            log.warning('check_validity(document_persist_obj): {0}'
+                        .format(err))
+        if errs and self.fast_fail:
+            raise errs[0]
+
+    #--------------------------------------------------------------------------
+    # 2nd attempt: do not parse whole structure but search through stream and
+    # yield results as they become available
+    # Keep in mind that after every yield the stream position may be anything!
+
+    @generator_with_opened_main_stream
+    def search_pattern(self, stream, pattern):
+        """ search for pattern in stream, yield indices of all occurrences """
+
+        BUF_SIZE = 1024
+
+        pattern_len = len(pattern)
+        log.debug('pattern length is {0}'.format(pattern_len))
+        if pattern_len > BUF_SIZE:
+            raise ValueError('need buf > pattern to search!')
+
+        n_reads = 0
+        while True:
+            start_pos = stream.tell()
+            n_reads += 1
+            #log.debug('read {0} starting from {1}'
+            #          .format(BUF_SIZE, start_pos))
+            buf = stream.read(BUF_SIZE)
+            idx = buf.find(pattern)
+            while idx != -1:
+                log.debug('found pattern at index {0}'.format(start_pos+idx))
+                yield start_pos + idx
+                idx = buf.find(pattern, idx+1)
+
+            if len(buf) == BUF_SIZE:
+                # overlap buffers by pattern_len-1 bytes so a match spanning
+                # the buffer boundary is not missed; the +1 avoids yielding a
+                # match at the very end of this buffer a second time
+                stream.seek(start_pos + BUF_SIZE - pattern_len + 1,
+                            os.SEEK_SET)
+            else:
+                log.debug('reached end of buf (read {0}<{1}) after {2} reads'
+                          .format(len(buf), BUF_SIZE, n_reads))
+                break
+
+    @generator_with_opened_main_stream
+    def search_vba_info(self, stream):
+        """ search through stream for VBAInfoContainer, alternative to parse...
+
+        quick-and-dirty: do not parse everything, just look for right bytes
+
+        "quick" here means quick to program. Runtime now is linear in document
+        size (--> for big documents the other method might be faster)
+
+        ..
seealso:: search_vba_storage
+        """
+
+        log.debug('looking for VBA info containers')
+
+        pattern = VBAInfoContainer.generate_pattern(
+                      rec_len=VBAInfoContainer.RECORD_LENGTH) \
+                  + VBAInfoAtom.generate_pattern(
+                      rec_len=VBAInfoAtom.RECORD_LENGTH)
+
+        # try parse
+        for idx in self.search_pattern(pattern):
+            # assume that in stream at idx there is a VBAInfoContainer
+            stream.seek(idx)
+            log.debug('extracting at idx {0}'.format(idx))
+            try:
+                container = VBAInfoContainer.extract_from(stream)
+            except Exception:
+                self._log_exception()
+                continue
+
+            errs = container.check_validity()
+            if errs:
+                log.warning('check_validity found {0} issues'
+                            .format(len(errs)))
+            else:
+                log.debug('container is ok')
+                atom = container.vba_info_atom
+                log.debug('persist id ref is {0}, has_macros {1}, version {2}'
+                          .format(atom.persist_id_ref, atom.f_has_macros,
+                                  atom.version))
+                yield container
+            for err in errs:
+                log.warning('check_validity(VBAInfoContainer): {0}'
+                            .format(err))
+            if errs and self.fast_fail:
+                raise errs[0]
+
+    @generator_with_opened_main_stream
+    def search_vba_storage(self, stream):
+        """ search through stream for VBAProjectStg, alternative to parse...
+
+        quick-and-dirty: do not parse everything, just look for right bytes
+
+        "quick" here means quick to program. Runtime now is linear in document
+        size (--> for big documents the other method might be faster)
+
+        The storages found could also contain (instead of VBA data): ActiveX
+        data or general OLE data
+
+        yields results as it finds them
+
+        ..
seealso:: :py:meth:`search_vba_info`
+        """
+
+        log.debug('looking for VBA storage objects')
+        for obj_type in (ExternalObjectStorageUncompressed,
+                         ExternalObjectStorageCompressed):
+            # re-position stream at start
+            stream.seek(0, os.SEEK_SET)
+
+            pattern = obj_type.generate_pattern()
+
+            # try parse
+            for idx in self.search_pattern(pattern):
+                # assume an ExternalObjectStorage in stream at idx
+                stream.seek(idx)
+                log.debug('extracting at idx {0}'.format(idx))
+                try:
+                    storage = obj_type.extract_from(stream)
+                except Exception:
+                    self._log_exception()
+                    continue
+
+                errs = storage.check_validity()
+                if errs:
+                    log.warning('check_validity found {0} issues'
+                                .format(len(errs)))
+                else:
+                    log.debug('storage is ok; compressed={0}, size={1}, '
+                              'size_decomp={2}'
+                              .format(storage.is_compressed,
+                                      storage.rec_head.rec_len,
+                                      storage.uncompressed_size))
+                    yield storage
+                for err in errs:
+                    log.warning('check_validity({0}): {1}'
+                                .format(obj_type.__name__, err))
+                if errs and self.fast_fail:
+                    raise errs[0]
+
+    @with_opened_main_stream
+    def decompress_vba_storage(self, stream, storage):
+        """ return decompressed data from search_vba_storage """
+
+        log.debug('decompressing storage for VBA OLE data stream')
+
+        # decompress iteratively; a zlib.decompress of all data
+        # failed with Error -5 (incomplete or truncated stream)
+        stream.seek(storage.data_offset, os.SEEK_SET)
+        decomp, n_read, err = \
+            iterative_decompress(stream, storage.data_size)
+        log.debug('decompressed {0} to {1} bytes; found err: {2}'
+                  .format(n_read, len(decomp), err))
+        if err and self.fast_fail:
+            raise err
+        # otherwise try to continue with partial data
+
+        return decomp
+
+        ## create OleFileIO from decompressed data
+        #ole = olefile.OleFileIO(decomp)
+        #root_streams = [entry[0].lower() for entry in ole.listdir()]
+        #for required in 'project', 'projectwm', 'vba':
+        #    if required not in root_streams:
+        #        raise ValueError('storage seems to not be a VBA storage '
+        #                         '({0} not found in root
streams)'
+        #                         .format(required))
+        #log.debug('tests succeeded')
+        #return ole
+
+    @with_opened_main_stream
+    def read_vba_storage_data(self, stream, storage):
+        """ return data pointed to by uncompressed storage """
+
+        log.debug('reading uncompressed VBA OLE data stream: '
+                  '{0} bytes starting at {1}'
+                  .format(storage.data_size, storage.data_offset))
+        stream.seek(storage.data_offset, os.SEEK_SET)
+        data = stream.read(storage.data_size)
+        return data
+
+    @generator_with_opened_main_stream
+    def iter_vba_data(self, stream):
+        """ search vba infos and storages, yield uncompressed storage data """
+
+        n_infos = 0
+        n_macros = 0
+        for info in self.search_vba_info():
+            n_infos += 1
+            if info.vba_info_atom.f_has_macros > 0:
+                n_macros += 1
+        # TODO: does it make sense at all to continue if n_macros == 0?
+        # --> no vba-info, so all storages probably ActiveX or other OLE
+        n_storages = 0
+        n_compressed = 0
+        for storage in self.search_vba_storage():
+            n_storages += 1
+            if storage.is_compressed:
+                n_compressed += 1
+                yield self.decompress_vba_storage(storage)
+            else:
+                yield self.read_vba_storage_data(storage)
+
+        log.info('found {0} infos ({1} with macros) and {2} storages '
+                 '({3} compressed)'
+                 .format(n_infos, n_macros, n_storages, n_compressed))
+
+
+def iterative_decompress(stream, size, chunk_size=4096):
+    """ decompress data from stream chunk-wise """
+
+    decompressor = zlib.decompressobj()
+    n_read = 0
+    decomp = b''   # bytes literal, so concatenation also works on Python 3
+    return_err = None
+
+    try:
+        while n_read < size:
+            n_new = min(size-n_read, chunk_size)
+            decomp += decompressor.decompress(stream.read(n_new))
+            n_read += n_new
+    except zlib.error as err:
+        return_err = err
+
+    return decomp, n_read, return_err
+
+
+if __name__ == '__main__':
+    print('nothing here to run!')
diff -Nru remnux-oletools-0.51a/remnux-oletools/pyxswf.py remnux-oletools-0.51a/remnux-oletools/pyxswf.py
--- remnux-oletools-0.51a/remnux-oletools/pyxswf.py 1970-01-01 00:00:00.000000000 +0000
+++
remnux-oletools-0.51a/remnux-oletools/pyxswf.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,150 @@ +#!/usr/bin/env python +""" +pyxswf.py + +pyxswf is a script to detect, extract and analyze Flash objects (SWF) that may +be embedded in files such as MS Office documents (e.g. Word, Excel), +which is especially useful for malware analysis. + +pyxswf is an extension to xxxswf.py published by Alexander Hanel on +http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html +Compared to xxxswf, it can extract streams from MS Office documents by parsing +their OLE structure properly (-o option), which is necessary when streams are +fragmented. +Stream fragmentation is a known obfuscation technique, as explained on +http://www.breakingpointsystems.com/resources/blog/evasion-with-ole2-fragmentation/ + +It can also extract Flash objects from RTF documents, by parsing embedded +objects encoded in hexadecimal format (-f option). + +pyxswf project website: http://www.decalage.info/python/pyxswf + +pyxswf is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +#=== LICENSE ================================================================= + +# pyxswf is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2012-09-17 v0.01 PL: - first version +# 2012-11-09 v0.02 PL: - added RTF embedded objects extraction +# 2014-11-29 v0.03 PL: - use olefile instead of OleFileIO_PL +# - improved usage display with -h +# 2016-09-06 v0.50 PL: - updated to match the rtfobj API +# 2016-10-25 PL: - fixed print for Python 3 +# 2016-11-01 PL: - replaced StringIO by BytesIO for Python 3 + +__version__ = '0.50' + +#------------------------------------------------------------------------------ +# TODO: +# + update xxxswf to latest version +# + add support for LZMA-compressed flash files (ZWS header) +# references: http://blog.malwaretracker.com/2014/01/cve-2013-5331-evaded-av-by-using.html +# http://code.metager.de/source/xref/adobe/flash/crossbridge/tools/swf-info.py +# http://room32.dyndns.org/forums/showthread.php?766-SWFCompression +# sample code: http://room32.dyndns.org/SWFCompression.py +# - check if file is OLE +# - support -r + + +#=== IMPORTS ================================================================= + +import optparse, sys, os, rtfobj +from io import BytesIO +from thirdparty.xxxswf import xxxswf +import 
thirdparty.olefile as olefile + + +#=== MAIN ================================================================= + +def main(): + # print banner with version + print ('pyxswf %s - http://decalage.info/python/oletools' % __version__) + print ('Please report any issue at https://github.com/decalage2/oletools/issues') + print ('') + # Scenarios: + # Scan file for SWF(s) + # Scan file for SWF(s) and extract them + # Scan file for SWF(s) and scan them with Yara + # Scan file for SWF(s), extract them and scan with Yara + # Scan directory recursively for files that contain SWF(s) + # Scan directory recursively for files that contain SWF(s) and extract them + + usage = 'usage: %prog [options] ' + parser = optparse.OptionParser(usage=__doc__ + '\n' + usage) + parser.add_option('-x', '--extract', action='store_true', dest='extract', help='Extracts the embedded SWF(s), names it MD5HASH.swf & saves it in the working dir. No additional args needed') + parser.add_option('-y', '--yara', action='store_true', dest='yara', help='Scans the SWF(s) with yara. If the SWF(s) is compressed it will be deflated. No additional args needed') + parser.add_option('-s', '--md5scan', action='store_true', dest='md5scan', help='Scans the SWF(s) for MD5 signatures. Please see func checkMD5 to define hashes. No additional args needed') + parser.add_option('-H', '--header', action='store_true', dest='header', help='Displays the SWFs file header. No additional args needed') + parser.add_option('-d', '--decompress', action='store_true', dest='decompress', help='Deflates compressed SWF(s)') + parser.add_option('-r', '--recdir', dest='PATH', type='string', help='Will recursively scan a directory for files that contain SWFs. Must provide path in quotes') + parser.add_option('-c', '--compress', action='store_true', dest='compress', help='Compresses the SWF using Zlib') + + parser.add_option('-o', '--ole', action='store_true', dest='ole', help='Parse an OLE file (e.g. 
Word, Excel) to look for SWF in each stream') + parser.add_option('-f', '--rtf', action='store_true', dest='rtf', help='Parse an RTF file to look for SWF in each embedded object') + + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + parser.print_help() + return + + # OLE MODE: + if options.ole: + for filename in args: + ole = olefile.OleFileIO(filename) + for direntry in ole.direntries: + if direntry is not None and direntry.entry_type == olefile.STGTY_STREAM: + f = ole._open(direntry.isectStart, direntry.size) + # check if data contains the SWF magic: FWS or CWS + data = f.getvalue() + if b'FWS' in data or b'CWS' in data: + print('OLE stream: %s' % repr(direntry.name)) + # call xxxswf to scan or extract Flash files: + xxxswf.disneyland(f, direntry.name, options) + f.close() + ole.close() + + # RTF MODE: + elif options.rtf: + for filename in args: + for index, orig_len, data in rtfobj.rtf_iter_objects(filename): + if b'FWS' in data or b'CWS' in data: + print('RTF embedded object size %d at index %08X' % (len(data), index)) + f = BytesIO(data) + name = 'RTF_embedded_object_%08X' % index + # call xxxswf to scan or extract Flash files: + xxxswf.disneyland(f, name, options) + + else: + xxxswf.main() + +if __name__ == '__main__': + main() diff -Nru remnux-oletools-0.51a/remnux-oletools/README.html remnux-oletools-0.51a/remnux-oletools/README.html --- remnux-oletools-0.51a/remnux-oletools/README.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/README.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,93 @@ + + + + + + + + + + +

python-oletools

+

oletools is a package of python tools to analyze Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office documents or Outlook messages, mainly for malware analysis, forensics and debugging. It is based on the olefile parser. See http://www.decalage.info/python/oletools for more info.

+

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the Author - Repository - Updates on Twitter

+

Note: python-oletools is not related to OLETools published by BeCubed Software.

+

News

+
  • 2016-11-01 v0.50: all oletools now support python 2 and 3.
    • olevba: several bugfixes and improvements.
    • mraptor: improved detection, added mraptor_milter for Sendmail/Postfix integration.
    • rtfobj: brand new RTF parser, obfuscation-aware, improved display, detect executable files in OLE Package objects.
    • setup: now creates handy command-line scripts to run oletools from any directory.
  • 2016-06-10 v0.47: olevba added PPT97 macros support, improved handling of malformed/incomplete documents, improved error handling and JSON output, now returns an exit code based on analysis results, new --relaxed option. rtfobj: improved parsing to handle obfuscated RTF documents, added -d option to set output dir. Moved repository and documentation to GitHub.
  • 2016-04-19 v0.46: olevba does not deobfuscate VBA expressions by default (much faster), new option --deobf to enable it. Fixed color display bug on Windows for several tools.
  • 2016-04-12 v0.45: improved rtfobj to handle several anti-analysis tricks, improved olevba to export results in JSON format.
  • 2016-03-11 v0.44: improved olevba to extract and analyse strings from VBA Forms.
  • 2016-03-04 v0.43: added new tool MacroRaptor (mraptor) to detect malicious macros, bugfix and slight improvements in olevba.
  • 2016-02-07 v0.42: added two new tools oledir and olemap, better handling of malformed files and several bugfixes in olevba, improved display for olemeta.
  • 2015-09-22 v0.41: added new --reveal option to olevba, to show the macro code with VBA strings deobfuscated.
  • 2015-09-17 v0.40: Improved macro deobfuscation in olevba, to decode Hex and Base64 within VBA expressions. Display printable deobfuscated strings by default. Improved the VBA_Parser API. Improved performance. Fixed issue #23 with sys.stderr.
  • 2015-06-19 v0.12: olevba can now deobfuscate VBA expressions with any combination of Chr, Asc, Val, StrReverse, Environ, +, &, using a VBA parser built with pyparsing. New options to display only the analysis results or only the macros source code. The analysis is now done on all the VBA modules at once.
  • 2015-05-29 v0.11: Improved parsing of MHTML and ActiveMime/MSO files in olevba, added several suspicious keywords to VBA scanner (thanks to @ozhermit and Davy Douhine for the suggestions)
  • 2015-05-06 v0.10: olevba now supports Word MHTML files with macros, aka "Single File Web Page" (.mht) - see issue #10 for more info
  • 2015-03-23 v0.09: olevba now supports Word 2003 XML files, added anti-sandboxing/VM detection
  • 2015-02-08 v0.08: olevba can now decode strings obfuscated with Hex/StrReverse/Base64/Dridex and extract IOCs. Added new triage mode, support for non-western codepages with olefile 0.42, improved API and display, several bugfixes.
  • 2015-01-05 v0.07: improved olevba to detect suspicious keywords and IOCs in VBA macros, can now scan several files and open password-protected zip archives, added a Python API, upgraded OleFileIO_PL to olefile v0.41
  • 2014-08-28 v0.06: added olevba, a new tool to extract VBA Macro source code from MS Office documents (97-2003 and 2007+). Improved documentation
  • 2013-07-24 v0.05: added new tools olemeta and oletimes
  • 2013-04-18 v0.04: fixed bug in rtfobj, added documentation for rtfobj
  • 2012-11-09 v0.03: Improved pyxswf to extract Flash objects from RTF
  • 2012-10-29 v0.02: Added oleid
  • 2012-10-09 v0.01: Initial version of olebrowse and pyxswf
  • see changelog in source code for more info.
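The olevba deobfuscation mentioned in several entries above (combinations of Chr, StrReverse and & concatenation) can be illustrated with a toy evaluator. This is a simplified sketch of the concept only, not olevba's actual pyparsing-based parser; the function name `deobfuscate` is hypothetical:

```python
import re

def deobfuscate(expr):
    """Toy evaluator for simple VBA obfuscation: Chr(n), StrReverse("s"),
    and string concatenation with & or +. Not olevba's real parser."""
    # replace Chr(110) with the corresponding quoted character, e.g. "n"
    expr = re.sub(r'Chr\((\d+)\)', lambda m: '"%s"' % chr(int(m.group(1))), expr)
    # replace StrReverse("pt") with the reversed literal, e.g. "tp"
    expr = re.sub(r'StrReverse\("([^"]*)"\)', lambda m: '"%s"' % m.group(1)[::-1], expr)
    # the & / + operators just concatenate, so join all string literals
    return ''.join(re.findall(r'"([^"]*)"', expr))

print(deobfuscate('Chr(104) & Chr(116) & StrReverse("pt") & "://"'))  # -> http://
```

olevba's real engine handles many more constructs (Asc, Val, Environ, nested expressions) via a proper grammar, which is why deobfuscation is noticeably slower and gated behind --deobf in later versions.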

Tools in python-oletools:

+
  • olebrowse: A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint documents), to view and extract individual data streams.
  • oleid: to analyze OLE files to detect specific characteristics usually found in malicious files.
  • olemeta: to extract all standard properties (metadata) from OLE files.
  • oletimes: to extract creation and modification timestamps of all streams and storages.
  • oledir: to display all the directory entries of an OLE file, including free and orphaned entries.
  • olemap: to display a map of all the sectors in an OLE file.
  • olevba: to extract and analyze VBA Macro source code from MS Office documents (OLE and OpenXML).
  • MacroRaptor: to detect malicious VBA Macros
  • pyxswf: to detect, extract and analyze Flash objects (SWF) that may be embedded in files such as MS Office documents (e.g. Word, Excel) and RTF, which is especially useful for malware analysis.
  • oleobj: to extract embedded objects from OLE files.
  • rtfobj: to extract embedded objects from RTF files.
  • and a few others (coming soon)
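To illustrate the pyxswf entry above: Flash files are identified by a three-byte signature — FWS (uncompressed), CWS (zlib-compressed) or ZWS (LZMA-compressed). A minimal sketch of the kind of signature scan pyxswf applies to OLE streams and RTF objects (`find_swf_offsets` is a hypothetical helper, not part of the toolkit):

```python
import re

# SWF signatures: FWS = uncompressed, CWS = zlib-compressed, ZWS = LZMA-compressed
SWF_MAGIC = re.compile(b'[FCZ]WS')

def find_swf_offsets(data):
    """Return the offsets of candidate SWF signatures in a byte string."""
    return [m.start() for m in SWF_MAGIC.finditer(data)]

blob = b'\x00\x01FWS\x07...junk...CWS\x09'
print(find_swf_offsets(blob))  # -> [2, 16]
```

In the actual tool, candidate streams are then handed to xxxswf (xxxswf.disneyland) for header validation, scanning and extraction.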

Projects using oletools:

+

oletools are used by a number of projects and online malware analysis services, including Viper, REMnux, Hybrid-analysis.com, Joe Sandbox, Deepviz, Laika BOSS, Cuckoo Sandbox, Anlyz.io, pcodedmp and probably VirusTotal. (Please contact me if you have or know a project using oletools)

+

Download and Install:

+

To use python-oletools from the command line as analysis tools, you may simply download the latest release archive and extract the files into the directory of your choice.

+

You may also download the latest development version with the most recent features.

+

Another possibility is to use a git client to clone the repository (https://github.com/decalage2/oletools.git) into a folder. You can then update it easily in the future.

+

If you plan to use python-oletools with other Python applications or your own scripts, then the simplest solution is to use "pip install oletools" or "easy_install oletools" to download and install in one go. Otherwise you may download/extract the zip archive and run "setup.py install".

+

Important: to update oletools if it is already installed, you must run "pip install -U oletools", otherwise pip will not update it.

+

Documentation:

+

The latest version of the documentation can be found online, otherwise a copy is provided in the doc subfolder of the package.

+

How to Suggest Improvements, Report Issues or Contribute:

+

This is a personal open-source project, developed in my spare time. Any contribution, suggestion, feedback or bug report is welcome.

+

To suggest improvements, report a bug or any issue, please use the issue reporting page, providing all the information and files to reproduce the problem.

+

You may also contact the author directly to provide feedback.

+

The code is available in a GitHub repository. You may use it to submit enhancements using forks and pull requests.

+

License

+

This license applies to the python-oletools package, apart from the thirdparty folder which contains third-party files published with their own license.

+

The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec (http://www.decalage.info)

+

All rights reserved.

+

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

+
  • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+
+

olevba contains modified source code from the officeparser project, published under the following MIT License (MIT):

+

officeparser is copyright (c) 2014 John William Davison

+

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

+

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

+

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ + diff -Nru remnux-oletools-0.51a/remnux-oletools/README.rst remnux-oletools-0.51a/remnux-oletools/README.rst --- remnux-oletools-0.51a/remnux-oletools/README.rst 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/README.rst 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,289 @@ +python-oletools +=============== + +`oletools `__ is a package of +python tools to analyze `Microsoft OLE2 +files `__ +(also called Structured Storage, Compound File Binary Format or Compound +Document File Format), such as Microsoft Office documents or Outlook +messages, mainly for malware analysis, forensics and debugging. It is +based on the `olefile `__ parser. See +http://www.decalage.info/python/oletools for more info. + +**Quick links:** `Home +page `__ - +`Download/Install `__ +- `Documentation `__ - +`Report +Issues/Suggestions/Questions `__ +- `Contact the Author `__ - +`Repository `__ - `Updates on +Twitter `__ + +Note: python-oletools is not related to OLETools published by BeCubed +Software. + +News +---- + +- **2016-11-01 v0.50**: all oletools now support python 2 and 3. + + - olevba: several bugfixes and improvements. + - mraptor: improved detection, added mraptor\_milter for + Sendmail/Postfix integration. + - rtfobj: brand new RTF parser, obfuscation-aware, improved display, + detect executable files in OLE Package objects. + - setup: now creates handy command-line scripts to run oletools from + any directory. + +- 2016-06-10 v0.47: + `olevba `__ added + PPT97 macros support, improved handling of malformed/incomplete + documents, improved error handling and JSON output, now returns an + exit code based on analysis results, new --relaxed option. + `rtfobj `__: + improved parsing to handle obfuscated RTF documents, added -d option + to set output dir. Moved repository and documentation to GitHub. +- 2016-04-19 v0.46: + `olevba `__ does + not deobfuscate VBA expressions by default (much faster), new option + --deobf to enable it. 
Fixed color display bug on Windows for several + tools. +- 2016-04-12 v0.45: improved + `rtfobj `__ to + handle several `anti-analysis + tricks `__, improved + `olevba `__ to + export results in JSON format. +- 2016-03-11 v0.44: improved + `olevba `__ to + extract and analyse strings from VBA Forms. +- 2016-03-04 v0.43: added new tool + `MacroRaptor `__ + (mraptor) to detect malicious macros, bugfix and slight improvements + in `olevba `__. +- 2016-02-07 v0.42: added two new tools oledir and olemap, better + handling of malformed files and several bugfixes in + `olevba `__, + improved display for + `olemeta `__. +- 2015-09-22 v0.41: added new --reveal option to + `olevba `__, to + show the macro code with VBA strings deobfuscated. +- 2015-09-17 v0.40: Improved macro deobfuscation in + `olevba `__, to + decode Hex and Base64 within VBA expressions. Display printable + deobfuscated strings by default. Improved the VBA\_Parser API. + Improved performance. Fixed `issue + #23 `__ with + sys.stderr. +- 2015-06-19 v0.12: + `olevba `__ can + now deobfuscate VBA expressions with any combination of Chr, Asc, + Val, StrReverse, Environ, +, &, using a VBA parser built with + `pyparsing `__. New options to + display only the analysis results or only the macros source code. The + analysis is now done on all the VBA modules at once. +- 2015-05-29 v0.11: Improved parsing of MHTML and ActiveMime/MSO files + in `olevba `__, + added several suspicious keywords to VBA scanner (thanks to @ozhermit + and Davy Douhine for the suggestions) +- 2015-05-06 v0.10: + `olevba `__ now + supports Word MHTML files with macros, aka "Single File Web Page" + (.mht) - see `issue + #10 `__ for more + info +- 2015-03-23 v0.09: + `olevba `__ now + supports Word 2003 XML files, added anti-sandboxing/VM detection +- 2015-02-08 v0.08: + `olevba `__ can + now decode strings obfuscated with Hex/StrReverse/Base64/Dridex and + extract IOCs. 
Added new triage mode, support for non-western + codepages with olefile 0.42, improved API and display, several + bugfixes. +- 2015-01-05 v0.07: improved + `olevba `__ to + detect suspicious keywords and IOCs in VBA macros, can now scan + several files and open password-protected zip archives, added a + Python API, upgraded OleFileIO\_PL to olefile v0.41 +- 2014-08-28 v0.06: added + `olevba `__, a new + tool to extract VBA Macro source code from MS Office documents + (97-2003 and 2007+). Improved + `documentation `__ +- 2013-07-24 v0.05: added new tools + `olemeta `__ and + `oletimes `__ +- 2013-04-18 v0.04: fixed bug in rtfobj, added documentation for + `rtfobj `__ +- 2012-11-09 v0.03: Improved + `pyxswf `__ to + extract Flash objects from RTF +- 2012-10-29 v0.02: Added + `oleid `__ +- 2012-10-09 v0.01: Initial version of + `olebrowse `__ + and pyxswf +- see changelog in source code for more info. + +Tools in python-oletools: +------------------------- + +- `olebrowse `__: + A simple GUI to browse OLE files (e.g. MS Word, Excel, Powerpoint + documents), to view and extract individual data streams. +- `oleid `__: to + analyze OLE files to detect specific characteristics usually found in + malicious files. +- `olemeta `__: to + extract all standard properties (metadata) from OLE files. +- `oletimes `__: + to extract creation and modification timestamps of all streams and + storages. +- `oledir `__: to + display all the directory entries of an OLE file, including free and + orphaned entries. +- `olemap `__: to + display a map of all the sectors in an OLE file. +- `olevba `__: to + extract and analyze VBA Macro source code from MS Office documents + (OLE and OpenXML). +- `MacroRaptor `__: + to detect malicious VBA Macros +- `pyxswf `__: to + detect, extract and analyze Flash objects (SWF) that may be embedded + in files such as MS Office documents (e.g. Word, Excel) and RTF, + which is especially useful for malware analysis. 
+- `oleobj `__: to + extract embedded objects from OLE files. +- `rtfobj `__: to + extract embedded objects from RTF files. +- and a few others (coming soon) + +Projects using oletools: +------------------------ + +oletools are used by a number of projects and online malware analysis +services, including `Viper `__, +`REMnux `__, +`Hybrid-analysis.com `__, `Joe +Sandbox `__, +`Deepviz `__, `Laika +BOSS `__, `Cuckoo +Sandbox `__, +`Anlyz.io `__, +`pcodedmp `__ and probably +`VirusTotal `__. (Please `contact +me <http://decalage.info/contact>`__ if you have or know a project +using oletools) + +Download and Install: +--------------------- + +To use python-oletools from the command line as analysis tools, you may +simply `download the latest release +archive `__ and extract +the files into the directory of your choice. + +You may also download the `latest development +version `__ +with the most recent features. + +Another possibility is to use a git client to clone the repository +(https://github.com/decalage2/oletools.git) into a folder. You can then +update it easily in the future. + +If you plan to use python-oletools with other Python applications or +your own scripts, then the simplest solution is to use "**pip install +oletools**\ " or "**easy\_install oletools**\ " to download and install +in one go. Otherwise you may download/extract the zip archive and run +"**setup.py install**\ ". + +**Important: to update oletools** if it is already installed, you must +run **"pip install -U oletools"**, otherwise pip will not update it. + +Documentation: +-------------- + +The latest version of the documentation can be found +`online `__, otherwise a +copy is provided in the doc subfolder of the package. + +How to Suggest Improvements, Report Issues or Contribute: +--------------------------------------------------------- + +This is a personal open-source project, developed in my spare time. Any +contribution, suggestion, feedback or bug report is welcome. 
+ +To suggest improvements, report a bug or any issue, please use the +`issue reporting page `__, +providing all the information and files to reproduce the problem. + +You may also `contact the author `__ +directly to provide feedback. + +The code is available in `a GitHub +repository `__. You may use it to +submit enhancements using forks and pull requests. + +License +------- + +This license applies to the python-oletools package, apart from the +thirdparty folder which contains third-party files published with their +own license. + +The python-oletools package is copyright (c) 2012-2016 Philippe Lagadec +(http://www.decalage.info) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-------------- + +olevba contains modified source code from the officeparser project, +published under the following MIT License (MIT): + +officeparser is copyright (c) 2014 John William Davison + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff -Nru remnux-oletools-0.51a/remnux-oletools/rtfobj.py remnux-oletools-0.51a/remnux-oletools/rtfobj.py --- remnux-oletools-0.51a/remnux-oletools/rtfobj.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/rtfobj.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,806 @@ +#!/usr/bin/env python +from __future__ import print_function + +""" +rtfobj.py + +rtfobj is a Python module to extract embedded objects from RTF files, such as +OLE objects. It can be used as a Python library or a command-line tool. 
+ +Usage: rtfobj.py + +rtfobj project website: http://www.decalage.info/python/rtfobj + +rtfobj is part of the python-oletools package: +http://www.decalage.info/python/oletools +""" + +#=== LICENSE ================================================================= + +# rtfobj is copyright (c) 2012-2016, Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
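A key difficulty rtfobj handles, per the comments in this module: MS Word accepts hex-encoded object data even when whitespace and RTF control tags are interleaved between the hex digits. A simplified standalone sketch of the recovery idea (this toy version strips only one level of tags; the module's real patterns below also handle nested and escaped tags):

```python
import re
import binascii

def deobfuscate_hex(rtf_bytes):
    """Recover hex-encoded object data despite interleaved junk (toy version)."""
    # drop one level of {...} groups, e.g. {\*\objdata junk}
    cleaned = re.sub(rb'[{][^{}]*[}]', b'', rtf_bytes)
    # drop control words such as \tag1 or \bin0
    cleaned = re.sub(rb'\\[a-zA-Z]+-?\d*', b'', cleaned)
    # drop all whitespace between the hex digits
    cleaned = re.sub(rb'\s+', b'', cleaned)
    return binascii.unhexlify(cleaned)

sample = rb'4d {\*\objdata junk} 5a\tag1 90 00'
print(deobfuscate_hex(sample))  # -> b'MZ\x90\x00' (an MZ executable header)
```

The real parser below builds a single regex (PATTERN) that matches runs of hex digits with such junk in between, rather than cleaning the whole input first.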
+ + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2012-11-09 v0.01 PL: - first version +# 2013-04-02 v0.02 PL: - fixed bug in main +# 2015-12-09 v0.03 PL: - configurable logging, CLI options +# - extract OLE 1.0 objects +# - extract files from OLE Package objects +# 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr +# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks +# 2016-05-06 v0.47 TJ: - added option -d to set the output directory +# (contribution by Thomas Jarosch) +# TJ: - sanitize filenames to avoid special characters +# 2016-05-29 PL: - improved parsing, fixed issue #42 +# 2016-07-13 v0.50 PL: - new RtfParser and RtfObjParser classes +# 2016-07-18 SL: - added Python 3.5 support +# 2016-07-19 PL: - fixed Python 2.6-2.7 support +# 2016-07-30 PL: - new API with class RtfObject +# - backward-compatible API rtf_iter_objects (fixed issue #70) +# 2016-07-31 PL: - table output with tablestream +# 2016-08-01 PL: - detect executable filenames in OLE Package +# 2016-08-08 PL: - added option -s to save objects to files +# 2016-08-09 PL: - fixed issue #78, improved regex +# 2016-09-06 PL: - fixed issue #83, backward compatible API + +__version__ = '0.50' + +# ------------------------------------------------------------------------------ +# TODO: +# - allow semicolon within hex, as found in this sample: +# http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html +# TODO: use OleObject and OleNativeStream in RtfObject instead of copying each attribute +# TODO: option -e to extract an object, -e all for all objects +# TODO: option to choose which destinations to include (objdata by default) +# TODO: option to display SHA256 or MD5 hashes of objects in table + + +# === IMPORTS ================================================================= + +import re, os, sys, binascii, logging, optparse +import os.path + +from thirdparty.xglob import xglob +from 
oleobj import OleObject, OleNativeStream +import oleobj + +from thirdparty.tablestream import tablestream + + +# === LOGGING ================================================================= + +class NullHandler(logging.Handler): + """ + Log Handler without output, to avoid printing messages if logging is not + configured by the main application. + Python 2.7 has logging.NullHandler, but this is necessary for 2.6: + see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library + """ + def emit(self, record): + pass + +def get_logger(name, level=logging.CRITICAL+1): + """ + Create a suitable logger object for this module. + The goal is not to change settings of the root logger, to avoid getting + other modules' logs on the screen. + If a logger exists with same name, reuse it. (Else it would have duplicate + handlers and messages would be doubled.) + The level is set to CRITICAL+1 by default, to avoid any logging. + """ + # First, test if there is already a logger with the same name, else it + # will generate duplicate messages (due to duplicate handlers): + if name in logging.Logger.manager.loggerDict: + #NOTE: another less intrusive but more "hackish" solution would be to + # use getLogger then test if its effective level is not default. 
+ logger = logging.getLogger(name) + # make sure level is OK: + logger.setLevel(level) + return logger + # get a new logger: + logger = logging.getLogger(name) + # only add a NullHandler for this logger, it is up to the application + # to configure its own logging: + logger.addHandler(NullHandler()) + logger.setLevel(level) + return logger + +# a global logger object used for debugging: +log = get_logger('rtfobj') + + +#=== CONSTANTS================================================================= + +# REGEX pattern to extract embedded OLE objects in hexadecimal format: + +# alphanum digit: [0-9A-Fa-f] +HEX_DIGIT = b'[0-9A-Fa-f]' + +# hex char = two alphanum digits: [0-9A-Fa-f]{2} +# HEX_CHAR = r'[0-9A-Fa-f]{2}' +# in fact MS Word allows whitespaces in between the hex digits! +# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]' +# Even worse, MS Word also allows ANY RTF-style tag {*} in between!! +# AND the tags can be nested... +#SINGLE_RTF_TAG = r'[{][^{}]*[}]' +# Actually RTF tags may contain braces escaped with backslash (\{ \}): +SINGLE_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\])*[}]' + +# Nested tags, two levels (because Python's re does not support nested matching): +# NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]' +NESTED_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\]|'+SINGLE_RTF_TAG+b')*[}]' + +# AND it is also allowed to insert ANY control word or control symbol (ignored) +# According to Rich Text Format (RTF) Specification Version 1.9.1, +# section "Control Word": +# control word = \ +# delimiter = space, OR signed integer followed by any non-digit, +# OR any character except letter and digit +# examples of valid control words: +# "\AnyThing " "\AnyThing123z" "\AnyThing-456{" "\AnyThing{" +# control symbol = \ (followed by anything) + +ASCII_NAME = b'([a-zA-Z]{1,250})' + +# using Python's re lookahead assertion: +# (?=...) Matches if ... matches next, but doesn't consume any of the string. +# This is called a lookahead assertion. 
For example, Isaac (?=Asimov) will +# match 'Isaac ' only if it's followed by 'Asimov'. + +# TODO: Find the actual limit on the number of digits for Word +# SIGNED_INTEGER = r'(-?\d{1,250})' +SIGNED_INTEGER = b'(-?\\d+)' + +# Note for issue #78: need to match "\A-" not followed by digits +CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])))' + +re_control_word = re.compile(CONTROL_WORD) + +# Note for issue #78: need to match "\" followed by digit (any non-alpha) +CONTROL_SYMBOL = b'(?:\\\\[^a-zA-Z])' +re_control_symbol = re.compile(CONTROL_SYMBOL) + +# Text that is not a control word/symbol or a group: +TEXT = b'[^{}\\\\]+' +re_text = re.compile(TEXT) + +# ignored whitespaces and tags within a hex block: +IGNORED = b'(?:\\s|'+NESTED_RTF_TAG+b'|'+CONTROL_SYMBOL+b'|'+CONTROL_WORD+b')*' +#IGNORED = r'\s*' + +# HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT + +# several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,} +# + word boundaries +# HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b' +# at least 1 hex char: +# HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+' +# at least 1 hex char, followed by whitespace or CR/LF: +# HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+' +# + word boundaries around hex block +# HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*' +# at least one block of hex and whitespace chars, followed by closing curly bracket: +# HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}' +# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE + +#TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b' +# PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b' +PATTERN = b'\\b(?:' + HEX_DIGIT + IGNORED + b'){7,}' + HEX_DIGIT + b'\\b' + +# at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s* +# PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' +# improved pattern, allowing semicolons within 
hex: +#PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}' + +re_hexblock = re.compile(PATTERN) +re_embedded_tags = re.compile(IGNORED) +re_decimal = re.compile(b'\\d+') + +re_delimiter = re.compile(b'[ \\t\\r\\n\\f\\v]') + +DELIMITER = b'[ \\t\\r\\n\\f\\v]' +DELIMITERS_ZeroOrMore = b'[ \\t\\r\\n\\f\\v]*' +BACKSLASH_BIN = b'\\\\bin' +# According to my tests, Word accepts up to 250 digits (leading zeroes) +DECIMAL_GROUP = b'(\d{1,250})' + +re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN + + DECIMAL_GROUP + DELIMITER) +re_delim_hexblock = re.compile(DELIMITER + PATTERN) + +# TODO: use a frozenset instead of a regex? +re_executable_extensions = re.compile( + r"(?i)\.(EXE|COM|PIF|GADGET|MSI|MSP|MSC|VBS|VBE|VB|JSE|JS|WSF|WSC|WSH|WS|BAT|CMD|DLL|SCR|HTA|CPL|CLASS|JAR|PS1XML|PS1|PS2XML|PS2|PSC1|PSC2|SCF|LNK|INF|REG)\b") + +# Destination Control Words, according to MS RTF Specifications v1.9.1: +DESTINATION_CONTROL_WORDS = frozenset(( + b"aftncn", b"aftnsep", b"aftnsepc", b"annotation", b"atnauthor", b"atndate", b"atnicn", b"atnid", b"atnparent", b"atnref", + b"atntime", b"atrfend", b"atrfstart", b"author", b"background", b"bkmkend", b"bkmkstart", b"blipuid", b"buptim", b"category", + b"colorschememapping", b"colortbl", b"comment", b"company", b"creatim", b"datafield", b"datastore", b"defchp", b"defpap", + b"do", b"doccomm", b"docvar", b"dptxbxtext", b"ebcend", b"ebcstart", b"factoidname", b"falt", b"fchars", b"ffdeftext", + b"ffentrymcr", b"ffexitmcr", b"ffformat", b"ffhelptext", b"ffl", b"ffname",b"ffstattext", b"field", b"file", b"filetbl", + b"fldinst", b"fldrslt", b"fldtype", b"fname", b"fontemb", b"fontfile", b"fonttbl", b"footer", b"footerf", b"footerl", + b"footerr", b"footnote", b"formfield", b"ftncn", b"ftnsep", b"ftnsepc", b"g", b"generator", b"gridtbl", b"header", b"headerf", + b"headerl", b"headerr", b"hl", b"hlfr", b"hlinkbase", b"hlloc", b"hlsrc", b"hsv", b"htmltag", b"info", b"keycode", b"keywords", + b"latentstyles", 
b"lchars", b"levelnumbers", b"leveltext", b"lfolevel", b"linkval", b"list", b"listlevel", b"listname", + b"listoverride", b"listoverridetable", b"listpicture", b"liststylename", b"listtable", b"listtext", b"lsdlockedexcept", + b"macc", b"maccPr", b"mailmerge", b"maln",b"malnScr", b"manager", b"margPr", b"mbar", b"mbarPr", b"mbaseJc", b"mbegChr", + b"mborderBox", b"mborderBoxPr", b"mbox", b"mboxPr", b"mchr", b"mcount", b"mctrlPr", b"md", b"mdeg", b"mdegHide", b"mden", + b"mdiff", b"mdPr", b"me", b"mendChr", b"meqArr", b"meqArrPr", b"mf", b"mfName", b"mfPr", b"mfunc", b"mfuncPr",b"mgroupChr", + b"mgroupChrPr",b"mgrow", b"mhideBot", b"mhideLeft", b"mhideRight", b"mhideTop", b"mhtmltag", b"mlim", b"mlimloc", b"mlimlow", + b"mlimlowPr", b"mlimupp", b"mlimuppPr", b"mm", b"mmaddfieldname", b"mmath", b"mmathPict", b"mmathPr",b"mmaxdist", b"mmc", + b"mmcJc", b"mmconnectstr", b"mmconnectstrdata", b"mmcPr", b"mmcs", b"mmdatasource", b"mmheadersource", b"mmmailsubject", + b"mmodso", b"mmodsofilter", b"mmodsofldmpdata", b"mmodsomappedname", b"mmodsoname", b"mmodsorecipdata", b"mmodsosort", + b"mmodsosrc", b"mmodsotable", b"mmodsoudl", b"mmodsoudldata", b"mmodsouniquetag", b"mmPr", b"mmquery", b"mmr", b"mnary", + b"mnaryPr", b"mnoBreak", b"mnum", b"mobjDist", b"moMath", b"moMathPara", b"moMathParaPr", b"mopEmu", b"mphant", b"mphantPr", + b"mplcHide", b"mpos", b"mr", b"mrad", b"mradPr", b"mrPr", b"msepChr", b"mshow", b"mshp", b"msPre", b"msPrePr", b"msSub", + b"msSubPr", b"msSubSup", b"msSubSupPr", b"msSup", b"msSupPr", b"mstrikeBLTR", b"mstrikeH", b"mstrikeTLBR", b"mstrikeV", + b"msub", b"msubHide", b"msup", b"msupHide", b"mtransp", b"mtype", b"mvertJc", b"mvfmf", b"mvfml", b"mvtof", b"mvtol", + b"mzeroAsc", b"mzeroDesc", b"mzeroWid", b"nesttableprops", b"nexctfile", b"nonesttables", b"objalias", b"objclass", + b"objdata", b"object", b"objname", b"objsect", b"objtime", b"oldcprops", b"oldpprops", b"oldsprops", b"oldtprops", + b"oleclsid", b"operator", b"panose", b"password", 
b"passwordhash", b"pgp", b"pgptbl", b"picprop", b"pict", b"pn", b"pnseclvl", + b"pntext", b"pntxta", b"pntxtb", b"printim", b"private", b"propname", b"protend", b"protstart", b"protusertbl", b"pxe", + b"result", b"revtbl", b"revtim", b"rsidtbl", b"rtf", b"rxe", b"shp", b"shpgrp", b"shpinst", b"shppict", b"shprslt", b"shptxt", + b"sn", b"sp", b"staticval", b"stylesheet", b"subject", b"sv", b"svb", b"tc", b"template", b"themedata", b"title", b"txe", b"ud", + b"upr", b"userprops", b"wgrffmtfilter", b"windowcaption", b"writereservation", b"writereservhash", b"xe", b"xform", + b"xmlattrname", b"xmlattrvalue", b"xmlclose", b"xmlname", b"xmlnstbl", b"xmlopen" + )) + + +# some str methods on Python 2.x return characters, +# while the equivalent bytes methods return integers on Python 3.x: +if sys.version_info[0] <= 2: + # Python 2.x - Characters (str) + BACKSLASH = '\\' + BRACE_OPEN = '{' + BRACE_CLOSE = '}' +else: + # Python 3.x - Integers + BACKSLASH = ord('\\') + BRACE_OPEN = ord('{') + BRACE_CLOSE = ord('}') + + +#=== CLASSES ================================================================= + +class Destination(object): + """ + Stores the data associated with a destination control word + """ + def __init__(self, cword=None): + self.cword = cword + self.data = b'' + self.start = None + self.end = None + self.group_level = 0 + + +# class Group(object): +# """ +# Stores the data associated with a group between braces {...} +# """ +# def __init__(self, cword=None): +# self.start = None +# self.end = None +# self.level = None + + + +class RtfParser(object): + """ + Very simple generic RTF parser + """ + + def __init__(self, data): + self.data = data + self.index = 0 + self.size = len(data) + self.group_level = 0 + # default destination for the document text: + document_destination = Destination() + self.destinations = [document_destination] + self.current_destination = document_destination + + def parse(self): + self.index = 0 + while self.index < self.size: + if 
self.data[self.index] == BRACE_OPEN: + self._open_group() + self.index += 1 + continue + if self.data[self.index] == BRACE_CLOSE: + self._close_group() + self.index += 1 + continue + if self.data[self.index] == BACKSLASH: + m = re_control_word.match(self.data, self.index) + if m: + cword = m.group(1) + param = None + if len(m.groups()) > 1: + param = m.group(2) + # log.debug('control word %r at index %Xh - cword=%r param=%r' % (m.group(), self.index, cword, param)) + self._control_word(m, cword, param) + self.index += len(m.group()) + # if it's \bin, call _bin after updating index + if cword == b'bin': + self._bin(m, param) + continue + m = re_control_symbol.match(self.data, self.index) + if m: + self.control_symbol(m) + self.index += len(m.group()) + continue + m = re_text.match(self.data, self.index) + if m: + self._text(m) + self.index += len(m.group()) + continue + raise RuntimeError('Should not have reached this point - index=%Xh' % self.index) + # use _end_of_file (not the bare user hook) so that any group or + # destination still open at the end of the data gets closed: + self._end_of_file() + + + def _open_group(self): + self.group_level += 1 + #log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level)) + # call user method AFTER increasing the level: + self.open_group() + + def open_group(self): + #log.debug('open group at index %Xh' % self.index) + pass + + def _close_group(self): + #log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level)) + # call user method BEFORE decreasing the level: + self.close_group() + # if the destination level is the same as the group level, close the destination: + if self.group_level == self.current_destination.group_level: + # log.debug('Current Destination %r level = %d => Close Destination' % ( + # self.current_destination.cword, self.current_destination.group_level)) + self._close_destination() + else: + # log.debug('Current Destination %r level = %d => Continue with same Destination' % ( + # self.current_destination.cword, self.current_destination.group_level)) + pass + self.group_level -= 1 + # 
log.debug('Decreased group level to %d' % self.group_level) + + def close_group(self): + #log.debug('close group at index %Xh' % self.index) + pass + + def _open_destination(self, matchobject, cword): + # if the current destination is at the same group level, close it first: + if self.current_destination.group_level == self.group_level: + self._close_destination() + new_dest = Destination(cword) + new_dest.group_level = self.group_level + self.destinations.append(new_dest) + self.current_destination = new_dest + # start of the destination is right after the control word: + new_dest.start = self.index + len(matchobject.group()) + # log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level)) + # call the corresponding user method for additional processing: + self.open_destination(self.current_destination) + + def open_destination(self, destination): + pass + + def _close_destination(self): + # log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword, + # self.index, self.current_destination.group_level)) + self.current_destination.end = self.index + # call the corresponding user method for additional processing: + self.close_destination(self.current_destination) + if len(self.destinations)>0: + # remove the current destination from the stack, and go back to the previous one: + self.destinations.pop() + if len(self.destinations) > 0: + self.current_destination = self.destinations[-1] + else: + # log.debug('All destinations are closed, keeping the document destination open') + pass + + def close_destination(self, destination): + pass + + def _control_word(self, matchobject, cword, param): + #log.debug('control word %r at index %Xh' % (matchobject.group(), self.index)) + if cword in DESTINATION_CONTROL_WORDS: + # log.debug('%r is a destination control word: starting a new destination' % cword) + self._open_destination(matchobject, cword) + # call the corresponding user method for additional 
processing: + self.control_word(matchobject, cword, param) + + def control_word(self, matchobject, cword, param): + pass + + def control_symbol(self, matchobject): + #log.debug('control symbol %r at index %Xh' % (matchobject.group(), self.index)) + pass + + def _text(self, matchobject): + text = matchobject.group() + self.current_destination.data += text + self.text(matchobject, text) + + def text(self, matchobject, text): + #log.debug('text %r at index %Xh' % (matchobject.group(), self.index)) + pass + + def _bin(self, matchobject, param): + binlen = int(param) + log.debug('\\bin: reading %d bytes of binary data' % binlen) + # TODO: handle optional space? + # TODO: handle negative length, and length greater than data + bindata = self.data[self.index:self.index + binlen] + self.index += binlen + self.bin(bindata) + + def bin(self, bindata): + pass + + def _end_of_file(self): + # log.debug('%Xh Reached End of File') + # close any group/destination that is still open: + while self.group_level > 0: + # log.debug('Group Level = %d, closing group' % self.group_level) + self._close_group() + self.end_of_file() + + def end_of_file(self): + pass + + +class RtfObject(object): + """ + An object or a file (OLE Package) embedded into an RTF document + """ + def __init__(self): + """ + RtfObject constructor + """ + # start and end index in the RTF file: + self.start = None + self.end = None + # raw object data encoded in hexadecimal, as found in the RTF file: + self.hexdata = None + # raw object data in binary form, decoded from hexadecimal + self.rawdata = None + # OLE object data (extracted from rawdata) + self.is_ole = False + self.oledata = None + self.format_id = None + self.class_name = None + self.oledata_size = None + # OLE Package data (extracted from oledata) + self.is_package = False + self.olepkgdata = None + self.filename = None + self.src_path = None + self.temp_path = None + + + +class RtfObjParser(RtfParser): + """ + Specialized RTF parser to extract OLE objects 
+ """ + + def __init__(self, data): + super(RtfObjParser, self).__init__(data) + # list of RtfObjects found + self.objects = [] + + def open_destination(self, destination): + if destination.cword == b'objdata': + log.debug('*** Start object data at index %Xh' % destination.start) + + def close_destination(self, destination): + if destination.cword == b'objdata': + log.debug('*** Close object data at index %Xh' % self.index) + rtfobj = RtfObject() + self.objects.append(rtfobj) + rtfobj.start = destination.start + rtfobj.end = destination.end + # Filter out all whitespaces first (just ignored): + hexdata1 = destination.data.translate(None, b' \t\r\n\f\v') + # Then filter out any other non-hex character: + hexdata = re.sub(b'[^a-fA-F0-9]', b'', hexdata1) + if len(hexdata) < len(hexdata1): + # this is only for debugging: + nonhex = re.sub(b'[a-fA-F0-9]', b'', hexdata1) + log.debug('Found non-hex chars in hexdata: %r' % nonhex) + # MS Word accepts an extra hex digit, so we need to trim it if present: + if len(hexdata) & 1: + log.debug('Odd length, trimmed last byte.') + hexdata = hexdata[:-1] + rtfobj.hexdata = hexdata + object_data = binascii.unhexlify(hexdata) + rtfobj.rawdata = object_data + # TODO: check if all hex data is extracted properly + + obj = OleObject() + try: + obj.parse(object_data) + rtfobj.format_id = obj.format_id + rtfobj.class_name = obj.class_name + rtfobj.oledata_size = obj.data_size + rtfobj.oledata = obj.data + rtfobj.is_ole = True + # class_name is bytes, so compare against a bytes literal: + if obj.class_name.lower() == b'package': + opkg = OleNativeStream(bindata=obj.data) + rtfobj.filename = opkg.filename + rtfobj.src_path = opkg.src_path + rtfobj.temp_path = opkg.temp_path + rtfobj.olepkgdata = opkg.data + rtfobj.is_package = True + except: + log.debug('*** Not an OLE 1.0 Object') + + def bin(self, bindata): + if self.current_destination.cword == b'objdata': + # TODO: keep track of this, because it is unusual and indicates potential obfuscation + # trick: hexlify binary data, add it to hex data 
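The whitespace stripping, non-hex filtering and odd-digit trimming that close_destination() applies to an \objdata payload can be sketched as a standalone helper. This is a simplified illustration using only the standard library; decode_objdata_hex is a hypothetical name, not part of rtfobj:

```python
import binascii
import re

def decode_objdata_hex(data):
    # Hypothetical standalone sketch of the normalization performed by
    # RtfObjParser.close_destination() on a \*\objdata payload.
    # 1) whitespace between hex digits is ignored by MS Word:
    hexdata = data.translate(None, b' \t\r\n\f\v')
    # 2) drop any remaining non-hex character:
    hexdata = re.sub(b'[^a-fA-F0-9]', b'', hexdata)
    # 3) MS Word tolerates one extra trailing hex digit; trim it:
    if len(hexdata) & 1:
        hexdata = hexdata[:-1]
    return binascii.unhexlify(hexdata)

# First dwords of a decoded OLE 1.0 object: OLEVersion 00000501,
# then FormatID (2 = embedded object), which OleObject.parse() reads:
print(decode_objdata_hex(b'01 05 00 00\r\n02 00 00 00 0'))
```

The decoded bytes are what the parser then hands to OleObject.parse() to recover format_id and class_name.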
+ self.current_destination.data += binascii.hexlify(bindata) + + def control_word(self, matchobject, cword, param): + # TODO: extract useful cwords such as objclass + # TODO: keep track of cwords inside objdata, because it is unusual and indicates potential obfuscation + # TODO: same with control symbols, and opening bracket + pass + + +#=== FUNCTIONS =============================================================== + +def rtf_iter_objects(filename, min_size=32): + """ + [DEPRECATED] Backward-compatible API, for applications using the old rtfobj: + Open a RTF file, extract each embedded object encoded in hexadecimal of + size > min_size, yield the index of the object in the RTF file, the original + length in the RTF file, and the decoded object data in binary format. + This is an iterator. + + :param filename: str, RTF file name/path to open on disk + :param min_size: ignored, kept for backward compatibility + :returns: iterator, yielding tuples (start index, original length, binary data) + """ + data = open(filename, 'rb').read() + rtfp = RtfObjParser(data) + rtfp.parse() + for obj in rtfp.objects: + orig_len = obj.end - obj.start + yield obj.start, orig_len, obj.rawdata + + + + + +def sanitize_filename(filename, replacement='_', max_length=200): + """compute basename of filename. Replaces all non-whitelisted characters. + The returned filename is always a basename of the file.""" + basepath = os.path.basename(filename).strip() + sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath) + + while ".." 
in sane_fname: + sane_fname = sane_fname.replace('..', '.') + + while "  " in sane_fname: + sane_fname = sane_fname.replace('  ', ' ') + + if not len(sane_fname): + sane_fname = 'NONAME' + + # limit filename length + if max_length: + sane_fname = sane_fname[:max_length] + + return sane_fname + + +def process_file(container, filename, data, output_dir=None, save_object=False): + if output_dir: + if not os.path.isdir(output_dir): + log.info('creating output directory %s' % output_dir) + os.mkdir(output_dir) + + fname_prefix = os.path.join(output_dir, + sanitize_filename(filename)) + else: + base_dir = os.path.dirname(filename) + sane_fname = sanitize_filename(filename) + fname_prefix = os.path.join(base_dir, sane_fname) + + # TODO: option to extract objects to files (false by default) + if data is None: + data = open(filename, 'rb').read() + print('='*79) + print('File: %r - size: %d bytes' % (filename, len(data))) + tstream = tablestream.TableStream( + column_width=(3, 10, 31, 31), + header_row=('id', 'index', 'OLE Object', 'OLE Package'), + style=tablestream.TableStyleSlim + ) + rtfp = RtfObjParser(data) + rtfp.parse() + for rtfobj in rtfp.objects: + pkg_color = None + if rtfobj.is_ole: + ole_column = 'format_id: %d\n' % rtfobj.format_id + ole_column += 'class name: %r\n' % rtfobj.class_name + ole_column += 'data size: %d' % rtfobj.oledata_size + if rtfobj.is_package: + pkg_column = 'Filename: %r\n' % rtfobj.filename + pkg_column += 'Source path: %r\n' % rtfobj.src_path + pkg_column += 'Temp path: %r' % rtfobj.temp_path + pkg_color = 'yellow' + # check if the file extension is executable: + _, ext = os.path.splitext(rtfobj.filename) + log.debug('File extension: %r' % ext) + if re_executable_extensions.match(ext): + pkg_color = 'red' + pkg_column += '\nEXECUTABLE FILE' + else: + pkg_column = 'Not an OLE Package' + else: + pkg_column = '' + ole_column = 'Not a well-formed OLE object' + tstream.write_row(( + rtfp.objects.index(rtfobj), + # filename, + '%08Xh' % 
rtfobj.start, + ole_column, + pkg_column + ), colors=(None, None, None, pkg_color) + ) + tstream.write_sep() + if save_object: + if save_object == 'all': + objects = rtfp.objects + else: + try: + i = int(save_object) + objects = [ rtfp.objects[i] ] + except: + log.error('The -s option must be followed by an object index or all, such as "-s 2" or "-s all"') + return + for rtfobj in objects: + i = objects.index(rtfobj) + if rtfobj.is_package: + print('Saving file from OLE Package in object #%d:' % i) + print(' Filename = %r' % rtfobj.filename) + print(' Source path = %r' % rtfobj.src_path) + print(' Temp path = %r' % rtfobj.temp_path) + if rtfobj.filename: + fname = '%s_%s' % (fname_prefix, + sanitize_filename(rtfobj.filename)) + else: + fname = '%s_object_%08X.noname' % (fname_prefix, rtfobj.start) + print(' saving to file %s' % fname) + open(fname, 'wb').write(rtfobj.olepkgdata) + elif rtfobj.is_ole: + print('Saving file embedded in OLE object #%d:' % i) + print(' format_id = %d' % rtfobj.format_id) + print(' class name = %r' % rtfobj.class_name) + print(' data size = %d' % rtfobj.oledata_size) + # set a file extension according to the class name: + class_name = rtfobj.class_name.lower() + if class_name.startswith(b'word'): + ext = 'doc' + elif class_name.startswith(b'package'): + ext = 'package' + else: + ext = 'bin' + fname = '%s_object_%08X.%s' % (fname_prefix, rtfobj.start, ext) + print(' saving to file %s' % fname) + open(fname, 'wb').write(rtfobj.oledata) + else: + print('Saving raw data in object #%d:' % i) + fname = '%s_object_%08X.raw' % (fname_prefix, rtfobj.start) + print(' saving object to file %s' % fname) + open(fname, 'wb').write(rtfobj.rawdata) + + +#=== MAIN ================================================================= + +def main(): + # print banner with version + print ('rtfobj %s - http://decalage.info/python/oletools' % __version__) + print ('THIS IS WORK IN PROGRESS - Check updates regularly!') + print ('Please report any issue at 
https://github.com/decalage2/oletools/issues') + print ('') + + DEFAULT_LOG_LEVEL = "warning" # Default log level + LOG_LEVELS = { + 'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL + } + + usage = 'usage: %prog [options] [filename2 ...]' + parser = optparse.OptionParser(usage=usage) + # parser.add_option('-o', '--outfile', dest='outfile', + # help='output file') + # parser.add_option('-c', '--csv', dest='csv', + # help='export results to a CSV file') + parser.add_option("-r", action="store_true", dest="recursive", + help='find files recursively in subdirectories.') + parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, + help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)') + parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', + help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)') + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical (default=%default)") + parser.add_option("-s", "--save", dest='save_object', type='str', default=None, + help='Save the object corresponding to the provided number to a file, for example "-s 2". 
Use "-s all" to save all objects at once.') + # parser.add_option("-o", "--outfile", dest='outfile', type='str', default=None, + # help='Filename to be used when saving an object to a file.') + parser.add_option("-d", type="str", dest="output_dir", + help='use specified directory to save output files.', default=None) + # parser.add_option("--pkg", action="store_true", dest="save_pkg", + # help='Save OLE Package binary data of extracted objects (file embedded into an OLE Package).') + # parser.add_option("--ole", action="store_true", dest="save_ole", + # help='Save OLE binary data of extracted objects (object data without the OLE container).') + # parser.add_option("--raw", action="store_true", dest="save_raw", + # help='Save raw binary data of extracted objects (decoded from hex, including the OLE container).') + # parser.add_option("--hex", action="store_true", dest="save_hex", + # help='Save raw hexadecimal data of extracted objects (including the OLE container).') + + + (options, args) = parser.parse_args() + + # Print help if no arguments are passed + if len(args) == 0: + print (__doc__) + parser.print_help() + sys.exit() + + # Setup logging to the console: + # here we use stdout instead of stderr by default, so that the output + # can be redirected properly. 
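The two-level logging scheme used here (a module logger silenced with a NullHandler by get_logger(), then basicConfig() plus setLevel(NOTSET) in main() to opt in) can be demonstrated in isolation. This is a minimal sketch, not rtfobj code; the logger name 'mylib' is illustrative, and logging.NullHandler is used directly since rtfobj only defines its own for Python 2.6 compatibility:

```python
import io
import logging

# Library side: create a module logger and keep it silent by default.
liblog = logging.getLogger('mylib')
liblog.addHandler(logging.NullHandler())
liblog.setLevel(logging.CRITICAL + 1)   # above any level: nothing gets through

liblog.warning('dropped: logging not configured yet')

# Application side, mirroring main(): attach a real handler to the root
# logger, then reset the module logger to NOTSET so it delegates to the
# root logger's effective level (WARNING by default):
stream = io.StringIO()
handler = logging.StreamHandler(stream)
handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
logging.getLogger().addHandler(handler)
liblog.setLevel(logging.NOTSET)

liblog.warning('now visible')
print(stream.getvalue().strip())
```

This is why the code below calls log.setLevel(logging.NOTSET) and oleobj.log.setLevel(logging.NOTSET): without it, the module loggers would stay muted even after basicConfig().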
+ logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout, + format='%(levelname)-8s %(message)s') + # enable logging in the modules: + log.setLevel(logging.NOTSET) + oleobj.log.setLevel(logging.NOTSET) + + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, + zip_password=options.zip_password, zip_fname=options.zip_fname): + # ignore directory names stored in zip files: + if container and filename.endswith('/'): + continue + process_file(container, filename, data, output_dir=options.output_dir, + save_object=options.save_object) + + +if __name__ == '__main__': + main() + +# This code was developed while listening to The Mary Onettes "Lost" + diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/codes.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/codes.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/codes.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/codes.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,229 @@ +"""Handles mapping between color names and ANSI codes and determining auto color codes.""" + +import sys +from collections import Mapping + +BASE_CODES = { + '/all': 0, 'b': 1, 'f': 2, 'i': 3, 'u': 4, 'flash': 5, 'outline': 6, 'negative': 7, 'invis': 8, 'strike': 9, + '/b': 22, '/f': 22, '/i': 23, '/u': 24, '/flash': 25, '/outline': 26, '/negative': 27, '/invis': 28, + '/strike': 29, '/fg': 39, '/bg': 49, + + 'black': 30, 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 'magenta': 35, 'cyan': 36, 'white': 37, + + 'bgblack': 40, 'bgred': 41, 'bggreen': 42, 'bgyellow': 43, 'bgblue': 44, 'bgmagenta': 45, 'bgcyan': 46, + 'bgwhite': 47, + + 'hiblack': 90, 'hired': 91, 'higreen': 92, 'hiyellow': 93, 'hiblue': 94, 'himagenta': 95, 'hicyan': 96, + 'hiwhite': 97, + + 'hibgblack': 100, 'hibgred': 101, 'hibggreen': 102, 'hibgyellow': 103, 'hibgblue': 104, 'hibgmagenta': 105, + 'hibgcyan': 106, 'hibgwhite': 
107, + + 'autored': None, 'autoblack': None, 'automagenta': None, 'autowhite': None, 'autoblue': None, 'autoyellow': None, + 'autogreen': None, 'autocyan': None, + + 'autobgred': None, 'autobgblack': None, 'autobgmagenta': None, 'autobgwhite': None, 'autobgblue': None, + 'autobgyellow': None, 'autobggreen': None, 'autobgcyan': None, + + '/black': 39, '/red': 39, '/green': 39, '/yellow': 39, '/blue': 39, '/magenta': 39, '/cyan': 39, '/white': 39, + '/hiblack': 39, '/hired': 39, '/higreen': 39, '/hiyellow': 39, '/hiblue': 39, '/himagenta': 39, '/hicyan': 39, + '/hiwhite': 39, + + '/bgblack': 49, '/bgred': 49, '/bggreen': 49, '/bgyellow': 49, '/bgblue': 49, '/bgmagenta': 49, '/bgcyan': 49, + '/bgwhite': 49, '/hibgblack': 49, '/hibgred': 49, '/hibggreen': 49, '/hibgyellow': 49, '/hibgblue': 49, + '/hibgmagenta': 49, '/hibgcyan': 49, '/hibgwhite': 49, + + '/autored': 39, '/autoblack': 39, '/automagenta': 39, '/autowhite': 39, '/autoblue': 39, '/autoyellow': 39, + '/autogreen': 39, '/autocyan': 39, + + '/autobgred': 49, '/autobgblack': 49, '/autobgmagenta': 49, '/autobgwhite': 49, '/autobgblue': 49, + '/autobgyellow': 49, '/autobggreen': 49, '/autobgcyan': 49, +} + + +class ANSICodeMapping(Mapping): + """Read-only dictionary, resolves closing tags and automatic colors. Iterates only used color tags. + + :cvar bool DISABLE_COLORS: Disable colors (strip color codes). + :cvar bool LIGHT_BACKGROUND: Use low intensity color codes. + """ + + DISABLE_COLORS = False + LIGHT_BACKGROUND = False + + def __init__(self, value_markup): + """Constructor. + + :param str value_markup: String with {color} tags. + """ + self.whitelist = [k for k in BASE_CODES if '{' + k + '}' in value_markup] + + def __getitem__(self, item): + """Return value for key or None if colors are disabled. + + :param str item: Key. + + :return: Color code integer. 
+ :rtype: int + """ + if item not in self.whitelist: + raise KeyError(item) + if self.DISABLE_COLORS: + return None + return getattr(self, item, BASE_CODES[item]) + + def __iter__(self): + """Iterate dictionary.""" + return iter(self.whitelist) + + def __len__(self): + """Dictionary length.""" + return len(self.whitelist) + + @classmethod + def disable_all_colors(cls): + """Disable all colors. Strips any color tags or codes.""" + cls.DISABLE_COLORS = True + + @classmethod + def enable_all_colors(cls): + """Enable all colors. Color tags are converted into ANSI codes.""" + cls.DISABLE_COLORS = False + + @classmethod + def disable_if_no_tty(cls): + """Disable all colors only if there is no TTY available. + + :return: True if colors are disabled, False if stderr or stdout is a TTY. + :rtype: bool + """ + if sys.stdout.isatty() or sys.stderr.isatty(): + return False + cls.disable_all_colors() + return True + + @classmethod + def set_dark_background(cls): + """Choose light (high-intensity) colors for all 'auto'-prefixed codes for readability on dark backgrounds.""" + cls.LIGHT_BACKGROUND = False + + @classmethod + def set_light_background(cls): + """Choose dark colors for all 'auto'-prefixed codes for readability on light backgrounds.""" + cls.LIGHT_BACKGROUND = True + + @property + def autoblack(self): + """Return automatic black foreground color depending on background color.""" + return BASE_CODES['black' if ANSICodeMapping.LIGHT_BACKGROUND else 'hiblack'] + + @property + def autored(self): + """Return automatic red foreground color depending on background color.""" + return BASE_CODES['red' if ANSICodeMapping.LIGHT_BACKGROUND else 'hired'] + + @property + def autogreen(self): + """Return automatic green foreground color depending on background color.""" + return BASE_CODES['green' if ANSICodeMapping.LIGHT_BACKGROUND else 'higreen'] + + @property + def autoyellow(self): + """Return automatic yellow foreground color depending on background color.""" + return BASE_CODES['yellow' if 
ANSICodeMapping.LIGHT_BACKGROUND else 'hiyellow'] + + @property + def autoblue(self): + """Return automatic blue foreground color depending on background color.""" + return BASE_CODES['blue' if ANSICodeMapping.LIGHT_BACKGROUND else 'hiblue'] + + @property + def automagenta(self): + """Return automatic magenta foreground color depending on background color.""" + return BASE_CODES['magenta' if ANSICodeMapping.LIGHT_BACKGROUND else 'himagenta'] + + @property + def autocyan(self): + """Return automatic cyan foreground color depending on background color.""" + return BASE_CODES['cyan' if ANSICodeMapping.LIGHT_BACKGROUND else 'hicyan'] + + @property + def autowhite(self): + """Return automatic white foreground color depending on background color.""" + return BASE_CODES['white' if ANSICodeMapping.LIGHT_BACKGROUND else 'hiwhite'] + + @property + def autobgblack(self): + """Return automatic black background color depending on background color.""" + return BASE_CODES['bgblack' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgblack'] + + @property + def autobgred(self): + """Return automatic red background color depending on background color.""" + return BASE_CODES['bgred' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgred'] + + @property + def autobggreen(self): + """Return automatic green background color depending on background color.""" + return BASE_CODES['bggreen' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibggreen'] + + @property + def autobgyellow(self): + """Return automatic yellow background color depending on background color.""" + return BASE_CODES['bgyellow' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgyellow'] + + @property + def autobgblue(self): + """Return automatic blue background color depending on background color.""" + return BASE_CODES['bgblue' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgblue'] + + @property + def autobgmagenta(self): + """Return automatic magenta background color depending on background color.""" + return BASE_CODES['bgmagenta' if 
ANSICodeMapping.LIGHT_BACKGROUND else 'hibgmagenta'] + + @property + def autobgcyan(self): + """Return automatic cyan background color depending on background color.""" + return BASE_CODES['bgcyan' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgcyan'] + + @property + def autobgwhite(self): + """Return automatic white background color depending on background color.""" + return BASE_CODES['bgwhite' if ANSICodeMapping.LIGHT_BACKGROUND else 'hibgwhite'] + + +def list_tags(): + """List the available tags. + + :return: List of 4-item tuples: opening tag, closing tag, main ansi value, closing ansi value. + :rtype: list + """ + # Build reverse dictionary. Keys are closing tags, values are [closing ansi, opening tag, opening ansi]. + reverse_dict = dict() + for tag, ansi in sorted(BASE_CODES.items()): + if tag.startswith('/'): + reverse_dict[tag] = [ansi, None, None] + else: + reverse_dict['/' + tag][1:] = [tag, ansi] + + # Collapse into 4-item tuples. + four_item_tuples = [(v[1], k, v[2], v[0]) for k, v in reverse_dict.items()] + + # Sort. + def sorter(four_item): + """Sort /all /fg /bg first, then b i u flash, then auto colors, then dark colors, finally light colors. + + :param iter four_item: [opening tag, closing tag, main ansi value, closing ansi value] + + :return: Sorting weight.
+ :rtype: int + """ + if not four_item[2]: # /all /fg /bg + return four_item[3] - 200 + if four_item[2] < 10 or four_item[0].startswith('auto'): # b f i u or auto colors + return four_item[2] - 100 + return four_item[2] + four_item_tuples.sort(key=sorter) + + return four_item_tuples diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/color.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/color.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/color.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/color.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,220 @@ +"""Color class used by library users.""" + +from colorclass.core import ColorStr + + +class Color(ColorStr): + """Unicode (str in Python3) subclass with ANSI terminal text color support. + + Example syntax: Color('{red}Sample Text{/red}') + + Example without parsing logic: Color('{red}Sample Text{/red}', keep_tags=True) + + For a list of codes, call: colorclass.list_tags() + """ + + @classmethod + def colorize(cls, color, string, auto=False): + """Color-code entire string using specified color. + + :param str color: Color of string. + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + tag = '{0}{1}'.format('auto' if auto else '', color) + return cls('{%s}%s{/%s}' % (tag, string, tag)) + + @classmethod + def black(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('black', string, auto=auto) + + @classmethod + def bgblack(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). 
+ + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('bgblack', string, auto=auto) + + @classmethod + def red(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('red', string, auto=auto) + + @classmethod + def bgred(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('bgred', string, auto=auto) + + @classmethod + def green(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('green', string, auto=auto) + + @classmethod + def bggreen(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('bggreen', string, auto=auto) + + @classmethod + def yellow(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('yellow', string, auto=auto) + + @classmethod + def bgyellow(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. 
+ :rtype: Color + """ + return cls.colorize('bgyellow', string, auto=auto) + + @classmethod + def blue(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('blue', string, auto=auto) + + @classmethod + def bgblue(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('bgblue', string, auto=auto) + + @classmethod + def magenta(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('magenta', string, auto=auto) + + @classmethod + def bgmagenta(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('bgmagenta', string, auto=auto) + + @classmethod + def cyan(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('cyan', string, auto=auto) + + @classmethod + def bgcyan(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. 
+ :rtype: Color + """ + return cls.colorize('bgcyan', string, auto=auto) + + @classmethod + def white(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('white', string, auto=auto) + + @classmethod + def bgwhite(cls, string, auto=False): + """Color-code entire string. + + :param str string: String to colorize. + :param bool auto: Enable auto-color (dark/light terminal). + + :return: Class instance for colorized string. + :rtype: Color + """ + return cls.colorize('bgwhite', string, auto=auto) diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/core.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/core.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/core.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/core.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,342 @@ +"""String subclass that handles ANSI color codes.""" + +from colorclass.codes import ANSICodeMapping +from colorclass.parse import parse_input, RE_SPLIT +from colorclass.search import build_color_index, find_char_color + +PARENT_CLASS = type(u'') + + +def apply_text(incoming, func): + """Call `func` on text portions of incoming color string. + + :param iter incoming: Incoming string/ColorStr/string-like object to iterate. + :param func: Function to call with string portion as first and only parameter. + + :return: Modified string, same class type as incoming string. 
+ """ + split = RE_SPLIT.split(incoming) + for i, item in enumerate(split): + if not item or RE_SPLIT.match(item): + continue + split[i] = func(item) + return incoming.__class__().join(split) + + +class ColorBytes(bytes): + """Str (bytes in Python3) subclass, .decode() overridden to return unicode (str in Python3) subclass instance.""" + + def __new__(cls, *args, **kwargs): + """Save original class so decode() returns an instance of it.""" + original_class = kwargs.pop('original_class') + combined_args = [cls] + list(args) + instance = bytes.__new__(*combined_args, **kwargs) + instance.original_class = original_class + return instance + + def decode(self, encoding='utf-8', errors='strict'): + """Decode using the codec registered for encoding. Default encoding is 'utf-8'. + + errors may be given to set a different error handling scheme. Default is 'strict' meaning that encoding errors + raise a UnicodeDecodeError. Other possible values are 'ignore' and 'replace' as well as any other name + registered with codecs.register_error that is able to handle UnicodeDecodeErrors. + + :param str encoding: Codec. + :param str errors: Error handling scheme. + """ + original_class = getattr(self, 'original_class') + return original_class(super(ColorBytes, self).decode(encoding, errors)) + + +class ColorStr(PARENT_CLASS): + """Core color class.""" + + def __new__(cls, *args, **kwargs): + """Parse color markup and instantiate.""" + keep_tags = kwargs.pop('keep_tags', False) + + # Parse string. + value_markup = args[0] if args else PARENT_CLASS() # e.g. '{red}test{/red}' + value_colors, value_no_colors = parse_input(value_markup, ANSICodeMapping.DISABLE_COLORS, keep_tags) + color_index = build_color_index(value_colors) + + # Instantiate. + color_args = [cls, value_colors] + list(args[1:]) + instance = PARENT_CLASS.__new__(*color_args, **kwargs) + + # Add additional attributes and return. 
+ instance.value_colors = value_colors + instance.value_no_colors = value_no_colors + instance.has_colors = value_colors != value_no_colors + instance.color_index = color_index + return instance + + def __add__(self, other): + """Concatenate.""" + return self.__class__(self.value_colors + other, keep_tags=True) + + def __getitem__(self, item): + """Retrieve character.""" + try: + color_pos = self.color_index[int(item)] + except TypeError: # slice + return super(ColorStr, self).__getitem__(item) + return self.__class__(find_char_color(self.value_colors, color_pos), keep_tags=True) + + def __iter__(self): + """Yield one color-coded character at a time.""" + for color_pos in self.color_index: + yield self.__class__(find_char_color(self.value_colors, color_pos)) + + def __len__(self): + """Length of string without color codes (what users expect).""" + return self.value_no_colors.__len__() + + def __mod__(self, other): + """String substitution (like printf).""" + return self.__class__(self.value_colors % other, keep_tags=True) + + def __mul__(self, other): + """Multiply string.""" + return self.__class__(self.value_colors * other, keep_tags=True) + + def __repr__(self): + """Representation of a class instance (like datetime.datetime.now()).""" + return '{name}({value})'.format(name=self.__class__.__name__, value=repr(self.value_colors)) + + def capitalize(self): + """Return a copy of the string with only its first character capitalized.""" + return apply_text(self, lambda s: s.capitalize()) + + def center(self, width, fillchar=None): + """Return centered in a string of length width. Padding is done using the specified fill character or space. + + :param int width: Length of output string. + :param str fillchar: Use this character instead of spaces. 
+ """ + if fillchar is not None: + result = self.value_no_colors.center(width, fillchar) + else: + result = self.value_no_colors.center(width) + return self.__class__(result.replace(self.value_no_colors, self.value_colors), keep_tags=True) + + def count(self, sub, start=0, end=-1): + """Return the number of non-overlapping occurrences of substring sub in string[start:end]. + + Optional arguments start and end are interpreted as in slice notation. + + :param str sub: Substring to search. + :param int start: Beginning position. + :param int end: Stop comparison at this position. + """ + return self.value_no_colors.count(sub, start, end) + + def endswith(self, suffix, start=0, end=None): + """Return True if ends with the specified suffix, False otherwise. + + With optional start, test beginning at that position. With optional end, stop comparing at that position. + suffix can also be a tuple of strings to try. + + :param str suffix: Suffix to search. + :param int start: Beginning position. + :param int end: Stop comparison at this position. + """ + args = [suffix, start] + ([] if end is None else [end]) + return self.value_no_colors.endswith(*args) + + def encode(self, encoding=None, errors='strict'): + """Encode using the codec registered for encoding. encoding defaults to the default encoding. + + errors may be given to set a different error handling scheme. Default is 'strict' meaning that encoding errors + raise a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and 'xmlcharrefreplace' as well as any + other name registered with codecs.register_error that is able to handle UnicodeEncodeErrors. + + :param str encoding: Codec. + :param str errors: Error handling scheme. + """ + return ColorBytes(super(ColorStr, self).encode(encoding, errors), original_class=self.__class__) + + def decode(self, encoding=None, errors='strict'): + """Decode using the codec registered for encoding. encoding defaults to the default encoding. 
+ + errors may be given to set a different error handling scheme. Default is 'strict' meaning that encoding errors + raise a UnicodeDecodeError. Other possible values are 'ignore' and 'replace' as well as any other name + registered with codecs.register_error that is able to handle UnicodeDecodeErrors. + + :param str encoding: Codec. + :param str errors: Error handling scheme. + """ + return self.__class__(super(ColorStr, self).decode(encoding, errors), keep_tags=True) + + def find(self, sub, start=None, end=None): + """Return the lowest index where substring sub is found, such that sub is contained within string[start:end]. + + Optional arguments start and end are interpreted as in slice notation. + + :param str sub: Substring to search. + :param int start: Beginning position. + :param int end: Stop comparison at this position. + """ + return self.value_no_colors.find(sub, start, end) + + def format(self, *args, **kwargs): + """Return a formatted version, using substitutions from args and kwargs. + + The substitutions are identified by braces ('{' and '}'). + """ + return self.__class__(super(ColorStr, self).format(*args, **kwargs), keep_tags=True) + + def index(self, sub, start=None, end=None): + """Like S.find() but raise ValueError when the substring is not found. + + :param str sub: Substring to search. + :param int start: Beginning position. + :param int end: Stop comparison at this position. 
+ """ + return self.value_no_colors.index(sub, start, end) + + def isalnum(self): + """Return True if all characters in string are alphanumeric and there is at least one character in it.""" + return self.value_no_colors.isalnum() + + def isalpha(self): + """Return True if all characters in string are alphabetic and there is at least one character in it.""" + return self.value_no_colors.isalpha() + + def isdecimal(self): + """Return True if there are only decimal characters in string, False otherwise.""" + return self.value_no_colors.isdecimal() + + def isdigit(self): + """Return True if all characters in string are digits and there is at least one character in it.""" + return self.value_no_colors.isdigit() + + def isnumeric(self): + """Return True if there are only numeric characters in string, False otherwise.""" + return self.value_no_colors.isnumeric() + + def isspace(self): + """Return True if all characters in string are whitespace and there is at least one character in it.""" + return self.value_no_colors.isspace() + + def istitle(self): + """Return True if string is a titlecased string and there is at least one character in it. + + That is uppercase characters may only follow uncased characters and lowercase characters only cased ones. Return + False otherwise. + """ + return self.value_no_colors.istitle() + + def isupper(self): + """Return True if all cased characters are uppercase and there is at least one cased character in it.""" + return self.value_no_colors.isupper() + + def join(self, iterable): + """Return a string which is the concatenation of the strings in the iterable. + + :param iterable: Join items in this iterable. + """ + return self.__class__(super(ColorStr, self).join(iterable), keep_tags=True) + + def ljust(self, width, fillchar=None): + """Return left-justified string of length width. Padding is done using the specified fill character or space. + + :param int width: Length of output string. 
+ :param str fillchar: Use this character instead of spaces. + """ + if fillchar is not None: + result = self.value_no_colors.ljust(width, fillchar) + else: + result = self.value_no_colors.ljust(width) + return self.__class__(result.replace(self.value_no_colors, self.value_colors), keep_tags=True) + + def rfind(self, sub, start=None, end=None): + """Return the highest index where substring sub is found, such that sub is contained within string[start:end]. + + Optional arguments start and end are interpreted as in slice notation. + + :param str sub: Substring to search. + :param int start: Beginning position. + :param int end: Stop comparison at this position. + """ + return self.value_no_colors.rfind(sub, start, end) + + def rindex(self, sub, start=None, end=None): + """Like .rfind() but raise ValueError when the substring is not found. + + :param str sub: Substring to search. + :param int start: Beginning position. + :param int end: Stop comparison at this position. + """ + return self.value_no_colors.rindex(sub, start, end) + + def rjust(self, width, fillchar=None): + """Return right-justified string of length width. Padding is done using the specified fill character or space. + + :param int width: Length of output string. + :param str fillchar: Use this character instead of spaces. + """ + if fillchar is not None: + result = self.value_no_colors.rjust(width, fillchar) + else: + result = self.value_no_colors.rjust(width) + return self.__class__(result.replace(self.value_no_colors, self.value_colors), keep_tags=True) + + def splitlines(self, keepends=False): + """Return a list of the lines in the string, breaking at line boundaries. + + Line breaks are not included in the resulting list unless keepends is given and True. + + :param bool keepends: Include linebreaks. 
+ """ + return [self.__class__(l) for l in self.value_colors.splitlines(keepends)] + + def startswith(self, prefix, start=0, end=-1): + """Return True if string starts with the specified prefix, False otherwise. + + With optional start, test beginning at that position. With optional end, stop comparing at that position. prefix + can also be a tuple of strings to try. + + :param str prefix: Prefix to search. + :param int start: Beginning position. + :param int end: Stop comparison at this position. + """ + return self.value_no_colors.startswith(prefix, start, end) + + def swapcase(self): + """Return a copy of the string with uppercase characters converted to lowercase and vice versa.""" + return apply_text(self, lambda s: s.swapcase()) + + def title(self): + """Return a titlecased version of the string. + + That is words start with uppercase characters, all remaining cased characters have lowercase. + """ + return apply_text(self, lambda s: s.title()) + + def translate(self, table): + """Return a copy of the string, where all characters have been mapped through the given translation table. + + Table must be a mapping of Unicode ordinals to Unicode ordinals, strings, or None. Unmapped characters are left + untouched. Characters mapped to None are deleted. + + :param table: Translation table. + """ + return apply_text(self, lambda s: s.translate(table)) + + def upper(self): + """Return a copy of the string converted to uppercase.""" + return apply_text(self, lambda s: s.upper()) + + def zfill(self, width): + """Pad a numeric string with zeros on the left, to fill a field of the specified width. + + The string is never truncated. + + :param int width: Length of output string. 
+ """ + if not self.value_no_colors: + result = self.value_no_colors.zfill(width) + else: + result = self.value_colors.replace(self.value_no_colors, self.value_no_colors.zfill(width)) + return self.__class__(result, keep_tags=True) diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__init__.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__init__.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__init__.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,38 @@ +"""Colorful worry-free console applications for Linux, Mac OS X, and Windows. + +Supported natively on Linux and Mac OSX (Just Works), and on Windows it works the same if Windows.enable() is called. + +Gives you expected and sane results from methods like len() and .capitalize(). + +https://github.com/Robpol86/colorclass +https://pypi.python.org/pypi/colorclass +""" + +from colorclass.codes import list_tags # noqa +from colorclass.color import Color # noqa +from colorclass.toggles import disable_all_colors # noqa +from colorclass.toggles import disable_if_no_tty # noqa +from colorclass.toggles import enable_all_colors # noqa +from colorclass.toggles import is_enabled # noqa +from colorclass.toggles import is_light # noqa +from colorclass.toggles import set_dark_background # noqa +from colorclass.toggles import set_light_background # noqa +from colorclass.windows import Windows # noqa + + +__all__ = ( + 'Color', + 'disable_all_colors', + 'enable_all_colors', + 'is_enabled', + 'is_light', + 'list_tags', + 'set_dark_background', + 'set_light_background', + 'Windows', +) + + +__author__ = '@Robpol86' +__license__ = 'MIT' +__version__ = '2.2.0' diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/LICENSE.txt remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/LICENSE.txt --- 
remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Robpol86 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__main__.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__main__.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__main__.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/__main__.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,33 @@ +"""Called by "python -m". Allows package to be used as a script. 
+ +Example usage: +echo "{red}Red{/red}" |python -m colorclass +""" + +from __future__ import print_function + +import fileinput +import os + +from colorclass.color import Color +from colorclass.toggles import disable_all_colors +from colorclass.toggles import enable_all_colors +from colorclass.toggles import set_dark_background +from colorclass.toggles import set_light_background +from colorclass.windows import Windows + +TRUTHY = ('true', '1', 'yes', 'on') + + +if __name__ == '__main__': + if os.environ.get('COLOR_ENABLE', '').lower() in TRUTHY: + enable_all_colors() + elif os.environ.get('COLOR_DISABLE', '').lower() in TRUTHY: + disable_all_colors() + if os.environ.get('COLOR_LIGHT', '').lower() in TRUTHY: + set_light_background() + elif os.environ.get('COLOR_DARK', '').lower() in TRUTHY: + set_dark_background() + Windows.enable() + for LINE in fileinput.input(): + print(Color(LINE)) diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/parse.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/parse.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/parse.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/parse.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,96 @@ +"""Parse color markup tags into ANSI escape sequences.""" + +import re + +from colorclass.codes import ANSICodeMapping, BASE_CODES + +CODE_GROUPS = ( + tuple(set(str(i) for i in BASE_CODES.values() if i and (40 <= i <= 49 or 100 <= i <= 109))), # bg colors + tuple(set(str(i) for i in BASE_CODES.values() if i and (30 <= i <= 39 or 90 <= i <= 99))), # fg colors + ('1', '22'), ('2', '22'), ('3', '23'), ('4', '24'), ('5', '25'), ('6', '26'), ('7', '27'), ('8', '28'), ('9', '29'), +) +RE_ANSI = re.compile(r'(\033\[([\d;]+)m)') +RE_COMBINE = re.compile(r'\033\[([\d;]+)m\033\[([\d;]+)m') +RE_SPLIT = re.compile(r'(\033\[[\d;]+m)') + + +def prune_overridden(ansi_string): + """Remove color codes that are rendered 
ineffective by subsequent codes in one escape sequence then sort codes. + + :param str ansi_string: Incoming ansi_string with ANSI color codes. + + :return: Color string with pruned color sequences. + :rtype: str + """ + multi_seqs = set(p for p in RE_ANSI.findall(ansi_string) if ';' in p[1]) # Sequences with multiple color codes. + + for escape, codes in multi_seqs: + r_codes = list(reversed(codes.split(';'))) + + # Nuke everything before {/all}. + try: + r_codes = r_codes[:r_codes.index('0') + 1] + except ValueError: + pass + + # Thin out groups. + for group in CODE_GROUPS: + for pos in reversed([i for i, n in enumerate(r_codes) if n in group][1:]): + r_codes.pop(pos) + + # Done. + reduced_codes = ';'.join(sorted(r_codes, key=int)) + if codes != reduced_codes: + ansi_string = ansi_string.replace(escape, '\033[' + reduced_codes + 'm') + + return ansi_string + + +def parse_input(tagged_string, disable_colors, keep_tags): + """Perform the actual conversion of tags to ANSI escaped codes. + + Provides a version of the input without any colors for len() and other methods. + + :param str tagged_string: The input unicode value. + :param bool disable_colors: Strip all colors in both outputs. + :param bool keep_tags: Skip parsing curly bracket tags into ANSI escape sequences. + + :return: 2-item tuple. First item is the parsed output. Second item is a version of the input without any colors. + :rtype: tuple + """ + codes = ANSICodeMapping(tagged_string) + output_colors = getattr(tagged_string, 'value_colors', tagged_string) + + # Convert: '{b}{red}' -> '\033[1m\033[31m' + if not keep_tags: + for tag, replacement in (('{' + k + '}', '' if v is None else '\033[%dm' % v) for k, v in codes.items()): + output_colors = output_colors.replace(tag, replacement) + + # Strip colors. 
+ output_no_colors = RE_ANSI.sub('', output_colors) + if disable_colors: + return output_no_colors, output_no_colors + + # Combine: '\033[1m\033[31m' -> '\033[1;31m' + while True: + simplified = RE_COMBINE.sub(r'\033[\1;\2m', output_colors) + if simplified == output_colors: + break + output_colors = simplified + + # Prune: '\033[31;32;33;34;35m' -> '\033[35m' + output_colors = prune_overridden(output_colors) + + # Deduplicate: '\033[1;mT\033[1;mE\033[1;mS\033[1;mT' -> '\033[1;mTEST' + previous_escape = None + segments = list() + for item in (i for i in RE_SPLIT.split(output_colors) if i): + if RE_SPLIT.match(item): + if item != previous_escape: + segments.append(item) + previous_escape = item + else: + segments.append(item) + output_colors = ''.join(segments) + + return output_colors, output_no_colors diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/search.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/search.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/search.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/search.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,49 @@ +"""Determine color of characters that may or may not be adjacent to ANSI escape sequences.""" + +from colorclass.parse import RE_SPLIT + + +def build_color_index(ansi_string): + """Build an index between visible characters and a string with invisible color codes. + + :param str ansi_string: String with color codes (ANSI escape sequences). + + :return: Position of visible characters in color string (indexes match non-color string). 
+ :rtype: tuple + """ + mapping = list() + color_offset = 0 + for item in (i for i in RE_SPLIT.split(ansi_string) if i): + if RE_SPLIT.match(item): + color_offset += len(item) + else: + for _ in range(len(item)): + mapping.append(color_offset) + color_offset += 1 + return tuple(mapping) + + +def find_char_color(ansi_string, pos): + """Determine what color a character is in the string. + + :param str ansi_string: String with color codes (ANSI escape sequences). + :param int pos: Position of the character in the ansi_string. + + :return: Character along with all surrounding color codes. + :rtype: str + """ + result = list() + position = 0 # Set to None when character is found. + for item in (i for i in RE_SPLIT.split(ansi_string) if i): + if RE_SPLIT.match(item): + result.append(item) + if position is not None: + position += len(item) + elif position is not None: + for char in item: + if position == pos: + result.append(char) + position = None + break + position += 1 + return ''.join(result) diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/toggles.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/toggles.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/toggles.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/toggles.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,42 @@ +"""Convenience functions to enable/disable features.""" + +from colorclass.codes import ANSICodeMapping + + +def disable_all_colors(): + """Disable all colors. Strip any color tags or codes.""" + ANSICodeMapping.disable_all_colors() + + +def enable_all_colors(): + """Enable colors.""" + ANSICodeMapping.enable_all_colors() + + +def disable_if_no_tty(): + """Disable all colors if there is no TTY available. + + :return: True if colors are disabled, False if stderr or stdout is a TTY. 
+ :rtype: bool + """ + return ANSICodeMapping.disable_if_no_tty() + + + def is_enabled(): + """Are colors enabled.""" + return not ANSICodeMapping.DISABLE_COLORS + + + def set_light_background(): + """Choose dark colors for all 'auto'-prefixed codes for readability on light backgrounds.""" + ANSICodeMapping.set_light_background() + + + def set_dark_background(): + """Choose light colors for all 'auto'-prefixed codes for readability on dark backgrounds.""" + ANSICodeMapping.set_dark_background() + + + def is_light(): + """Are background colors for light backgrounds.""" + return ANSICodeMapping.LIGHT_BACKGROUND diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/windows.py remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/windows.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/windows.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/colorclass/windows.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,388 @@ +"""Windows console screen buffer handlers.""" + +from __future__ import print_function + +import atexit +import ctypes +import re +import sys + +from colorclass.codes import ANSICodeMapping, BASE_CODES +from colorclass.core import RE_SPLIT + +ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004 +INVALID_HANDLE_VALUE = -1 +IS_WINDOWS = sys.platform == 'win32' +RE_NUMBER_SEARCH = re.compile(r'\033\[([\d;]+)m') +STD_ERROR_HANDLE = -12 +STD_OUTPUT_HANDLE = -11 +WINDOWS_CODES = { + '/all': -33, '/fg': -39, '/bg': -49, + + 'black': 0, 'red': 4, 'green': 2, 'yellow': 6, 'blue': 1, 'magenta': 5, 'cyan': 3, 'white': 7, + + 'bgblack': -8, 'bgred': 64, 'bggreen': 32, 'bgyellow': 96, 'bgblue': 16, 'bgmagenta': 80, 'bgcyan': 48, + 'bgwhite': 112, + + 'hiblack': 8, 'hired': 12, 'higreen': 10, 'hiyellow': 14, 'hiblue': 9, 'himagenta': 13, 'hicyan': 11, 'hiwhite': 15, + + 'hibgblack': 128, 'hibgred': 192, 'hibggreen': 160, 'hibgyellow': 224, 'hibgblue': 144, 'hibgmagenta': 208, + 'hibgcyan': 176,
'hibgwhite': 240, + + '/black': -39, '/red': -39, '/green': -39, '/yellow': -39, '/blue': -39, '/magenta': -39, '/cyan': -39, + '/white': -39, '/hiblack': -39, '/hired': -39, '/higreen': -39, '/hiyellow': -39, '/hiblue': -39, '/himagenta': -39, + '/hicyan': -39, '/hiwhite': -39, + + '/bgblack': -49, '/bgred': -49, '/bggreen': -49, '/bgyellow': -49, '/bgblue': -49, '/bgmagenta': -49, + '/bgcyan': -49, '/bgwhite': -49, '/hibgblack': -49, '/hibgred': -49, '/hibggreen': -49, '/hibgyellow': -49, + '/hibgblue': -49, '/hibgmagenta': -49, '/hibgcyan': -49, '/hibgwhite': -49, +} + + +class COORD(ctypes.Structure): + """COORD structure. http://msdn.microsoft.com/en-us/library/windows/desktop/ms682119.""" + + _fields_ = [ + ('X', ctypes.c_short), + ('Y', ctypes.c_short), + ] + + +class SmallRECT(ctypes.Structure): + """SMALL_RECT structure. http://msdn.microsoft.com/en-us/library/windows/desktop/ms686311.""" + + _fields_ = [ + ('Left', ctypes.c_short), + ('Top', ctypes.c_short), + ('Right', ctypes.c_short), + ('Bottom', ctypes.c_short), + ] + + +class ConsoleScreenBufferInfo(ctypes.Structure): + """CONSOLE_SCREEN_BUFFER_INFO structure. http://msdn.microsoft.com/en-us/library/windows/desktop/ms682093.""" + + _fields_ = [ + ('dwSize', COORD), + ('dwCursorPosition', COORD), + ('wAttributes', ctypes.c_ushort), + ('srWindow', SmallRECT), + ('dwMaximumWindowSize', COORD) + ] + + +def init_kernel32(kernel32=None): + """Load a unique instance of WinDLL into memory, set arg/return types, and get stdout/err handles. + + 1. Since we are setting DLL function argument types and return types, we need to maintain our own instance of + kernel32 to prevent overriding (or being overwritten by) user's own changes to ctypes.windll.kernel32. + 2. While we're doing all this we might as well get the handles to STDOUT and STDERR streams. + 3. If either stream has already been replaced set return value to INVALID_HANDLE_VALUE to indicate it shouldn't be + replaced. 
+ + :raise AttributeError: When called on a non-Windows platform. + + :param kernel32: Optional mock kernel32 object. For testing. + + :return: Loaded kernel32 instance, stderr handle (int), stdout handle (int). + :rtype: tuple + """ + if not kernel32: + kernel32 = ctypes.LibraryLoader(ctypes.WinDLL).kernel32 # Load our own instance. Unique memory address. + kernel32.GetStdHandle.argtypes = [ctypes.c_ulong] + kernel32.GetStdHandle.restype = ctypes.c_void_p + kernel32.GetConsoleScreenBufferInfo.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ConsoleScreenBufferInfo), + ] + kernel32.GetConsoleScreenBufferInfo.restype = ctypes.c_long + + # Get handles. + if hasattr(sys.stderr, '_original_stream'): + stderr = INVALID_HANDLE_VALUE + else: + stderr = kernel32.GetStdHandle(STD_ERROR_HANDLE) + if hasattr(sys.stdout, '_original_stream'): + stdout = INVALID_HANDLE_VALUE + else: + stdout = kernel32.GetStdHandle(STD_OUTPUT_HANDLE) + + return kernel32, stderr, stdout + + +def get_console_info(kernel32, handle): + """Get information about this current console window. + + http://msdn.microsoft.com/en-us/library/windows/desktop/ms683231 + https://code.google.com/p/colorama/issues/detail?id=47 + https://bitbucket.org/pytest-dev/py/src/4617fe46/py/_io/terminalwriter.py + + Windows 10 Insider since around February 2016 finally introduced support for ANSI colors. No need to replace stdout + and stderr streams to intercept colors and issue multiple SetConsoleTextAttribute() calls for these consoles. + + :raise OSError: When GetConsoleScreenBufferInfo or GetConsoleMode API calls fail. + + :param ctypes.windll.kernel32 kernel32: Loaded kernel32 instance. + :param int handle: stderr or stdout handle. + + :return: Foreground and background colors (integers) as well as native ANSI support (bool). + :rtype: tuple + """ + # Query Win32 API. + csbi = ConsoleScreenBufferInfo() # Populated by GetConsoleScreenBufferInfo. 
+ lpcsbi = ctypes.byref(csbi) + dword = ctypes.c_ulong() # Populated by GetConsoleMode. + lpdword = ctypes.byref(dword) + if not kernel32.GetConsoleScreenBufferInfo(handle, lpcsbi) or not kernel32.GetConsoleMode(handle, lpdword): + raise ctypes.WinError() + + # Parse data. + # buffer_width = int(csbi.dwSize.X - 1) + # buffer_height = int(csbi.dwSize.Y) + # terminal_width = int(csbi.srWindow.Right - csbi.srWindow.Left) + # terminal_height = int(csbi.srWindow.Bottom - csbi.srWindow.Top) + fg_color = csbi.wAttributes % 16 + bg_color = csbi.wAttributes & 240 + native_ansi = bool(dword.value & ENABLE_VIRTUAL_TERMINAL_PROCESSING) + + return fg_color, bg_color, native_ansi + + +def bg_color_native_ansi(kernel32, stderr, stdout): + """Get background color and if console supports ANSI colors natively for both streams. + + :param ctypes.windll.kernel32 kernel32: Loaded kernel32 instance. + :param int stderr: stderr handle. + :param int stdout: stdout handle. + + :return: Background color (int) and native ANSI support (bool). + :rtype: tuple + """ + try: + if stderr == INVALID_HANDLE_VALUE: + raise OSError + bg_color, native_ansi = get_console_info(kernel32, stderr)[1:] + except OSError: + try: + if stdout == INVALID_HANDLE_VALUE: + raise OSError + bg_color, native_ansi = get_console_info(kernel32, stdout)[1:] + except OSError: + bg_color, native_ansi = WINDOWS_CODES['black'], False + return bg_color, native_ansi + + +class WindowsStream(object): + """Replacement stream which overrides sys.stdout or sys.stderr. When writing or printing, ANSI codes are converted. + + ANSI (Linux/Unix) color codes are converted into win32 system calls, changing the next character's color before + printing it. 
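For intuition about the attribute words that `get_console_info` above decodes: a Windows console attribute packs the foreground color in the low four bits and the background in the next four, which is why `wAttributes` is split with `% 16` and `& 240`. A small platform-independent sketch of that split:

```python
def split_attributes(w_attributes):
    """Split a Windows console attribute word into (foreground, background) parts."""
    return w_attributes % 16, w_attributes & 240


# 'red' foreground (4) merged with a 'bgwhite' background (112),
# sample values taken from the WINDOWS_CODES table in this diff.
merged = 4 | 112
print(split_attributes(merged))  # (4, 112)
```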
Resources referenced: + https://github.com/tartley/colorama + http://www.cplusplus.com/articles/2ywTURfi/ + http://thomasfischer.biz/python-and-windows-terminal-colors/ + http://stackoverflow.com/questions/17125440/c-win32-console-color + http://www.tysos.org/svn/trunk/mono/corlib/System/WindowsConsoleDriver.cs + http://stackoverflow.com/questions/287871/print-in-terminal-with-colors-using-python + http://msdn.microsoft.com/en-us/library/windows/desktop/ms682088#_win32_character_attributes + + :cvar list ALL_BG_CODES: List of bg Windows codes. Used to determine if requested color is foreground or background. + :cvar dict COMPILED_CODES: Translation dict. Keys are ANSI codes (values of BASE_CODES), values are Windows codes. + :ivar int default_fg: Foreground Windows color code at the time of instantiation. + :ivar int default_bg: Background Windows color code at the time of instantiation. + """ + + ALL_BG_CODES = [v for k, v in WINDOWS_CODES.items() if k.startswith('bg') or k.startswith('hibg')] + COMPILED_CODES = dict((v, WINDOWS_CODES[k]) for k, v in BASE_CODES.items() if k in WINDOWS_CODES) + + def __init__(self, kernel32, stream_handle, original_stream): + """Constructor. + + :param ctypes.windll.kernel32 kernel32: Loaded kernel32 instance. + :param int stream_handle: stderr or stdout handle. + :param original_stream: sys.stderr or sys.stdout before being overridden by this class' instance. + """ + self._kernel32 = kernel32 + self._stream_handle = stream_handle + self._original_stream = original_stream + self.default_fg, self.default_bg = self.colors + + def __getattr__(self, item): + """If an attribute/function/etc is not defined in this function, retrieve the one from the original stream. + + Fixes ipython arrow key presses. 
+ """ + return getattr(self._original_stream, item) + + @property + def colors(self): + """Return the current foreground and background colors.""" + try: + return get_console_info(self._kernel32, self._stream_handle)[:2] + except OSError: + return WINDOWS_CODES['white'], WINDOWS_CODES['black'] + + @colors.setter + def colors(self, color_code): + """Change the foreground and background colors for subsequently printed characters. + + None resets colors to their original values (when class was instantiated). + + Since setting a color requires including both foreground and background codes (merged), setting just the + foreground color resets the background color to black, and vice versa. + + This function first gets the current background and foreground colors, merges in the requested color code, and + sets the result. + + However if we need to remove just the foreground color but leave the background color the same (or vice versa) + such as when {/red} is used, we must merge the default foreground color with the current background color. This + is the reason for those negative values. + + :param int color_code: Color code from WINDOWS_CODES. + """ + if color_code is None: + color_code = WINDOWS_CODES['/all'] + + # Get current color code. + current_fg, current_bg = self.colors + + # Handle special negative codes. Also determine the final color code. + if color_code == WINDOWS_CODES['/fg']: + final_color_code = self.default_fg | current_bg # Reset the foreground only. + elif color_code == WINDOWS_CODES['/bg']: + final_color_code = current_fg | self.default_bg # Reset the background only. + elif color_code == WINDOWS_CODES['/all']: + final_color_code = self.default_fg | self.default_bg # Reset both. + elif color_code == WINDOWS_CODES['bgblack']: + final_color_code = current_fg # Black background. + else: + new_is_bg = color_code in self.ALL_BG_CODES + final_color_code = color_code | (current_fg if new_is_bg else current_bg) + + # Set new code. 
+ self._kernel32.SetConsoleTextAttribute(self._stream_handle, final_color_code) + + def write(self, p_str): + """Write to stream. + + :param str p_str: string to print. + """ + for segment in RE_SPLIT.split(p_str): + if not segment: + # Empty string. p_str probably starts with colors so the first item is always ''. + continue + if not RE_SPLIT.match(segment): + # No color codes, print regular text. + print(segment, file=self._original_stream, end='') + self._original_stream.flush() + continue + for color_code in (int(c) for c in RE_NUMBER_SEARCH.findall(segment)[0].split(';')): + if color_code in self.COMPILED_CODES: + self.colors = self.COMPILED_CODES[color_code] + + +class Windows(object): + """Enable and disable Windows support for ANSI color character codes. + + Call static method Windows.enable() to enable color support for the remainder of the process' lifetime. + + This class is also a context manager. You can do this: + with Windows(): + print(Color('{autored}Test{/autored}')) + + Or this: + with Windows(auto_colors=True): + print(Color('{autored}Test{/autored}')) + """ + + @classmethod + def disable(cls): + """Restore sys.stderr and sys.stdout to their original objects. Resets colors to their original values. + + :return: If streams restored successfully. + :rtype: bool + """ + # Skip if not on Windows. + if not IS_WINDOWS: + return False + + # Restore default colors. + if hasattr(sys.stderr, '_original_stream'): + getattr(sys, 'stderr').color = None + if hasattr(sys.stdout, '_original_stream'): + getattr(sys, 'stdout').color = None + + # Restore original streams. 
+ changed = False + if hasattr(sys.stderr, '_original_stream'): + changed = True + sys.stderr = getattr(sys.stderr, '_original_stream') + if hasattr(sys.stdout, '_original_stream'): + changed = True + sys.stdout = getattr(sys.stdout, '_original_stream') + + return changed + + @staticmethod + def is_enabled(): + """Return True if either stderr or stdout has colors enabled.""" + return hasattr(sys.stderr, '_original_stream') or hasattr(sys.stdout, '_original_stream') + + @classmethod + def enable(cls, auto_colors=False, reset_atexit=False): + """Enable color text with print() or sys.stdout.write() (stderr too). + + :param bool auto_colors: Automatically selects dark or light colors based on current terminal's background + color. Only works with {autored} and related tags. + :param bool reset_atexit: Resets original colors upon Python exit (in case you forget to reset it yourself with + a closing tag). Does nothing on native ANSI consoles. + + :return: If streams replaced successfully. + :rtype: bool + """ + if not IS_WINDOWS: + return False # Windows only. + + # Get values from init_kernel32(). + kernel32, stderr, stdout = init_kernel32() + if stderr == INVALID_HANDLE_VALUE and stdout == INVALID_HANDLE_VALUE: + return False # No valid handles, nothing to do. + + # Get console info. + bg_color, native_ansi = bg_color_native_ansi(kernel32, stderr, stdout) + + # Set auto colors: + if auto_colors: + if bg_color in (112, 96, 240, 176, 224, 208, 160): + ANSICodeMapping.set_light_background() + else: + ANSICodeMapping.set_dark_background() + + # Don't replace streams if ANSI codes are natively supported. + if native_ansi: + return False + + # Reset on exit if requested. + if reset_atexit: + atexit.register(cls.disable) + + # Overwrite stream references. 
+ if stderr != INVALID_HANDLE_VALUE: + sys.stderr.flush() + sys.stderr = WindowsStream(kernel32, stderr, sys.stderr) + if stdout != INVALID_HANDLE_VALUE: + sys.stdout.flush() + sys.stdout = WindowsStream(kernel32, stdout, sys.stdout) + + return True + + def __init__(self, auto_colors=False): + """Constructor.""" + self.auto_colors = auto_colors + + def __enter__(self): + """Context manager, enables colors on Windows.""" + self.enable(auto_colors=self.auto_colors) + + def __exit__(self, *_): + """Context manager, disables colors on Windows.""" + self.disable() diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/DridexUrlDecoder.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,42 @@ +# Written by @JamesHabben +# https://github.com/JamesHabben/MalwareStuff + +# 2015-01-27 Slight modifications from Philippe Lagadec (PL) to use it from olevba + +import sys + +def DridexUrlDecode (inputText) : + work = inputText[4:-4] + strKeyEnc = StripCharsWithZero(work[(len(work) / 2) - 2: (len(work) / 2)]) + strKeySize = StripCharsWithZero(work[(len(work) / 2): (len(work) / 2) + 2]) + nCharSize = strKeySize - strKeyEnc + work = work[:(len(work) / 2) - 2] + work[(len(work) / 2) + 2:] + strKeyEnc2 = StripChars(work[(len(work) / 2) - (nCharSize/2): (len(work) / 2) + (nCharSize/2)]) + work = work[:(len(work) / 2) - (nCharSize/2)] + work[(len(work) / 2) + (nCharSize/2):] + work_split = [work[i:i+nCharSize] for i in range(0, len(work), nCharSize)] + decoded = '' + for group in work_split: + # sys.stdout.write(chr(StripChars(group)/strKeyEnc2)) + decoded += chr(StripChars(group)/strKeyEnc2) + return decoded + +def StripChars (input) : + result = '' + for c in input : + 
if c.isdigit() : + result += c + return int(result) + +def StripCharsWithZero (input) : + result = '' + for c in input : + if c.isdigit() : + result += c + else: + result += '0' + return int(result) + + +# DridexUrlDecode("C3iY1epSRGe6q8g15xStVesdG717MAlg2H4hmV1vkL6Glnf0cknj") +# DridexUrlDecode("HLIY3Nf3z2k8jD37h1n2OM3N712DGQ3c5M841RZ8C5e6P1C50C4ym1oF504WyV182p4mJ16cK9Z61l47h2dU1rVB5V681sFY728i16H3E2Qm1fn47y2cgAo156j8T1s600hukKO1568X1xE4Z7d2q17jvcwgk816Yz32o9Q216Mpr0B01vcwg856a17b9j2zAmWf1536B1t7d92rI1FZ5E36Pu1jl504Z34tm2R43i55Lg2F3eLE3T28lLX1D504348Goe8Gbdp37w443ADy36X0h14g7Wb2G3u584kEG332Ut8ws3wO584pzSTf") +# DridexUrlDecode("YNPH1W47E211z3P6142cM4115K2J1696CURf1712N1OCJwc0w6Z16840Z1r600W16Z3273k6SR16Bf161Q92a016Vr16V1pc") diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/LICENSE.txt remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/LICENSE.txt --- remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/DridexUrlDecoder/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,3 @@ +DridexUrlDecoder.py is published by James Habben (@JamesHabben) +on https://github.com/JamesHabben/MalwareStuff +without explicit license. \ No newline at end of file diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/easygui.py remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/easygui.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/easygui.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/easygui.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,2492 @@ +""" +@version: 0.96(2010-08-29) + +@note: +ABOUT EASYGUI + +EasyGui provides an easy-to-use interface for simple GUI interaction +with a user. It does not require the programmer to know anything about +tkinter, frames, widgets, callbacks or lambda. 
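A Python 3 sketch of the two digit-stripping helpers in DridexUrlDecoder.py above (the vendored file itself is Python 2 code, as its integer `/` division elsewhere in the decoder shows; the snake_case names here are illustrative):

```python
def strip_chars(text):
    """Keep only the digits: 'a1b2' -> 12."""
    return int(''.join(c for c in text if c.isdigit()))


def strip_chars_with_zero(text):
    """Replace each non-digit with '0': 'a1b2' -> '0102' -> 102."""
    return int(''.join(c if c.isdigit() else '0' for c in text))


print(strip_chars('a1b2'))            # 12
print(strip_chars_with_zero('a1b2'))  # 102
```

The zero-padding variant matters to the decoder because it preserves positional weight: the key fields are fixed-width slices, so a letter in the middle must count as a zero digit rather than vanish.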
All GUI interactions are +invoked by simple function calls that return results. + +@note: +WARNING about using EasyGui with IDLE + +You may encounter problems using IDLE to run programs that use EasyGui. Try it +and find out. EasyGui is a collection of Tkinter routines that run their own +event loops. IDLE is also a Tkinter application, with its own event loop. The +two may conflict, with unpredictable results. If you find that you have +problems, try running your EasyGui program outside of IDLE. + +Note that EasyGui requires Tk release 8.0 or greater. + +@note: +LICENSE INFORMATION + +EasyGui version 0.96 + +Copyright (c) 2010, Stephen Raymond Ferg + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + + 3. The name of the author may not be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@note: +ABOUT THE EASYGUI LICENSE + +This license is what is generally known as the "modified BSD license", +aka "revised BSD", "new BSD", "3-clause BSD". +See http://www.opensource.org/licenses/bsd-license.php + +This license is GPL-compatible. +See http://en.wikipedia.org/wiki/License_compatibility +See http://www.gnu.org/licenses/license-list.html#GPLCompatibleLicenses + +The BSD License is less restrictive than GPL. +It allows software released under the license to be incorporated into proprietary products. +Works based on the software may be released under a proprietary license or as closed source software. +http://en.wikipedia.org/wiki/BSD_licenses#3-clause_license_.28.22New_BSD_License.22.29 + +""" +egversion = __doc__.split()[1] + +__all__ = ['ynbox' + , 'ccbox' + , 'boolbox' + , 'indexbox' + , 'msgbox' + , 'buttonbox' + , 'integerbox' + , 'multenterbox' + , 'enterbox' + , 'exceptionbox' + , 'choicebox' + , 'codebox' + , 'textbox' + , 'diropenbox' + , 'fileopenbox' + , 'filesavebox' + , 'passwordbox' + , 'multpasswordbox' + , 'multchoicebox' + , 'abouteasygui' + , 'egversion' + , 'egdemo' + , 'EgStore' + ] + +import sys, os +import string +import pickle +import traceback + + +#-------------------------------------------------- +# check python version and take appropriate action +#-------------------------------------------------- +""" +From the python documentation: + +sys.hexversion contains the version number encoded as a single integer. 
This is +guaranteed to increase with each version, including proper support for non- +production releases. For example, to test that the Python interpreter is at +least version 1.5.2, use: + +if sys.hexversion >= 0x010502F0: + # use some advanced feature + ... +else: + # use an alternative implementation or warn the user + ... +""" + + +if sys.hexversion >= 0x020600F0: + runningPython26 = True +else: + runningPython26 = False + +if sys.hexversion >= 0x030000F0: + runningPython3 = True +else: + runningPython3 = False + +try: + from PIL import Image as PILImage + from PIL import ImageTk as PILImageTk + PILisLoaded = True +except: + PILisLoaded = False + + +if runningPython3: + from tkinter import * + import tkinter.filedialog as tk_FileDialog + from io import StringIO +else: + from Tkinter import * + import tkFileDialog as tk_FileDialog + from StringIO import StringIO + +def write(*args): + args = [str(arg) for arg in args] + args = " ".join(args) + sys.stdout.write(args) + +def writeln(*args): + write(*args) + sys.stdout.write("\n") + +say = writeln + + +if TkVersion < 8.0 : + stars = "*"*75 + writeln("""\n\n\n""" + stars + """ +You are running Tk version: """ + str(TkVersion) + """ +You must be using Tk version 8.0 or greater to use EasyGui. +Terminating. 
+""" + stars + """\n\n\n""") + sys.exit(0) + +def dq(s): + return '"%s"' % s + +rootWindowPosition = "+300+200" + +PROPORTIONAL_FONT_FAMILY = ("MS", "Sans", "Serif") +MONOSPACE_FONT_FAMILY = ("Courier") + +PROPORTIONAL_FONT_SIZE = 10 +MONOSPACE_FONT_SIZE = 9 #a little smaller, because it it more legible at a smaller size +TEXT_ENTRY_FONT_SIZE = 12 # a little larger makes it easier to see + +#STANDARD_SELECTION_EVENTS = ["Return", "Button-1"] +STANDARD_SELECTION_EVENTS = ["Return", "Button-1", "space"] + +# Initialize some global variables that will be reset later +__choiceboxMultipleSelect = None +__widgetTexts = None +__replyButtonText = None +__choiceboxResults = None +__firstWidget = None +__enterboxText = None +__enterboxDefaultText="" +__multenterboxText = "" +choiceboxChoices = None +choiceboxWidget = None +entryWidget = None +boxRoot = None +ImageErrorMsg = ( + "\n\n---------------------------------------------\n" + "Error: %s\n%s") +#------------------------------------------------------------------- +# various boxes built on top of the basic buttonbox +#----------------------------------------------------------------------- + +#----------------------------------------------------------------------- +# ynbox +#----------------------------------------------------------------------- +def ynbox(msg="Shall I continue?" + , title=" " + , choices=("Yes", "No") + , image=None + ): + """ + Display a msgbox with choices of Yes and No. + + The default is "Yes". + + The returned value is calculated this way:: + if the first choice ("Yes") is chosen, or if the dialog is cancelled: + return 1 + else: + return 0 + + If invoked without a msg argument, displays a generic request for a confirmation + that the user wishes to continue. So it can be used this way:: + if ynbox(): pass # continue + else: sys.exit(0) # exit the program + + @arg msg: the msg to be displayed. 
+ @arg title: the window title + @arg choices: a list or tuple of the choices to be displayed + """ + return boolbox(msg, title, choices, image=image) + + +#----------------------------------------------------------------------- +# ccbox +#----------------------------------------------------------------------- +def ccbox(msg="Shall I continue?" + , title=" " + , choices=("Continue", "Cancel") + , image=None + ): + """ + Display a msgbox with choices of Continue and Cancel. + + The default is "Continue". + + The returned value is calculated this way:: + if the first choice ("Continue") is chosen, or if the dialog is cancelled: + return 1 + else: + return 0 + + If invoked without a msg argument, displays a generic request for a confirmation + that the user wishes to continue. So it can be used this way:: + + if ccbox(): + pass # continue + else: + sys.exit(0) # exit the program + + @arg msg: the msg to be displayed. + @arg title: the window title + @arg choices: a list or tuple of the choices to be displayed + """ + return boolbox(msg, title, choices, image=image) + + +#----------------------------------------------------------------------- +# boolbox +#----------------------------------------------------------------------- +def boolbox(msg="Shall I continue?" + , title=" " + , choices=("Yes","No") + , image=None + ): + """ + Display a boolean msgbox. + + The default is the first choice. + + The returned value is calculated this way:: + if the first choice is chosen, or if the dialog is cancelled: + returns 1 + else: + returns 0 + """ + reply = buttonbox(msg=msg, choices=choices, title=title, image=image) + if reply == choices[0]: return 1 + else: return 0 + + +#----------------------------------------------------------------------- +# indexbox +#----------------------------------------------------------------------- +def indexbox(msg="Shall I continue?" + , title=" " + , choices=("Yes","No") + , image=None + ): + """ + Display a buttonbox with the specified choices. 
+ Return the index of the choice selected. + """ + reply = buttonbox(msg=msg, choices=choices, title=title, image=image) + index = -1 + for choice in choices: + index = index + 1 + if reply == choice: return index + raise AssertionError( + "There is a program logic error in the EasyGui code for indexbox.") + + +#----------------------------------------------------------------------- +# msgbox +#----------------------------------------------------------------------- +def msgbox(msg="(Your message goes here)", title=" ", ok_button="OK",image=None,root=None): + """ + Display a messagebox + """ + if type(ok_button) != type("OK"): + raise AssertionError("The 'ok_button' argument to msgbox must be a string.") + + return buttonbox(msg=msg, title=title, choices=[ok_button], image=image,root=root) + + +#------------------------------------------------------------------- +# buttonbox +#------------------------------------------------------------------- +def buttonbox(msg="",title=" " + ,choices=("Button1", "Button2", "Button3") + , image=None + , root=None + ): + """ + Display a msg, a title, and a set of buttons. + The buttons are defined by the members of the choices list. + Return the text of the button that the user selected. + + @arg msg: the msg to be displayed. + @arg title: the window title + @arg choices: a list or tuple of the choices to be displayed + """ + global boxRoot, __replyButtonText, __widgetTexts, buttonsFrame + + + # Initialize __replyButtonText to the first choice. + # This is what will be used if the window is closed by the close button. 
+ __replyButtonText = choices[0] + + if root: + root.withdraw() + boxRoot = Toplevel(master=root) + boxRoot.withdraw() + else: + boxRoot = Tk() + boxRoot.withdraw() + + boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose ) + boxRoot.title(title) + boxRoot.iconname('Dialog') + boxRoot.geometry(rootWindowPosition) + boxRoot.minsize(400, 100) + + # ------------- define the messageFrame --------------------------------- + messageFrame = Frame(master=boxRoot) + messageFrame.pack(side=TOP, fill=BOTH) + + # ------------- define the imageFrame --------------------------------- + tk_Image = None + if image: + imageFilename = os.path.normpath(image) + junk,ext = os.path.splitext(imageFilename) + + if os.path.exists(imageFilename): + if ext.lower() in [".gif", ".pgm", ".ppm"]: + tk_Image = PhotoImage(master=boxRoot, file=imageFilename) + else: + if PILisLoaded: + try: + pil_Image = PILImage.open(imageFilename) + tk_Image = PILImageTk.PhotoImage(pil_Image, master=boxRoot) + except: + msg += ImageErrorMsg % (imageFilename, + "\nThe Python Imaging Library (PIL) could not convert this file to a displayable image." + "\n\nPIL reports:\n" + exception_format()) + + else: # PIL is not loaded + msg += ImageErrorMsg % (imageFilename, + "\nI could not import the Python Imaging Library (PIL) to display the image.\n\n" + "You may need to install PIL\n" + "(http://www.pythonware.com/products/pil/)\n" + "to display " + ext + " image files.") + + else: + msg += ImageErrorMsg % (imageFilename, "\nImage file not found.") + + if tk_Image: + imageFrame = Frame(master=boxRoot) + imageFrame.pack(side=TOP, fill=BOTH) + label = Label(imageFrame,image=tk_Image) + label.image = tk_Image # keep a reference! 
+ label.pack(side=TOP, expand=YES, fill=X, padx='1m', pady='1m') + + # ------------- define the buttonsFrame --------------------------------- + buttonsFrame = Frame(master=boxRoot) + buttonsFrame.pack(side=TOP, fill=BOTH) + + # -------------------- place the widgets in the frames ----------------------- + messageWidget = Message(messageFrame, text=msg, width=400) + messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) + messageWidget.pack(side=TOP, expand=YES, fill=X, padx='3m', pady='3m') + + __put_buttons_in_buttonframe(choices) + + # -------------- the action begins ----------- + # put the focus on the first button + __firstWidget.focus_force() + + boxRoot.deiconify() + boxRoot.mainloop() + boxRoot.destroy() + if root: root.deiconify() + return __replyButtonText + + +#------------------------------------------------------------------- +# integerbox +#------------------------------------------------------------------- +def integerbox(msg="" + , title=" " + , default="" + , lowerbound=0 + , upperbound=99 + , image = None + , root = None + , **invalidKeywordArguments + ): + """ + Show a box in which a user can enter an integer. + + In addition to arguments for msg and title, this function accepts + integer arguments for "default", "lowerbound", and "upperbound". + + The default argument may be None. + + When the user enters some text, the text is checked to verify that it + can be converted to an integer between the lowerbound and upperbound. + + If it can be, the integer (not the text) is returned. + + If it cannot, then an error msg is displayed, and the integerbox is + redisplayed. + + If the user cancels the operation, None is returned. + + NOTE that the "argLowerBound" and "argUpperBound" arguments are no longer + supported. They have been replaced by "upperbound" and "lowerbound". 
+ """ + if "argLowerBound" in invalidKeywordArguments: + raise AssertionError( + "\nintegerbox no longer supports the 'argLowerBound' argument.\n" + + "Use 'lowerbound' instead.\n\n") + if "argUpperBound" in invalidKeywordArguments: + raise AssertionError( + "\nintegerbox no longer supports the 'argUpperBound' argument.\n" + + "Use 'upperbound' instead.\n\n") + + if default != "": + if type(default) != type(1): + raise AssertionError( + "integerbox received a non-integer value for " + + "default of " + dq(str(default)) , "Error") + + if type(lowerbound) != type(1): + raise AssertionError( + "integerbox received a non-integer value for " + + "lowerbound of " + dq(str(lowerbound)) , "Error") + + if type(upperbound) != type(1): + raise AssertionError( + "integerbox received a non-integer value for " + + "upperbound of " + dq(str(upperbound)) , "Error") + + if msg == "": + msg = ("Enter an integer between " + str(lowerbound) + + " and " + + str(upperbound) + ) + + while 1: + reply = enterbox(msg, title, str(default), image=image, root=root) + if reply == None: return None + + try: + reply = int(reply) + except: + msgbox ("The value that you entered:\n\t%s\nis not an integer." % dq(str(reply)) + , "Error") + continue + + if reply < lowerbound: + msgbox ("The value that you entered is less than the lower bound of " + + str(lowerbound) + ".", "Error") + continue + + if reply > upperbound: + msgbox ("The value that you entered is greater than the upper bound of " + + str(upperbound) + ".", "Error") + continue + + # reply has passed all validation checks. + # It is an integer between the specified bounds. + return reply + +#------------------------------------------------------------------- +# multenterbox +#------------------------------------------------------------------- +def multenterbox(msg="Fill in values for the fields." + , title=" " + , fields=() + , values=() + ): + r""" + Show screen with multiple data entry fields. 
+
+    If there are fewer values than names, the list of values is padded with
+    empty strings until the number of values is the same as the number of names.
+
+    If there are more values than names, the list of values
+    is truncated so that there are as many values as names.
+
+    Returns a list of the values of the fields,
+    or None if the user cancels the operation.
+
+    Here is some example code, that shows how values returned from
+    multenterbox can be checked for validity before they are accepted::
+        ----------------------------------------------------------------------
+        msg = "Enter your personal information"
+        title = "Credit Card Application"
+        fieldNames = ["Name", "Street Address", "City", "State", "ZipCode"]
+        fieldValues = []  # we start with blanks for the values
+        fieldValues = multenterbox(msg, title, fieldNames)
+
+        # make sure that none of the fields was left blank
+        while 1:
+            if fieldValues == None: break
+            errmsg = ""
+            for i in range(len(fieldNames)):
+                if fieldValues[i].strip() == "":
+                    errmsg += ('"%s" is a required field.\n\n' % fieldNames[i])
+            if errmsg == "":
+                break  # no problems found
+            fieldValues = multenterbox(errmsg, title, fieldNames, fieldValues)
+
+        writeln("Reply was: %s" % str(fieldValues))
+        ----------------------------------------------------------------------
+
+    @arg msg: the msg to be displayed.
+    @arg title: the window title
+    @arg fields: a list of fieldnames.
+    @arg values: a list of field values
+    """
+    return __multfillablebox(msg, title, fields, values, None)
+
+
+#-----------------------------------------------------------------------
+# multpasswordbox
+#-----------------------------------------------------------------------
+def multpasswordbox(msg="Fill in values for the fields."
+    , title=" "
+    , fields=tuple()
+    , values=tuple()
+    ):
+    r"""
+    Same interface as multenterbox.  But in multpasswordbox,
+    the last of the fields is assumed to be a password, and
+    is masked with asterisks.
+
+    Example
+    =======
+
+    Here is some example code, that shows how values returned from
+    multpasswordbox can be checked for validity before they are accepted::
+        msg = "Enter logon information"
+        title = "Demo of multpasswordbox"
+        fieldNames = ["Server ID", "User ID", "Password"]
+        fieldValues = []  # we start with blanks for the values
+        fieldValues = multpasswordbox(msg, title, fieldNames)
+
+        # make sure that none of the fields was left blank
+        while 1:
+            if fieldValues == None: break
+            errmsg = ""
+            for i in range(len(fieldNames)):
+                if fieldValues[i].strip() == "":
+                    errmsg = errmsg + ('"%s" is a required field.\n\n' % fieldNames[i])
+            if errmsg == "": break  # no problems found
+            fieldValues = multpasswordbox(errmsg, title, fieldNames, fieldValues)
+
+        writeln("Reply was: %s" % str(fieldValues))
+    """
+    return __multfillablebox(msg, title, fields, values, "*")
+
+def bindArrows(widget):
+    widget.bind("<Down>", tabRight)
+    widget.bind("<Up>", tabLeft)
+
+    widget.bind("<Right>", tabRight)
+    widget.bind("<Left>", tabLeft)
+
+def tabRight(event):
+    boxRoot.event_generate("<Tab>")
+
+def tabLeft(event):
+    boxRoot.event_generate("<Shift-Tab>")
+
+#-----------------------------------------------------------------------
+# __multfillablebox
+#-----------------------------------------------------------------------
+def __multfillablebox(msg="Fill in values for the fields."
+ , title=" " + , fields=() + , values=() + , mask = None + ): + global boxRoot, __multenterboxText, __multenterboxDefaultText, cancelButton, entryWidget, okButton + + choices = ["OK", "Cancel"] + if len(fields) == 0: return None + + fields = list(fields[:]) # convert possible tuples to a list + values = list(values[:]) # convert possible tuples to a list + + if len(values) == len(fields): pass + elif len(values) > len(fields): + fields = fields[0:len(values)] + else: + while len(values) < len(fields): + values.append("") + + boxRoot = Tk() + + boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose ) + boxRoot.title(title) + boxRoot.iconname('Dialog') + boxRoot.geometry(rootWindowPosition) + boxRoot.bind("", __multenterboxCancel) + + # -------------------- put subframes in the boxRoot -------------------- + messageFrame = Frame(master=boxRoot) + messageFrame.pack(side=TOP, fill=BOTH) + + #-------------------- the msg widget ---------------------------- + messageWidget = Message(messageFrame, width="4.5i", text=msg) + messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) + messageWidget.pack(side=RIGHT, expand=1, fill=BOTH, padx='3m', pady='3m') + + global entryWidgets + entryWidgets = [] + + lastWidgetIndex = len(fields) - 1 + + for widgetIndex in range(len(fields)): + argFieldName = fields[widgetIndex] + argFieldValue = values[widgetIndex] + entryFrame = Frame(master=boxRoot) + entryFrame.pack(side=TOP, fill=BOTH) + + # --------- entryWidget ---------------------------------------------- + labelWidget = Label(entryFrame, text=argFieldName) + labelWidget.pack(side=LEFT) + + entryWidget = Entry(entryFrame, width=40,highlightthickness=2) + entryWidgets.append(entryWidget) + entryWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,TEXT_ENTRY_FONT_SIZE)) + entryWidget.pack(side=RIGHT, padx="3m") + + bindArrows(entryWidget) + + entryWidget.bind("", __multenterboxGetText) + entryWidget.bind("", __multenterboxCancel) + + # for the last 
entryWidget, if this is a multpasswordbox, + # show the contents as just asterisks + if widgetIndex == lastWidgetIndex: + if mask: + entryWidgets[widgetIndex].configure(show=mask) + + # put text into the entryWidget + entryWidgets[widgetIndex].insert(0,argFieldValue) + widgetIndex += 1 + + # ------------------ ok button ------------------------------- + buttonsFrame = Frame(master=boxRoot) + buttonsFrame.pack(side=BOTTOM, fill=BOTH) + + okButton = Button(buttonsFrame, takefocus=1, text="OK") + bindArrows(okButton) + okButton.pack(expand=1, side=LEFT, padx='3m', pady='3m', ipadx='2m', ipady='1m') + + # for the commandButton, bind activation events to the activation event handler + commandButton = okButton + handler = __multenterboxGetText + for selectionEvent in STANDARD_SELECTION_EVENTS: + commandButton.bind("<%s>" % selectionEvent, handler) + + + # ------------------ cancel button ------------------------------- + cancelButton = Button(buttonsFrame, takefocus=1, text="Cancel") + bindArrows(cancelButton) + cancelButton.pack(expand=1, side=RIGHT, padx='3m', pady='3m', ipadx='2m', ipady='1m') + + # for the commandButton, bind activation events to the activation event handler + commandButton = cancelButton + handler = __multenterboxCancel + for selectionEvent in STANDARD_SELECTION_EVENTS: + commandButton.bind("<%s>" % selectionEvent, handler) + + + # ------------------- time for action! ----------------- + entryWidgets[0].focus_force() # put the focus on the entryWidget + boxRoot.mainloop() # run it! 
+
+    # -------- after the run has completed ----------------------------------
+    boxRoot.destroy()  # button_click didn't destroy boxRoot, so we do it now
+    return __multenterboxText
+
+
+#-----------------------------------------------------------------------
+# __multenterboxGetText
+#-----------------------------------------------------------------------
+def __multenterboxGetText(event):
+    global __multenterboxText
+
+    __multenterboxText = []
+    for entryWidget in entryWidgets:
+        __multenterboxText.append(entryWidget.get())
+    boxRoot.quit()
+
+
+def __multenterboxCancel(event):
+    global __multenterboxText
+    __multenterboxText = None
+    boxRoot.quit()
+
+
+#-------------------------------------------------------------------
+# enterbox
+#-------------------------------------------------------------------
+def enterbox(msg="Enter something."
+    , title=" "
+    , default=""
+    , strip=True
+    , image=None
+    , root=None
+    ):
+    """
+    Show a box in which a user can enter some text.
+
+    You may optionally specify some default text, which will appear in the
+    enterbox when it is displayed.
+
+    Returns the text that the user entered, or None if he cancels the operation.
+
+    By default, enterbox strips its result (i.e. removes leading and trailing
+    whitespace).  (If you want it not to strip, use keyword argument: strip=False.)
+    This makes it easier to test the results of the call::
+
+        reply = enterbox(....)
+        if reply:
+            ...
+        else:
+            ...
+    """
+    result = __fillablebox(msg, title, default=default, mask=None, image=image, root=root)
+    if result and strip:
+        result = result.strip()
+    return result
+
+
+def passwordbox(msg="Enter your password."
+    , title=" "
+    , default=""
+    , image=None
+    , root=None
+    ):
+    """
+    Show a box in which a user can enter a password.
+    The text is masked with asterisks, so the password is not displayed.
+    Returns the text that the user entered, or None if he cancels the operation.
+ """ + return __fillablebox(msg, title, default, mask="*",image=image,root=root) + + +def __fillablebox(msg + , title="" + , default="" + , mask=None + , image=None + , root=None + ): + """ + Show a box in which a user can enter some text. + You may optionally specify some default text, which will appear in the + enterbox when it is displayed. + Returns the text that the user entered, or None if he cancels the operation. + """ + + global boxRoot, __enterboxText, __enterboxDefaultText + global cancelButton, entryWidget, okButton + + if title == None: title == "" + if default == None: default = "" + __enterboxDefaultText = default + __enterboxText = __enterboxDefaultText + + if root: + root.withdraw() + boxRoot = Toplevel(master=root) + boxRoot.withdraw() + else: + boxRoot = Tk() + boxRoot.withdraw() + + boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose ) + boxRoot.title(title) + boxRoot.iconname('Dialog') + boxRoot.geometry(rootWindowPosition) + boxRoot.bind("", __enterboxCancel) + + # ------------- define the messageFrame --------------------------------- + messageFrame = Frame(master=boxRoot) + messageFrame.pack(side=TOP, fill=BOTH) + + # ------------- define the imageFrame --------------------------------- + tk_Image = None + if image: + imageFilename = os.path.normpath(image) + junk,ext = os.path.splitext(imageFilename) + + if os.path.exists(imageFilename): + if ext.lower() in [".gif", ".pgm", ".ppm"]: + tk_Image = PhotoImage(master=boxRoot, file=imageFilename) + else: + if PILisLoaded: + try: + pil_Image = PILImage.open(imageFilename) + tk_Image = PILImageTk.PhotoImage(pil_Image, master=boxRoot) + except: + msg += ImageErrorMsg % (imageFilename, + "\nThe Python Imaging Library (PIL) could not convert this file to a displayable image." 
+ "\n\nPIL reports:\n" + exception_format()) + + else: # PIL is not loaded + msg += ImageErrorMsg % (imageFilename, + "\nI could not import the Python Imaging Library (PIL) to display the image.\n\n" + "You may need to install PIL\n" + "(http://www.pythonware.com/products/pil/)\n" + "to display " + ext + " image files.") + + else: + msg += ImageErrorMsg % (imageFilename, "\nImage file not found.") + + if tk_Image: + imageFrame = Frame(master=boxRoot) + imageFrame.pack(side=TOP, fill=BOTH) + label = Label(imageFrame,image=tk_Image) + label.image = tk_Image # keep a reference! + label.pack(side=TOP, expand=YES, fill=X, padx='1m', pady='1m') + + # ------------- define the buttonsFrame --------------------------------- + buttonsFrame = Frame(master=boxRoot) + buttonsFrame.pack(side=TOP, fill=BOTH) + + + # ------------- define the entryFrame --------------------------------- + entryFrame = Frame(master=boxRoot) + entryFrame.pack(side=TOP, fill=BOTH) + + # ------------- define the buttonsFrame --------------------------------- + buttonsFrame = Frame(master=boxRoot) + buttonsFrame.pack(side=TOP, fill=BOTH) + + #-------------------- the msg widget ---------------------------- + messageWidget = Message(messageFrame, width="4.5i", text=msg) + messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) + messageWidget.pack(side=RIGHT, expand=1, fill=BOTH, padx='3m', pady='3m') + + # --------- entryWidget ---------------------------------------------- + entryWidget = Entry(entryFrame, width=40) + bindArrows(entryWidget) + entryWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,TEXT_ENTRY_FONT_SIZE)) + if mask: + entryWidget.configure(show=mask) + entryWidget.pack(side=LEFT, padx="3m") + entryWidget.bind("", __enterboxGetText) + entryWidget.bind("", __enterboxCancel) + # put text into the entryWidget + entryWidget.insert(0,__enterboxDefaultText) + + # ------------------ ok button ------------------------------- + okButton = Button(buttonsFrame, takefocus=1, 
text="OK") + bindArrows(okButton) + okButton.pack(expand=1, side=LEFT, padx='3m', pady='3m', ipadx='2m', ipady='1m') + + # for the commandButton, bind activation events to the activation event handler + commandButton = okButton + handler = __enterboxGetText + for selectionEvent in STANDARD_SELECTION_EVENTS: + commandButton.bind("<%s>" % selectionEvent, handler) + + + # ------------------ cancel button ------------------------------- + cancelButton = Button(buttonsFrame, takefocus=1, text="Cancel") + bindArrows(cancelButton) + cancelButton.pack(expand=1, side=RIGHT, padx='3m', pady='3m', ipadx='2m', ipady='1m') + + # for the commandButton, bind activation events to the activation event handler + commandButton = cancelButton + handler = __enterboxCancel + for selectionEvent in STANDARD_SELECTION_EVENTS: + commandButton.bind("<%s>" % selectionEvent, handler) + + # ------------------- time for action! ----------------- + entryWidget.focus_force() # put the focus on the entryWidget + boxRoot.deiconify() + boxRoot.mainloop() # run it! + + # -------- after the run has completed ---------------------------------- + if root: root.deiconify() + boxRoot.destroy() # button_click didn't destroy boxRoot, so we do it now + return __enterboxText + + +def __enterboxGetText(event): + global __enterboxText + + __enterboxText = entryWidget.get() + boxRoot.quit() + + +def __enterboxRestore(event): + global entryWidget + + entryWidget.delete(0,len(entryWidget.get())) + entryWidget.insert(0, __enterboxDefaultText) + + +def __enterboxCancel(event): + global __enterboxText + + __enterboxText = None + boxRoot.quit() + +def denyWindowManagerClose(): + """ don't allow WindowManager close + """ + x = Tk() + x.withdraw() + x.bell() + x.destroy() + + + +#------------------------------------------------------------------- +# multchoicebox +#------------------------------------------------------------------- +def multchoicebox(msg="Pick as many items as you like." 
+ , title=" " + , choices=() + , **kwargs + ): + """ + Present the user with a list of choices. + allow him to select multiple items and return them in a list. + if the user doesn't choose anything from the list, return the empty list. + return None if he cancelled selection. + + @arg msg: the msg to be displayed. + @arg title: the window title + @arg choices: a list or tuple of the choices to be displayed + """ + if len(choices) == 0: choices = ["Program logic error - no choices were specified."] + + global __choiceboxMultipleSelect + __choiceboxMultipleSelect = 1 + return __choicebox(msg, title, choices) + + +#----------------------------------------------------------------------- +# choicebox +#----------------------------------------------------------------------- +def choicebox(msg="Pick something." + , title=" " + , choices=() + ): + """ + Present the user with a list of choices. + return the choice that he selects. + return None if he cancels the selection selection. + + @arg msg: the msg to be displayed. + @arg title: the window title + @arg choices: a list or tuple of the choices to be displayed + """ + if len(choices) == 0: choices = ["Program logic error - no choices were specified."] + + global __choiceboxMultipleSelect + __choiceboxMultipleSelect = 0 + return __choicebox(msg,title,choices) + + +#----------------------------------------------------------------------- +# __choicebox +#----------------------------------------------------------------------- +def __choicebox(msg + , title + , choices + ): + """ + internal routine to support choicebox() and multchoicebox() + """ + global boxRoot, __choiceboxResults, choiceboxWidget, defaultText + global choiceboxWidget, choiceboxChoices + #------------------------------------------------------------------- + # If choices is a tuple, we make it a list so we can sort it. 
+    # If choices is already a list, we make a new list, so that when
+    # we sort the choices, we don't affect the list object that we
+    # were given.
+    #-------------------------------------------------------------------
+    choices = list(choices[:])
+    if len(choices) == 0:
+        choices = ["Program logic error - no choices were specified."]
+    defaultButtons = ["OK", "Cancel"]
+
+    # make sure all choices are strings
+    for index in range(len(choices)):
+        choices[index] = str(choices[index])
+
+    lines_to_show = min(len(choices), 20)
+
+    if title is None: title = ""
+
+    # Initialize __choiceboxResults
+    # This is the value that will be returned if the user clicks the close icon
+    __choiceboxResults = None
+
+    boxRoot = Tk()
+    boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose)
+    screen_width = boxRoot.winfo_screenwidth()
+    screen_height = boxRoot.winfo_screenheight()
+    root_width = int((screen_width * 0.8))
+    root_height = int((screen_height * 0.5))
+    root_xpos = int((screen_width * 0.1))
+    root_ypos = int((screen_height * 0.05))
+
+    boxRoot.title(title)
+    boxRoot.iconname('Dialog')
+    rootWindowPosition = "+0+0"
+    boxRoot.geometry(rootWindowPosition)
+    boxRoot.expand = NO
+    boxRoot.minsize(root_width, root_height)
+    rootWindowPosition = "+" + str(root_xpos) + "+" + str(root_ypos)
+    boxRoot.geometry(rootWindowPosition)
+
+    # ---------------- put the frames in the window -----------------------------------------
+    message_and_buttonsFrame = Frame(master=boxRoot)
+    message_and_buttonsFrame.pack(side=TOP, fill=X, expand=NO)
+
+    messageFrame = Frame(message_and_buttonsFrame)
+    messageFrame.pack(side=LEFT, fill=X, expand=YES)
+
+    buttonsFrame = Frame(message_and_buttonsFrame)
+    buttonsFrame.pack(side=RIGHT, expand=NO, pady=0)
+
+    choiceboxFrame = Frame(master=boxRoot)
+    choiceboxFrame.pack(side=BOTTOM, fill=BOTH, expand=YES)
+
+    # -------------------------- put the widgets in the frames ------------------------------
+
+    # ---------- put a msg widget in the msg frame-------------------
+    messageWidget = Message(messageFrame, anchor=NW, text=msg, width=int(root_width * 0.9))
+    messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY, PROPORTIONAL_FONT_SIZE))
+    messageWidget.pack(side=LEFT, expand=YES, fill=BOTH, padx='1m', pady='1m')
+
+    # -------- put the choiceboxWidget in the choiceboxFrame ---------------------------
+    choiceboxWidget = Listbox(choiceboxFrame
+        , height=lines_to_show
+        , borderwidth="1m"
+        , relief="flat"
+        , bg="white"
+        )
+
+    if __choiceboxMultipleSelect:
+        choiceboxWidget.configure(selectmode=MULTIPLE)
+
+    choiceboxWidget.configure(font=(PROPORTIONAL_FONT_FAMILY, PROPORTIONAL_FONT_SIZE))
+
+    # add a vertical scrollbar to the frame
+    rightScrollbar = Scrollbar(choiceboxFrame, orient=VERTICAL, command=choiceboxWidget.yview)
+    choiceboxWidget.configure(yscrollcommand=rightScrollbar.set)
+
+    # add a horizontal scrollbar to the frame
+    bottomScrollbar = Scrollbar(choiceboxFrame, orient=HORIZONTAL, command=choiceboxWidget.xview)
+    choiceboxWidget.configure(xscrollcommand=bottomScrollbar.set)
+
+    # pack the Listbox and the scrollbars.  Note that although we must define
+    # the Listbox first, we must pack it last, so that the bottomScrollbar will
+    # be located properly.
+
+    bottomScrollbar.pack(side=BOTTOM, fill=X)
+    rightScrollbar.pack(side=RIGHT, fill=Y)
+
+    choiceboxWidget.pack(side=LEFT, padx="1m", pady="1m", expand=YES, fill=BOTH)
+
+    #---------------------------------------------------
+    # sort the choices
+    # eliminate duplicates
+    # put the choices into the choiceboxWidget
+    #---------------------------------------------------
+    for index in range(len(choices)):
+        choices[index] = str(choices[index])
+
+    if runningPython3:
+        choices.sort(key=str.lower)
+    else:
+        choices.sort(lambda x, y: cmp(x.lower(), y.lower()))  # case-insensitive sort
+
+    lastInserted = None
+    choiceboxChoices = []
+    for choice in choices:
+        if choice == lastInserted: pass
+        else:
+            choiceboxWidget.insert(END, choice)
+            choiceboxChoices.append(choice)
+            lastInserted = choice
+
+    boxRoot.bind('<Any-Key>', KeyboardListener)
+
+    # put the buttons in the buttonsFrame
+    if len(choices) > 0:
+        okButton = Button(buttonsFrame, takefocus=YES, text="OK", height=1, width=6)
+        bindArrows(okButton)
+        okButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m")
+
+        # for the commandButton, bind activation events to the activation event handler
+        commandButton = okButton
+        handler = __choiceboxGetChoice
+        for selectionEvent in STANDARD_SELECTION_EVENTS:
+            commandButton.bind("<%s>" % selectionEvent, handler)
+
+        # now bind the keyboard events
+        choiceboxWidget.bind("<Return>", __choiceboxGetChoice)
+        choiceboxWidget.bind("<Double-Button-1>", __choiceboxGetChoice)
+    else:
+        # now bind the keyboard events
+        choiceboxWidget.bind("<Return>", __choiceboxCancel)
+        choiceboxWidget.bind("<Double-Button-1>", __choiceboxCancel)
+
+    cancelButton = Button(buttonsFrame, takefocus=YES, text="Cancel", height=1, width=6)
+    bindArrows(cancelButton)
+    cancelButton.pack(expand=NO, side=BOTTOM, padx='2m', pady='1m', ipady="1m", ipadx="2m")
+
+    # for the commandButton, bind activation events to the activation event handler
+    commandButton = cancelButton
+    handler = __choiceboxCancel
+    for selectionEvent in STANDARD_SELECTION_EVENTS:
+        commandButton.bind("<%s>" % selectionEvent, handler)
+
+    # add special buttons for multiple select features
+    if len(choices) > 0 and __choiceboxMultipleSelect:
+        selectionButtonsFrame = Frame(messageFrame)
+        selectionButtonsFrame.pack(side=RIGHT, fill=Y, expand=NO)
+
+        selectAllButton = Button(selectionButtonsFrame, text="Select All", height=1, width=6)
+        bindArrows(selectAllButton)
+        selectAllButton.bind("<Button-1>", __choiceboxSelectAll)
+        selectAllButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m")
+
+        clearAllButton = Button(selectionButtonsFrame, text="Clear All", height=1, width=6)
+        bindArrows(clearAllButton)
+        clearAllButton.bind("<Button-1>", __choiceboxClearAll)
+        clearAllButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m")
+
+    # -------------------- bind some keyboard events ----------------------------
+    boxRoot.bind("<Escape>", __choiceboxCancel)
+
+    # --------------------- the action begins -----------------------------------
+    # put the focus on the choiceboxWidget, and the select highlight on the first item
+    choiceboxWidget.select_set(0)
+    choiceboxWidget.focus_force()
+
+    # --- run it! -----
+    boxRoot.mainloop()
+
+    boxRoot.destroy()
+    return __choiceboxResults
+
+
+def __choiceboxGetChoice(event):
+    global boxRoot, __choiceboxResults, choiceboxWidget
+
+    if __choiceboxMultipleSelect:
+        __choiceboxResults = [choiceboxWidget.get(index) for index in choiceboxWidget.curselection()]
+    else:
+        choice_index = choiceboxWidget.curselection()
+        __choiceboxResults = choiceboxWidget.get(choice_index)
+
+    boxRoot.quit()
+
+
+def __choiceboxSelectAll(event):
+    global choiceboxWidget, choiceboxChoices
+
+    choiceboxWidget.selection_set(0, len(choiceboxChoices)-1)
+
+def __choiceboxClearAll(event):
+    global choiceboxWidget, choiceboxChoices
+
+    choiceboxWidget.selection_clear(0, len(choiceboxChoices)-1)
+
+
+def __choiceboxCancel(event):
+    global boxRoot, __choiceboxResults
+
+    __choiceboxResults = None
+    boxRoot.quit()
+
+
+def KeyboardListener(event):
+    global choiceboxChoices, choiceboxWidget
+    key = event.keysym
+    if len(key) <= 1:
+        if key in string.printable:
+            # Find the key in the list.
+            # before we clear the list, remember the selected member
+            try:
+                start_n = int(choiceboxWidget.curselection()[0])
+            except IndexError:
+                start_n = -1
+
+            ## clear the selection.
+            choiceboxWidget.selection_clear(0, 'end')
+
+            ## start from previous selection +1
+            for n in range(start_n+1, len(choiceboxChoices)):
+                item = choiceboxChoices[n]
+                if item[0].lower() == key.lower():
+                    choiceboxWidget.selection_set(first=n)
+                    choiceboxWidget.see(n)
+                    return
+            else:
+                # has not found it, so loop from the top
+                for n in range(len(choiceboxChoices)):
+                    item = choiceboxChoices[n]
+                    if item[0].lower() == key.lower():
+                        choiceboxWidget.selection_set(first=n)
+                        choiceboxWidget.see(n)
+                        return
+
+                # nothing matched -- we'll look for the next logical choice
+                for n in range(len(choiceboxChoices)):
+                    item = choiceboxChoices[n]
+                    if item[0].lower() > key.lower():
+                        if n > 0:
+                            choiceboxWidget.selection_set(first=(n-1))
+                        else:
+                            choiceboxWidget.selection_set(first=0)
+                        choiceboxWidget.see(n)
+                        return
+
+                # still no match (nothing was greater than the key),
+                # so we set the selection to the last item in the list
+                lastIndex = len(choiceboxChoices)-1
+                choiceboxWidget.selection_set(first=lastIndex)
+                choiceboxWidget.see(lastIndex)
+                return
+
+#-----------------------------------------------------------------------
+# exception_format
+#-----------------------------------------------------------------------
+def exception_format():
+    """
+    Convert exception info into a string suitable for display.
+    """
+    return "".join(traceback.format_exception(
+        sys.exc_info()[0]
+        , sys.exc_info()[1]
+        , sys.exc_info()[2]
+        ))
+
+#-----------------------------------------------------------------------
+# exceptionbox
+#-----------------------------------------------------------------------
+def exceptionbox(msg=None, title=None):
+    """
+    Display a box that gives information about
+    an exception that has just been raised.
+
+    The caller may optionally pass in a title for the window, or a
+    msg to accompany the error information.
+
+    Note that you do not need to (and cannot) pass an exception object
+    as an argument.  The latest exception will automatically be used.
+ """ + if title == None: title = "Error Report" + if msg == None: + msg = "An error (exception) has occurred in the program." + + codebox(msg, title, exception_format()) + +#------------------------------------------------------------------- +# codebox +#------------------------------------------------------------------- + +def codebox(msg="" + , title=" " + , text="" + ): + """ + Display some text in a monospaced font, with no line wrapping. + This function is suitable for displaying code and text that is + formatted using spaces. + + The text parameter should be a string, or a list or tuple of lines to be + displayed in the textbox. + """ + return textbox(msg, title, text, codebox=1 ) + +#------------------------------------------------------------------- +# textbox +#------------------------------------------------------------------- +def textbox(msg="" + , title=" " + , text="" + , codebox=0 + ): + """ + Display some text in a proportional font with line wrapping at word breaks. + This function is suitable for displaying general written text. + + The text parameter should be a string, or a list or tuple of lines to be + displayed in the textbox. 
+ """ + + if msg == None: msg = "" + if title == None: title = "" + + global boxRoot, __replyButtonText, __widgetTexts, buttonsFrame + global rootWindowPosition + choices = ["OK"] + __replyButtonText = choices[0] + + + boxRoot = Tk() + + boxRoot.protocol('WM_DELETE_WINDOW', denyWindowManagerClose ) + + screen_width = boxRoot.winfo_screenwidth() + screen_height = boxRoot.winfo_screenheight() + root_width = int((screen_width * 0.8)) + root_height = int((screen_height * 0.5)) + root_xpos = int((screen_width * 0.1)) + root_ypos = int((screen_height * 0.05)) + + boxRoot.title(title) + boxRoot.iconname('Dialog') + rootWindowPosition = "+0+0" + boxRoot.geometry(rootWindowPosition) + boxRoot.expand=NO + boxRoot.minsize(root_width, root_height) + rootWindowPosition = "+" + str(root_xpos) + "+" + str(root_ypos) + boxRoot.geometry(rootWindowPosition) + + mainframe = Frame(master=boxRoot) + mainframe.pack(side=TOP, fill=BOTH, expand=YES) + + # ---- put frames in the window ----------------------------------- + # we pack the textboxFrame first, so it will expand first + textboxFrame = Frame(mainframe, borderwidth=3) + textboxFrame.pack(side=BOTTOM , fill=BOTH, expand=YES) + + message_and_buttonsFrame = Frame(mainframe) + message_and_buttonsFrame.pack(side=TOP, fill=X, expand=NO) + + messageFrame = Frame(message_and_buttonsFrame) + messageFrame.pack(side=LEFT, fill=X, expand=YES) + + buttonsFrame = Frame(message_and_buttonsFrame) + buttonsFrame.pack(side=RIGHT, expand=NO) + + # -------------------- put widgets in the frames -------------------- + + # put a textArea in the top frame + if codebox: + character_width = int((root_width * 0.6) / MONOSPACE_FONT_SIZE) + textArea = Text(textboxFrame,height=25,width=character_width, padx="2m", pady="1m") + textArea.configure(wrap=NONE) + textArea.configure(font=(MONOSPACE_FONT_FAMILY, MONOSPACE_FONT_SIZE)) + + else: + character_width = int((root_width * 0.6) / MONOSPACE_FONT_SIZE) + textArea = Text( + textboxFrame + , height=25 + , 
width=character_width + , padx="2m" + , pady="1m" + ) + textArea.configure(wrap=WORD) + textArea.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) + + + # some simple keybindings for scrolling + mainframe.bind("" , textArea.yview_scroll( 1,PAGES)) + mainframe.bind("", textArea.yview_scroll(-1,PAGES)) + + mainframe.bind("", textArea.xview_scroll( 1,PAGES)) + mainframe.bind("" , textArea.xview_scroll(-1,PAGES)) + + mainframe.bind("", textArea.yview_scroll( 1,UNITS)) + mainframe.bind("" , textArea.yview_scroll(-1,UNITS)) + + + # add a vertical scrollbar to the frame + rightScrollbar = Scrollbar(textboxFrame, orient=VERTICAL, command=textArea.yview) + textArea.configure(yscrollcommand = rightScrollbar.set) + + # add a horizontal scrollbar to the frame + bottomScrollbar = Scrollbar(textboxFrame, orient=HORIZONTAL, command=textArea.xview) + textArea.configure(xscrollcommand = bottomScrollbar.set) + + # pack the textArea and the scrollbars. Note that although we must define + # the textArea first, we must pack it last, so that the bottomScrollbar will + # be located properly. + + # Note that we need a bottom scrollbar only for code. + # Text will be displayed with wordwrap, so we don't need to have a horizontal + # scroll for it. 
+ if codebox: + bottomScrollbar.pack(side=BOTTOM, fill=X) + rightScrollbar.pack(side=RIGHT, fill=Y) + + textArea.pack(side=LEFT, fill=BOTH, expand=YES) + + + # ---------- put a msg widget in the msg frame------------------- + messageWidget = Message(messageFrame, anchor=NW, text=msg, width=int(root_width * 0.9)) + messageWidget.configure(font=(PROPORTIONAL_FONT_FAMILY,PROPORTIONAL_FONT_SIZE)) + messageWidget.pack(side=LEFT, expand=YES, fill=BOTH, padx='1m', pady='1m') + + # put the buttons in the buttonsFrame + okButton = Button(buttonsFrame, takefocus=YES, text="OK", height=1, width=6) + okButton.pack(expand=NO, side=TOP, padx='2m', pady='1m', ipady="1m", ipadx="2m") + + # for the commandButton, bind activation events to the activation event handler + commandButton = okButton + handler = __textboxOK + for selectionEvent in ["Return","Button-1","Escape"]: + commandButton.bind("<%s>" % selectionEvent, handler) + + + # ----------------- the action begins ---------------------------------------- + try: + # load the text into the textArea + if type(text) == type("abc"): pass + else: + try: + text = "".join(text) # convert a list or a tuple to a string + except: + msgbox("Exception when trying to convert "+ str(type(text)) + " to text in textArea") + sys.exit(16) + textArea.insert(END,text, "normal") + + except: + msgbox("Exception when trying to load the textArea.") + sys.exit(16) + + try: + okButton.focus_force() + except: + msgbox("Exception when trying to put focus on okButton.") + sys.exit(16) + + boxRoot.mainloop() + + # this line MUST go before the line that destroys boxRoot + areaText = textArea.get(0.0,END) + boxRoot.destroy() + return areaText # return __replyButtonText + +#------------------------------------------------------------------- +# __textboxOK +#------------------------------------------------------------------- +def __textboxOK(event): + global boxRoot + boxRoot.quit() + + + +#------------------------------------------------------------------- +# 
diropenbox +#------------------------------------------------------------------- +def diropenbox(msg=None + , title=None + , default=None + ): + """ + A dialog to get a directory name. + Note that the msg argument, if specified, is ignored. + + Returns the name of a directory, or None if user chose to cancel. + + If the "default" argument specifies a directory name, and that + directory exists, then the dialog box will start with that directory. + """ + title=getFileDialogTitle(msg,title) + localRoot = Tk() + localRoot.withdraw() + if not default: default = None + f = tk_FileDialog.askdirectory( + parent=localRoot + , title=title + , initialdir=default + , initialfile=None + ) + localRoot.destroy() + if not f: return None + return os.path.normpath(f) + + + +#------------------------------------------------------------------- +# getFileDialogTitle +#------------------------------------------------------------------- +def getFileDialogTitle(msg + , title + ): + if msg and title: return "%s - %s" % (title,msg) + if msg and not title: return str(msg) + if title and not msg: return str(title) + return None # no message and no title + +#------------------------------------------------------------------- +# class FileTypeObject for use with fileopenbox +#------------------------------------------------------------------- +class FileTypeObject: + def __init__(self,filemask): + if len(filemask) == 0: + raise AssertionError('Filetype argument is empty.') + + self.masks = [] + + if type(filemask) == type("abc"): # a string + self.initializeFromString(filemask) + + elif type(filemask) == type([]): # a list + if len(filemask) < 2: + raise AssertionError('Invalid filemask.\n' + +'List contains less than 2 members: "%s"' % filemask) + else: + self.name = filemask[-1] + self.masks = list(filemask[:-1] ) + else: + raise AssertionError('Invalid filemask: "%s"' % filemask) + + def __eq__(self,other): + if self.name == other.name: return True + return False + + def add(self,other): + 
for mask in other.masks: + if mask in self.masks: pass + else: self.masks.append(mask) + + def toTuple(self): + return (self.name,tuple(self.masks)) + + def isAll(self): + if self.name == "All files": return True + return False + + def initializeFromString(self, filemask): + # remove everything except the extension from the filemask + self.ext = os.path.splitext(filemask)[1] + if self.ext == "" : self.ext = ".*" + if self.ext == ".": self.ext = ".*" + self.name = self.getName() + self.masks = ["*" + self.ext] + + def getName(self): + e = self.ext + if e == ".*" : return "All files" + if e == ".txt": return "Text files" + if e == ".py" : return "Python files" + if e == ".pyc" : return "Python files" + if e == ".xls": return "Excel files" + if e.startswith("."): + return e[1:].upper() + " files" + return e.upper() + " files" + + +#------------------------------------------------------------------- +# fileopenbox +#------------------------------------------------------------------- +def fileopenbox(msg=None + , title=None + , default="*" + , filetypes=None + ): + """ + A dialog to get a file name. + + About the "default" argument + ============================ + The "default" argument specifies a filepath that (normally) + contains one or more wildcards. + fileopenbox will display only files that match the default filepath. + If omitted, defaults to "*" (all files in the current directory). + + WINDOWS EXAMPLE:: + ...default="c:/myjunk/*.py" + will open in directory c:\myjunk\ and show all Python files. + + WINDOWS EXAMPLE:: + ...default="c:/myjunk/test*.py" + will open in directory c:\myjunk\ and show all Python files + whose names begin with "test". + + + Note that on Windows, fileopenbox automatically changes the path + separator to the Windows path separator (backslash). + + About the "filetypes" argument + ============================== + If specified, it should contain a list of items, + where each item is either:: + - a string containing a filemask # e.g. 
"*.txt" + - a list of strings, where all of the strings except the last one + are filemasks (each beginning with "*.", + such as "*.txt" for text files, "*.py" for Python files, etc.). + and the last string contains a filetype description + + EXAMPLE:: + filetypes = ["*.css", ["*.htm", "*.html", "HTML files"] ] + + NOTE THAT + ========= + + If the filetypes list does not contain ("All files","*"), + it will be added. + + If the filetypes list does not contain a filemask that includes + the extension of the "default" argument, it will be added. + For example, if default="*abc.py" + and no filetypes argument was specified, then + "*.py" will automatically be added to the filetypes argument. + + @rtype: string or None + @return: the name of a file, or None if user chose to cancel + + @arg msg: the msg to be displayed. + @arg title: the window title + @arg default: filepath with wildcards + @arg filetypes: filemasks that a user can choose, e.g. "*.txt" + """ + localRoot = Tk() + localRoot.withdraw() + + initialbase, initialfile, initialdir, filetypes = fileboxSetup(default,filetypes) + + #------------------------------------------------------------ + # if initialfile contains no wildcards; we don't want an + # initial file. It won't be used anyway. + # Also: if initialbase is simply "*", we don't want an + # initialfile; it is not doing any useful work. 
+ #------------------------------------------------------------ + if (initialfile.find("*") < 0) and (initialfile.find("?") < 0): + initialfile = None + elif initialbase == "*": + initialfile = None + + f = tk_FileDialog.askopenfilename(parent=localRoot + , title=getFileDialogTitle(msg,title) + , initialdir=initialdir + , initialfile=initialfile + , filetypes=filetypes + ) + + localRoot.destroy() + + if not f: return None + return os.path.normpath(f) + + +#------------------------------------------------------------------- +# filesavebox +#------------------------------------------------------------------- +def filesavebox(msg=None + , title=None + , default="" + , filetypes=None + ): + """ + A dialog to get the name of a file to save. + Returns the name of a file, or None if user chose to cancel. + + The "default" argument should contain a filename (i.e. the + current name of the file to be saved). It may also be empty, + or contain a filemask that includes wildcards. + + The "filetypes" argument works like the "filetypes" argument to + fileopenbox. + """ + + localRoot = Tk() + localRoot.withdraw() + + initialbase, initialfile, initialdir, filetypes = fileboxSetup(default,filetypes) + + f = tk_FileDialog.asksaveasfilename(parent=localRoot + , title=getFileDialogTitle(msg,title) + , initialfile=initialfile + , initialdir=initialdir + , filetypes=filetypes + ) + localRoot.destroy() + if not f: return None + return os.path.normpath(f) + + +#------------------------------------------------------------------- +# +# fileboxSetup +# +#------------------------------------------------------------------- +def fileboxSetup(default,filetypes): + if not default: default = os.path.join(".","*") + initialdir, initialfile = os.path.split(default) + if not initialdir : initialdir = "."
+ if not initialfile: initialfile = "*" + initialbase, initialext = os.path.splitext(initialfile) + initialFileTypeObject = FileTypeObject(initialfile) + + allFileTypeObject = FileTypeObject("*") + ALL_filetypes_was_specified = False + + if not filetypes: filetypes= [] + filetypeObjects = [] + + for filemask in filetypes: + fto = FileTypeObject(filemask) + + if fto.isAll(): + ALL_filetypes_was_specified = True # remember this + + if fto == initialFileTypeObject: + initialFileTypeObject.add(fto) # add fto to initialFileTypeObject + else: + filetypeObjects.append(fto) + + #------------------------------------------------------------------ + # make sure that the list of filetypes includes the ALL FILES type. + #------------------------------------------------------------------ + if ALL_filetypes_was_specified: + pass + elif allFileTypeObject == initialFileTypeObject: + pass + else: + filetypeObjects.insert(0,allFileTypeObject) + #------------------------------------------------------------------ + # Make sure that the list includes the initialFileTypeObject + # in the position in the list that will make it the default. + # This changed between Python version 2.5 and 2.6 + #------------------------------------------------------------------ + if len(filetypeObjects) == 0: + filetypeObjects.append(initialFileTypeObject) + + if initialFileTypeObject in (filetypeObjects[0], filetypeObjects[-1]): + pass + else: + if runningPython26: + filetypeObjects.append(initialFileTypeObject) + else: + filetypeObjects.insert(0,initialFileTypeObject) + + filetypes = [fto.toTuple() for fto in filetypeObjects] + + return initialbase, initialfile, initialdir, filetypes + +#------------------------------------------------------------------- +# utility routines +#------------------------------------------------------------------- +# These routines are used by several other functions in the EasyGui module. 
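The FileTypeObject and fileboxSetup code above normalizes user-supplied filemasks into the (description, masks) tuples that tkinter's file dialogs expect. A minimal standalone sketch of the filemask-to-tuple step, using a hypothetical helper name `mask_to_tuple` (the real code additionally merges duplicate masks and ensures an "All files" entry is present):

```python
import os

# Hypothetical standalone helper mirroring FileTypeObject.initializeFromString
# and FileTypeObject.toTuple above; not part of easygui's actual API.
def mask_to_tuple(filemask):
    """Turn a filemask such as '*.py' into a (description, masks) tuple
    suitable for tkinter.filedialog's 'filetypes' option."""
    ext = os.path.splitext(filemask)[1]   # keep only the extension
    if ext in ("", "."):
        ext = ".*"                        # no usable extension: match everything
    names = {".*": "All files", ".txt": "Text files",
             ".py": "Python files", ".pyc": "Python files",
             ".xls": "Excel files"}
    name = names.get(ext, ext[1:].upper() + " files")
    return (name, ("*" + ext,))

print(mask_to_tuple("*.py"))   # -> ('Python files', ('*.py',))
print(mask_to_tuple("*.css"))  # -> ('CSS files', ('*.css',))
print(mask_to_tuple("*"))      # -> ('All files', ('*.*',))
```

fileboxSetup then passes a list of such tuples, e.g. `[('Python files', ('*.py',)), ('All files', ('*.*',))]`, as the `filetypes` argument of `askopenfilename`/`asksaveasfilename`.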
+ +def __buttonEvent(event): + """ + Handle an event that is generated by a person clicking a button. + """ + global boxRoot, __widgetTexts, __replyButtonText + __replyButtonText = __widgetTexts[event.widget] + boxRoot.quit() # quit the main loop + + +def __put_buttons_in_buttonframe(choices): + """Put the buttons in the buttons frame + """ + global __widgetTexts, __firstWidget, buttonsFrame + + __firstWidget = None + __widgetTexts = {} + + i = 0 + + for buttonText in choices: + tempButton = Button(buttonsFrame, takefocus=1, text=buttonText) + bindArrows(tempButton) + tempButton.pack(expand=YES, side=LEFT, padx='1m', pady='1m', ipadx='2m', ipady='1m') + + # remember the text associated with this widget + __widgetTexts[tempButton] = buttonText + + # remember the first widget, so we can put the focus there + if i == 0: + __firstWidget = tempButton + i = 1 + + # for the commandButton, bind activation events to the activation event handler + commandButton = tempButton + handler = __buttonEvent + for selectionEvent in STANDARD_SELECTION_EVENTS: + commandButton.bind("<%s>" % selectionEvent, handler) + +#----------------------------------------------------------------------- +# +# class EgStore +# +#----------------------------------------------------------------------- +class EgStore: + r""" +A class to support persistent storage. + +You can use EgStore to support the storage and retrieval +of user settings for an EasyGui application. + + +# Example A +#----------------------------------------------------------------------- +# define a class named Settings as a subclass of EgStore +#----------------------------------------------------------------------- +class Settings(EgStore): +:: + def __init__(self, filename): # filename is required + #------------------------------------------------- + # Specify default/initial values for variables that + # this particular application wants to remember. 
+ #------------------------------------------------- + self.userId = "" + self.targetServer = "" + + #------------------------------------------------- + # For subclasses of EgStore, these must be + # the last two statements in __init__ + #------------------------------------------------- + self.filename = filename # this is required + self.restore() # restore values from the storage file if possible + + + +# Example B +#----------------------------------------------------------------------- +# create settings, a persistent Settings object +#----------------------------------------------------------------------- +settingsFile = "myApp_settings.txt" +settings = Settings(settingsFile) + +user = "obama_barak" +server = "whitehouse1" +settings.userId = user +settings.targetServer = server +settings.store() # persist the settings + +# run code that gets a new value for userId, and persist the settings +user = "biden_joe" +settings.userId = user +settings.store() + + +# Example C +#----------------------------------------------------------------------- +# recover the Settings instance, change an attribute, and store it again. +#----------------------------------------------------------------------- +settings = Settings(settingsFile) +settings.userId = "vanrossum_g" +settings.store() + +""" + def __init__(self, filename): # obtaining filename is required + self.filename = None + raise NotImplementedError() + + def restore(self): + """ + Set the values of whatever attributes are recoverable + from the pickle file. + + Populate the attributes (the __dict__) of the EgStore object + from the attributes (the __dict__) of the pickled object. + + If the pickled object has attributes that have been initialized + in the EgStore object, then those attributes of the EgStore object + will be replaced by the values of the corresponding attributes + in the pickled object. 
+ + If the pickled object is missing some attributes that have + been initialized in the EgStore object, then those attributes + of the EgStore object will retain the values that they were + initialized with. + + If the pickled object has some attributes that were not + initialized in the EgStore object, then those attributes + will be ignored. + + IN SUMMARY: + + After the recover() operation, the EgStore object will have all, + and only, the attributes that it had when it was initialized. + + Where possible, those attributes will have values recovered + from the pickled object. + """ + if not os.path.exists(self.filename): return self + if not os.path.isfile(self.filename): return self + + try: + f = open(self.filename,"rb") + unpickledObject = pickle.load(f) + f.close() + + for key in list(self.__dict__.keys()): + default = self.__dict__[key] + self.__dict__[key] = unpickledObject.__dict__.get(key,default) + except: + pass + + return self + + def store(self): + """ + Save the attributes of the EgStore object to a pickle file. + Note that if the directory for the pickle file does not already exist, + the store operation will fail. + """ + f = open(self.filename, "wb") + pickle.dump(self, f) + f.close() + + + def kill(self): + """ + Delete my persistent file (i.e. pickle file), if it exists. + """ + if os.path.isfile(self.filename): + os.remove(self.filename) + return + + def __str__(self): + """ + return my contents as a string in an easy-to-read format. 
+ """ + # find the length of the longest attribute name + longest_key_length = 0 + keys = [] + for key in self.__dict__.keys(): + keys.append(key) + longest_key_length = max(longest_key_length, len(key)) + + keys.sort() # sort the attribute names + lines = [] + for key in keys: + value = self.__dict__[key] + key = key.ljust(longest_key_length) + lines.append("%s : %s\n" % (key,repr(value)) ) + return "".join(lines) # return a string showing the attributes + + + + +#----------------------------------------------------------------------- +# +# test/demo easygui +# +#----------------------------------------------------------------------- +def egdemo(): + """ + Run the EasyGui demo. + """ + # clear the console + writeln("\n" * 100) + + intro_message = ("Pick the kind of box that you wish to demo.\n" + + "\n * Python version " + sys.version + + "\n * EasyGui version " + egversion + + "\n * Tk version " + str(TkVersion) + ) + + #========================================== END DEMONSTRATION DATA + + + while 1: # do forever + choices = [ + "msgbox", + "buttonbox", + "buttonbox(image) -- a buttonbox that displays an image", + "choicebox", + "multchoicebox", + "textbox", + "ynbox", + "ccbox", + "enterbox", + "enterbox(image) -- an enterbox that displays an image", + "exceptionbox", + "codebox", + "integerbox", + "boolbox", + "indexbox", + "filesavebox", + "fileopenbox", + "passwordbox", + "multenterbox", + "multpasswordbox", + "diropenbox", + "About EasyGui", + " Help" + ] + choice = choicebox(msg=intro_message + , title="EasyGui " + egversion + , choices=choices) + + if not choice: return + + reply = choice.split() + + if reply[0] == "msgbox": + reply = msgbox("short msg", "This is a long title") + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "About": + reply = abouteasygui() + + elif reply[0] == "Help": + _demo_help() + + elif reply[0] == "buttonbox": + reply = buttonbox() + writeln("Reply was: %s" % repr(reply)) + + title = "Demo of Buttonbox with many, many 
buttons!" + msg = "This buttonbox shows what happens when you specify too many buttons." + reply = buttonbox(msg=msg, title=title, choices=choices) + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "buttonbox(image)": + _demo_buttonbox_with_image() + + elif reply[0] == "boolbox": + reply = boolbox() + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "enterbox": + image = "python_and_check_logo.gif" + message = "Enter the name of your best friend."\ + "\n(Result will be stripped.)" + reply = enterbox(message, "Love!", " Suzy Smith ") + writeln("Reply was: %s" % repr(reply)) + + message = "Enter the name of your best friend."\ + "\n(Result will NOT be stripped.)" + reply = enterbox(message, "Love!", " Suzy Smith ",strip=False) + writeln("Reply was: %s" % repr(reply)) + + reply = enterbox("Enter the name of your worst enemy:", "Hate!") + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "enterbox(image)": + image = "python_and_check_logo.gif" + message = "What kind of snake is this?" 
+ reply = enterbox(message, "Quiz",image=image) + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "exceptionbox": + try: + thisWillCauseADivideByZeroException = 1/0 + except: + exceptionbox() + + elif reply[0] == "integerbox": + reply = integerbox( + "Enter a number between 3 and 333", + "Demo: integerbox WITH a default value", + 222, 3, 333) + writeln("Reply was: %s" % repr(reply)) + + reply = integerbox( + "Enter a number between 0 and 99", + "Demo: integerbox WITHOUT a default value" + ) + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "diropenbox" : _demo_diropenbox() + elif reply[0] == "fileopenbox": _demo_fileopenbox() + elif reply[0] == "filesavebox": _demo_filesavebox() + + elif reply[0] == "indexbox": + title = reply[0] + msg = "Demo of " + reply[0] + choices = ["Choice1", "Choice2", "Choice3", "Choice4"] + reply = indexbox(msg, title, choices) + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "passwordbox": + reply = passwordbox("Demo of password box WITHOUT default" + + "\n\nEnter your secret password", "Member Logon") + writeln("Reply was: %s" % str(reply)) + + reply = passwordbox("Demo of password box WITH default" + + "\n\nEnter your secret password", "Member Logon", "alfie") + writeln("Reply was: %s" % str(reply)) + + elif reply[0] == "multenterbox": + msg = "Enter your personal information" + title = "Credit Card Application" + fieldNames = ["Name","Street Address","City","State","ZipCode"] + fieldValues = [] # we start with blanks for the values + fieldValues = multenterbox(msg,title, fieldNames) + + # make sure that none of the fields was left blank + while 1: + if fieldValues == None: break + errmsg = "" + for i in range(len(fieldNames)): + if fieldValues[i].strip() == "": + errmsg = errmsg + ('"%s" is a required field.\n\n' % fieldNames[i]) + if errmsg == "": break # no problems found + fieldValues = multenterbox(errmsg, title, fieldNames, fieldValues) + + writeln("Reply was: %s" % str(fieldValues)) + + elif 
reply[0] == "multpasswordbox": + msg = "Enter logon information" + title = "Demo of multpasswordbox" + fieldNames = ["Server ID", "User ID", "Password"] + fieldValues = [] # we start with blanks for the values + fieldValues = multpasswordbox(msg,title, fieldNames) + + # make sure that none of the fields was left blank + while 1: + if fieldValues == None: break + errmsg = "" + for i in range(len(fieldNames)): + if fieldValues[i].strip() == "": + errmsg = errmsg + ('"%s" is a required field.\n\n' % fieldNames[i]) + if errmsg == "": break # no problems found + fieldValues = multpasswordbox(errmsg, title, fieldNames, fieldValues) + + writeln("Reply was: %s" % str(fieldValues)) + + elif reply[0] == "ynbox": + title = "Demo of ynbox" + msg = "Were you expecting the Spanish Inquisition?" + reply = ynbox(msg, title) + writeln("Reply was: %s" % repr(reply)) + if reply: + msgbox("NOBODY expects the Spanish Inquisition!", "Wrong!") + + elif reply[0] == "ccbox": + title = "Demo of ccbox" + reply = ccbox(msg,title) + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "choicebox": + title = "Demo of choicebox" + longchoice = "This is an example of a very long option which you may or may not wish to choose."*2 + listChoices = ["nnn", "ddd", "eee", "fff", "aaa", longchoice + , "aaa", "bbb", "ccc", "ggg", "hhh", "iii", "jjj", "kkk", "LLL", "mmm" , "nnn", "ooo", "ppp", "qqq", "rrr", "sss", "ttt", "uuu", "vvv"] + + msg = "Pick something. " + ("A wrapable sentence of text ?! "*30) + "\nA separate line of text."*6 + reply = choicebox(msg=msg, choices=listChoices) + writeln("Reply was: %s" % repr(reply)) + + msg = "Pick something. " + reply = choicebox(msg=msg, title=title, choices=listChoices) + writeln("Reply was: %s" % repr(reply)) + + msg = "Pick something. 
" + reply = choicebox(msg="The list of choices is empty!", choices=[]) + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "multchoicebox": + listChoices = ["aaa", "bbb", "ccc", "ggg", "hhh", "iii", "jjj", "kkk" + , "LLL", "mmm" , "nnn", "ooo", "ppp", "qqq" + , "rrr", "sss", "ttt", "uuu", "vvv"] + + msg = "Pick as many choices as you wish." + reply = multchoicebox(msg,"Demo of multchoicebox", listChoices) + writeln("Reply was: %s" % repr(reply)) + + elif reply[0] == "textbox": _demo_textbox(reply[0]) + elif reply[0] == "codebox": _demo_codebox(reply[0]) + + else: + msgbox("Choice\n\n" + choice + "\n\nis not recognized", "Program Logic Error") + return + + +def _demo_textbox(reply): + text_snippet = ((\ +"""It was the best of times, and it was the worst of times. The rich ate cake, and the poor had cake recommended to them, but wished only for enough cash to buy bread. The time was ripe for revolution! """ \ +*5)+"\n\n")*10 + title = "Demo of textbox" + msg = "Here is some sample text. " * 16 + reply = textbox(msg, title, text_snippet) + writeln("Reply was: %s" % str(reply)) + +def _demo_codebox(reply): + code_snippet = ("dafsdfa dasflkj pp[oadsij asdfp;ij asdfpjkop asdfpok asdfpok asdfpok"*3) +"\n"+\ +"""# here is some dummy Python code +for someItem in myListOfStuff: + do something(someItem) + do something() + do something() + if somethingElse(someItem): + doSomethingEvenMoreInteresting() + +"""*16 + msg = "Here is some sample code. 
" * 16 + reply = codebox(msg, "Code Sample", code_snippet) + writeln("Reply was: %s" % repr(reply)) + + +def _demo_buttonbox_with_image(): + + msg = "Do you like this picture?\nIt is " + choices = ["Yes","No","No opinion"] + + for image in [ + "python_and_check_logo.gif" + ,"python_and_check_logo.jpg" + ,"python_and_check_logo.png" + ,"zzzzz.gif"]: + + reply=buttonbox(msg + image,image=image,choices=choices) + writeln("Reply was: %s" % repr(reply)) + + +def _demo_help(): + savedStdout = sys.stdout # save the sys.stdout file object + sys.stdout = capturedOutput = StringIO() + help("easygui") + sys.stdout = savedStdout # restore the sys.stdout file object + codebox("EasyGui Help",text=capturedOutput.getvalue()) + +def _demo_filesavebox(): + filename = "myNewFile.txt" + title = "File SaveAs" + msg ="Save file as:" + + f = filesavebox(msg,title,default=filename) + writeln("You chose to save file: %s" % f) + +def _demo_diropenbox(): + title = "Demo of diropenbox" + msg = "Pick the directory that you wish to open." 
+ d = diropenbox(msg, title) + writeln("You chose directory...: %s" % d) + + d = diropenbox(msg, title,default="./") + writeln("You chose directory...: %s" % d) + + d = diropenbox(msg, title,default="c:/") + writeln("You chose directory...: %s" % d) + + +def _demo_fileopenbox(): + msg = "Python files" + title = "Open files" + default="*.py" + f = fileopenbox(msg,title,default=default) + writeln("You chose to open file: %s" % f) + + default="./*.gif" + filetypes = ["*.jpg",["*.zip","*.tgs","*.gz", "Archive files"],["*.htm", "*.html","HTML files"]] + f = fileopenbox(msg,title,default=default,filetypes=filetypes) + writeln("You chose to open file: %s" % f) + + """#deadcode -- testing ---------------------------------------- + f = fileopenbox(None,None,default=default) + writeln("You chose to open file: %s" % f) + + f = fileopenbox(None,title,default=default) + writeln("You chose to open file: %s" % f) + + f = fileopenbox(msg,None,default=default) + writeln("You chose to open file: %s" % f) + + f = fileopenbox(default=default) + writeln("You chose to open file: %s" % f) + + f = fileopenbox(default=None) + writeln("You chose to open file: %s" % f) + #----------------------------------------------------deadcode """ + + +def _dummy(): + pass + +EASYGUI_ABOUT_INFORMATION = ''' +======================================================================== +0.96(2010-08-29) +======================================================================== +This version fixes some problems with version independence. + +BUG FIXES +------------------------------------------------------ + * A statement with Python 2.x-style exception-handling syntax raised + a syntax error when running under Python 3.x. + Thanks to David Williams for reporting this problem. + + * Under some circumstances, PIL was unable to display non-gif images + that it should have been able to display. + The cause appears to be non-version-independent import syntax. 
+ PIL modules are now imported with a version-independent syntax. + Thanks to Horst Jens for reporting this problem. + +LICENSE CHANGE +------------------------------------------------------ +Starting with this version, EasyGui is licensed under what is generally known as +the "modified BSD license" (aka "revised BSD", "new BSD", "3-clause BSD"). +This license is GPL-compatible but less restrictive than GPL. +Earlier versions were licensed under the Creative Commons Attribution License 2.0. + + +======================================================================== +0.95(2010-06-12) +======================================================================== + +ENHANCEMENTS +------------------------------------------------------ + * Previous versions of EasyGui could display only .gif image files using the + msgbox "image" argument. This version can now display all image-file formats + supported by PIL (the Python Imaging Library) if PIL is installed. + If msgbox is asked to open a non-gif image file, it attempts to import + PIL and to use PIL to convert the image file to a displayable format. + If PIL cannot be imported (probably because PIL is not installed) + EasyGui displays an error message saying that PIL must be installed in order + to display the image file. + + Note that + http://www.pythonware.com/products/pil/ + says that PIL doesn't yet support Python 3.x. + + +======================================================================== +0.94(2010-06-06) +======================================================================== + +ENHANCEMENTS +------------------------------------------------------ + * The codebox and textbox functions now return the contents of the box, rather + than simply the name of the button ("Yes"). This makes it possible to use + codebox and textbox as data-entry widgets. A big "thank you!"
to Dominic + Comtois for requesting this feature, patiently explaining his requirement, + and helping to discover the tkinter techniques to implement it. + + NOTE THAT in theory this change breaks backward compatibility. But because + (in previous versions of EasyGui) the value returned by codebox and textbox + was meaningless, no application should have been checking it. So in actual + practice, this change should not break backward compatibility. + + * Added support for SPACEBAR to command buttons. Now, when keyboard + focus is on a command button, a press of the SPACEBAR will act like + a press of the ENTER key; it will activate the command button. + + * Added support for keyboard navigation with the arrow keys (up,down,left,right) + to the fields and buttons in enterbox, multenterbox and multpasswordbox, + and to the buttons in choicebox and all buttonboxes. + + * Added highlightthickness=2 to entry fields in multenterbox and + multpasswordbox. Now it is easier to tell which entry field has + keyboard focus. + + +BUG FIXES +------------------------------------------------------ + * In EgStore, the pickle file is now opened with "rb" and "wb" rather than + with "r" and "w". This change is necessary for compatibility with Python 3+. + Thanks to Marshall Mattingly for reporting this problem and providing the fix. + + * In integerbox, the actual argument names did not match the names described + in the docstring. Thanks to Daniel Zingaro at the University of Toronto for + reporting this problem. + + * In integerbox, the "argLowerBound" and "argUpperBound" arguments have been + renamed to "lowerbound" and "upperbound" and the docstring has been corrected. + + NOTE THAT THIS CHANGE TO THE ARGUMENT-NAMES BREAKS BACKWARD COMPATIBILITY. + If argLowerBound or argUpperBound are used, an AssertionError with an + explanatory error message is raised. + + * In choicebox, the signature to choicebox incorrectly showed choicebox as + accepting a "buttons" argument.
The signature has been fixed. + + +======================================================================== +0.93(2009-07-07) +======================================================================== + +ENHANCEMENTS +------------------------------------------------------ + + * Added exceptionbox to display stack trace of exceptions + + * Modified names of some font-related constants to make it + easier to customize them + + +======================================================================== +0.92(2009-06-22) +======================================================================== + +ENHANCEMENTS +------------------------------------------------------ + + * Added EgStore class to provide basic easy-to-use persistence. + +BUG FIXES +------------------------------------------------------ + + * Fixed a bug that was preventing Linux users from copying text out of + a textbox and a codebox. This was not a problem for Windows users. + +''' + +def abouteasygui(): + """ + shows the easygui revision history + """ + codebox("About EasyGui\n"+egversion,"EasyGui",EASYGUI_ABOUT_INFORMATION) + return None + + + +if __name__ == '__main__': + if True: + egdemo() + else: + # test the new root feature + root = Tk() + msg = """This is a test of a main Tk() window in which we will place an easygui msgbox.
+ It will be an interesting experiment.\n\n""" + messageWidget = Message(root, text=msg, width=1000) + messageWidget.pack(side=TOP, expand=YES, fill=X, padx='3m', pady='3m') + messageWidget = Message(root, text=msg, width=1000) + messageWidget.pack(side=TOP, expand=YES, fill=X, padx='3m', pady='3m') + + + msgbox("this is a test of passing in boxRoot", root=root) + msgbox("this is a second test of passing in boxRoot", root=root) + + reply = enterbox("Enter something", root=root) + writeln("You wrote:", reply) + + reply = enterbox("Enter something else", root=root) + writeln("You wrote:", reply) + root.destroy() diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/LICENSE.txt remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/LICENSE.txt --- remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/easygui/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,32 @@ +LICENSE INFORMATION + +EasyGui version 0.96 + +Copyright (c) 2010, Stephen Raymond Ferg + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + + 3. The name of the author may not be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/CONTRIBUTORS.txt remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/CONTRIBUTORS.txt --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/CONTRIBUTORS.txt 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/CONTRIBUTORS.txt 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,17 @@ +CONTRIBUTORS for the olefile project +==================================== + +This is a non-exhaustive list of all the people who helped me improve the +olefile project (formerly OleFileIO_PL), in approximative chronological order. +Please contact me if I forgot to mention your name. 
+ +A big thank you to all of them: + +- Niko Ehrenfeuchter: added support for Jython +- Niko Ehrenfeuchter, Martijn Berger and Dave Jones: helped fix 4K sector support +- Martin Panter: conversion to Python 3.x/2.6+ +- mete0r_kr: added support for file-like objects +- chuckleberryfinn: fixed bug in getproperties +- Martijn, Ben G.: bug report for 64 bits platforms +- Philippe Lagadec: main author and maintainer since 2005 +- and of course Fredrik Lundh: original author of OleFileIO from 1995 to 2005 diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,164 @@ + + + + + + + + + +

How to use olefile - API

+

This page is part of the documentation for olefile. It explains how to use all its features to parse and write OLE files. For more information about OLE files, see OLE_Overview.

+

olefile can be used as an independent module or with PIL/Pillow. The main functions and methods are explained below.

+

For more information, see also the file olefile.html, sample code at the end of the module itself, and docstrings within the code.

+

Import olefile

+

When the olefile package has been installed, it can be imported in Python applications with this statement:

+
import olefile
+

Before v0.40, olefile was named OleFileIO_PL. To maintain backward compatibility with older applications and samples, a simple script is also installed so that the following statement imports olefile as OleFileIO_PL:

+
import OleFileIO_PL
+

As of version 0.30, the code has been changed to be compatible with Python 3.x. As a consequence, compatibility with Python 2.5 or older is not provided anymore. However, a copy of OleFileIO_PL v0.26 (with some backported enhancements) is available as olefile2.py. When importing the olefile package, it falls back automatically to olefile2 if running on Python 2.5 or older. This is implemented in olefile/__init__.py. (new in v0.40)

+

If you think olefile should stay compatible with Python 2.5 or older, please contact me.

+

Test if a file is an OLE container

+

Use isOleFile to check whether the first bytes of the file contain the magic number for OLE files, before opening it. isOleFile returns True if it is an OLE file, False otherwise (new in v0.16).

+
assert olefile.isOleFile('myfile.doc')
+

The argument of isOleFile can be (new in v0.41):

+
    +
  • the path of the file to open on disk (bytes or unicode string smaller than 1536 bytes),
  • +
  • or a bytes string containing the file in memory (a bytes string longer than 1535 bytes),
  • +
  • or a file-like object (with read and seek methods).
  • +
+
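The magic-number check mentioned above can be sketched in plain Python. This is a hypothetical re-implementation for illustration, not olefile's actual code; the real isOleFile also handles file paths and file-like objects. The 8-byte OLE2 signature comes from the [MS-CFB] specification.

```python
# The OLE2 / Compound File Binary magic number ([MS-CFB] header signature).
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

def looks_like_ole(first_bytes):
    # True if the data starts with the 8-byte OLE2 signature.
    return first_bytes[:8] == OLE_MAGIC

assert looks_like_ole(OLE_MAGIC + b'\x00' * 1528)   # plausible OLE header start
assert not looks_like_ole(b'PK\x03\x04' + b'\x00' * 100)  # a ZIP file, not OLE
```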

Open an OLE file from disk

+

Create an OleFileIO object with the file path as parameter:

+
ole = olefile.OleFileIO('myfile.doc')
+

Open an OLE file from a bytes string

+

This is useful if the file is already stored in memory as a bytes string.

+
ole = olefile.OleFileIO(s)
+

Note: olefile checks the size of the string provided as argument to determine if it is a file path or the content of an OLE file. An OLE file cannot be smaller than 1536 bytes. If the string is larger than 1535 bytes, then it is expected to contain an OLE file, otherwise it is expected to be a file path.

+

(new in v0.41)

+
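The size-based dispatch described in the note above can be sketched like this. It is an illustration of the documented rule, not olefile's actual implementation:

```python
# olefile treats a string argument shorter than 1536 bytes as a file path,
# and one of 1536 bytes or more as the OLE file content itself
# (no valid OLE file can be smaller than 1536 bytes).
MINIMAL_OLEFILE_SIZE = 1536

def classify_argument(arg):
    if len(arg) >= MINIMAL_OLEFILE_SIZE:
        return 'content'  # in-memory OLE file data
    return 'path'         # treated as a filename on disk

assert classify_argument(b'myfile.doc') == 'path'
assert classify_argument(b'\xd0\xcf\x11\xe0' + b'\x00' * 2000) == 'content'
```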

Open an OLE file from a file-like object

+

This is useful if the file is not on disk but only available as a file-like object (with read, seek and tell methods).

+
ole = olefile.OleFileIO(f)
+

If the file-like object does not have seek or tell methods, the easiest solution is to read the file entirely into a bytes string before parsing:

+
data = f.read()
+ole = olefile.OleFileIO(data)
+

How to handle malformed OLE files

+

By default, the parser is configured to be as robust and permissive as possible, allowing it to parse most malformed OLE files. Only fatal errors will raise an exception. It is possible to tell the parser to be stricter, raising exceptions for files that do not fully conform to the OLE specifications, using the raise_defects option (new in v0.14):

+
ole = olefile.OleFileIO('myfile.doc', raise_defects=olefile.DEFECT_INCORRECT)
+

When the parsing is done, the list of non-fatal issues detected is available as a list in the parsing_issues attribute of the OleFileIO object (new in 0.25):

+
print('Non-fatal issues raised during parsing:')
+if ole.parsing_issues:
+    for exctype, msg in ole.parsing_issues:
+        print('- %s: %s' % (exctype.__name__, msg))
+else:
+    print('None')
+

Open an OLE file in write mode

+

Before using the write features, the OLE file must be opened in read/write mode:

+
ole = olefile.OleFileIO('test.doc', write_mode=True)
+

(new in v0.40)

+

The code for write features is new and it has not been thoroughly tested yet. See issue #6 for the roadmap and the implementation status. If you encounter any issue, please send me your feedback or report issues.

+

Syntax for stream and storage paths

+

Two different syntaxes are allowed for methods that need or return the path of streams and storages:

+
    +
  1. Either a list of strings including all the storages from the root up to the stream/storage name. For example a stream called "WordDocument" at the root will have ['WordDocument'] as full path. A stream called "ThisDocument" located in the storage "Macros/VBA" will be ['Macros', 'VBA', 'ThisDocument']. This is the original syntax from PIL. While hard to read and not very convenient, this syntax works in all cases.

  2. +
  3. Or a single string with slashes to separate storage and stream names (similar to the Unix path syntax). The previous examples would be 'WordDocument' and 'Macros/VBA/ThisDocument'. This syntax is easier, but may fail if a stream or storage name contains a slash (which is normally not allowed, according to the Microsoft specifications [MS-CFB]). (new in v0.15)

  4. +
+

Both are case-insensitive.

+

Switching between the two is easy:

+
slash_path = '/'.join(list_path)
+list_path  = slash_path.split('/')
+

Encoding:

+
    +
  • Stream and Storage names are stored in Unicode format in OLE files, which means they may contain special characters (e.g. Greek, Cyrillic, Japanese, etc) that applications must support to avoid exceptions.
  • +
  • On Python 2.x, all stream and storage paths are handled by olefile in bytes strings, using the UTF-8 encoding by default. If you need to use Unicode instead, add the option path_encoding=None when creating the OleFileIO object. This is new in v0.42. Olefile was using the Latin-1 encoding until v0.41, therefore special characters were not supported.
  • +
  • On Python 3.x, all stream and storage paths are handled by olefile in unicode strings, without encoding.
  • +
+

Get the list of streams

+

listdir() returns a list of all the streams contained in the OLE file, including those stored in storages. Each stream is listed itself as a list, as described above.

+
print(ole.listdir())
+

Sample result:

+
[['\x01CompObj'], ['\x05DocumentSummaryInformation'], ['\x05SummaryInformation']
+, ['1Table'], ['Macros', 'PROJECT'], ['Macros', 'PROJECTwm'], ['Macros', 'VBA',
+'Module1'], ['Macros', 'VBA', 'ThisDocument'], ['Macros', 'VBA', '_VBA_PROJECT']
+, ['Macros', 'VBA', 'dir'], ['ObjectPool'], ['WordDocument']]
+

Optionally, you can choose whether storages should also be listed, with or without streams (new in v0.26):

+
ole.listdir(streams=False, storages=True)
+

Test if known streams/storages exist:

+

exists(path) checks if a given stream or storage exists in the OLE file (new in v0.16). The provided path is case-insensitive.

+
if ole.exists('worddocument'):
+    print("This is a Word document.")
+    if ole.exists('macros/vba'):
+         print("This document seems to contain VBA macros.")
+

Read data from a stream

+

openstream(path) opens a stream as a file-like object. The provided path is case-insensitive.

+

The following example extracts the "Pictures" stream from a PPT file:

+
pics = ole.openstream('Pictures')
+data = pics.read()
+

Get information about a stream/storage

+

Several methods can provide the size, type and timestamps of a given stream/storage:

+

get_size(path) returns the size of a stream in bytes (new in v0.16):

+
s = ole.get_size('WordDocument')
+

get_type(path) returns the type of a stream/storage, as one of the following constants: STGTY_STREAM for a stream, STGTY_STORAGE for a storage, STGTY_ROOT for the root entry, and False for a non-existing path (new in v0.15).

+
t = ole.get_type('WordDocument')
+

get_ctime(path) and get_mtime(path) return the creation and modification timestamps of a stream/storage, as a Python datetime object with UTC timezone. Please note that these timestamps are only present if the application that created the OLE file explicitly stored them, which is rarely the case. When not present, these methods return None (new in v0.26).

+
c = ole.get_ctime('WordDocument')
+m = ole.get_mtime('WordDocument')
+

The root storage is a special case: You can get its creation and modification timestamps using the OleFileIO.root attribute (new in v0.26):

+
c = ole.root.getctime()
+m = ole.root.getmtime()
+

Note: all these methods are case-insensitive.

+

Overwriting a sector

+

The write_sect method can overwrite any sector of the file. If the provided data is smaller than the sector size (normally 512 bytes, sometimes 4KB), data is padded with null characters. (new in v0.40)

+

Here is an example:

+
ole.write_sect(0x17, b'TEST')
+

Note: following the MS-CFB specifications, sector 0 is actually the second sector of the file. You may use -1 as index to write the first sector.

+
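The indexing rule in the note above can be sketched as a byte-offset computation, assuming the common 512-byte sector size. This is an illustration of the documented convention, not olefile's internal code:

```python
# Sector index -> byte offset: the 512-byte header occupies the first
# physical sector, so logical sector 0 starts at byte 512 and index -1
# addresses the header sector itself.
SECTOR_SIZE = 512

def sector_offset(index):
    return SECTOR_SIZE * (index + 1)

assert sector_offset(-1) == 0     # first physical sector (the header)
assert sector_offset(0) == 512    # logical sector 0
assert sector_offset(0x17) == 0x18 * 512
```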

Overwriting a stream

+

The write_stream method can overwrite an existing stream in the file. The new stream data must be the exact same size as the existing one. For now, write_stream can only write streams of 4KB or larger (stored in the main FAT).

+

For example, you may change text in a MS Word document:

+
ole = olefile.OleFileIO('test.doc', write_mode=True)
+data = ole.openstream('WordDocument').read()
+data = data.replace(b'foo', b'bar')
+ole.write_stream('WordDocument', data)
+ole.close()
+

(new in v0.40)

+

Extract metadata

+

get_metadata() will check if standard property streams exist, parse all the properties they contain, and return an OleMetadata object with the found properties as attributes (new in v0.24).

+
meta = ole.get_metadata()
+print('Author:', meta.author)
+print('Title:', meta.title)
+print('Creation date:', meta.create_time)
+# print all metadata:
+meta.dump()
+

Available attributes include:

+
codepage, title, subject, author, keywords, comments, template,
+last_saved_by, revision_number, total_edit_time, last_printed, create_time,
+last_saved_time, num_pages, num_words, num_chars, thumbnail,
+creating_application, security, codepage_doc, category, presentation_target,
+bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
+scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
+chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
+version, dig_sig, content_type, content_status, language, doc_version
+

See the source code of the OleMetadata class for more information.

+

Parse a property stream

+

getproperties(path) can be used to parse any property stream that is not handled by get_metadata. It returns a dictionary indexed by integers. Each integer is the index of the property, pointing to its value. For example in the standard property stream '\x05SummaryInformation', the document title is property #2, and the subject is #3.

+
p = ole.getproperties('specialprops')
+

By default, as in the original PIL version, timestamp properties are converted into a number of seconds since Jan 1, 1601. With the option convert_time, you can obtain more convenient Python datetime objects (UTC timezone). If some time properties should not be converted (such as the total editing time in '\x05SummaryInformation'), the list of indexes can be passed as no_conversion (new in v0.25):

+
p = ole.getproperties('specialprops', convert_time=True, no_conversion=[10])
+
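The "seconds since Jan 1, 1601" convention mentioned above is the Windows FILETIME epoch, and raw values can be converted by hand when convert_time is not used. A small sketch, independent of olefile:

```python
from datetime import datetime, timedelta

# Convert a raw timestamp property (seconds elapsed since 1601-01-01,
# the Windows FILETIME epoch) into a naive Python datetime.
def filetime_seconds_to_datetime(seconds):
    return datetime(1601, 1, 1) + timedelta(seconds=seconds)

assert filetime_seconds_to_datetime(0) == datetime(1601, 1, 1)
# The Unix epoch falls 11644473600 seconds after the FILETIME epoch:
assert filetime_seconds_to_datetime(11644473600) == datetime(1970, 1, 1)
```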

Close the OLE file

+

Unless your application is a simple script that terminates after processing an OLE file, do not forget to call close() on each OleFileIO object after parsing, so that the file on disk is released. (new in v0.22)

+
ole.close()
+

Use olefile as a script for testing/debugging

+

olefile can also be used as a script from the command-line to display the structure of an OLE file and its metadata, for example:

+
olefile.py myfile.doc
+

You can use the option -c to check that all streams can be read fully, and -d to generate very verbose debugging information.

+
+

olefile documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.md remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.md --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/API.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,313 @@ +How to use olefile - API +======================== + +This page is part of the documentation for [olefile](https://bitbucket.org/decalage/olefileio_pl/wiki). It explains +how to use all its features to parse and write OLE files. For more information about OLE files, see [[OLE_Overview]]. + +olefile can be used as an independent module or with PIL/Pillow. The main functions and methods are explained below. + +For more information, see also the file **olefile.html**, sample code at the end of the module itself, and docstrings within the code. + + + +Import olefile +-------------- + +When the olefile package has been installed, it can be imported in Python applications with this statement: + + :::python + import olefile + +Before v0.40, olefile was named OleFileIO_PL. To maintain backward compatibility with older applications and samples, a +simple script is also installed so that the following statement imports olefile as OleFileIO_PL: + + :::python + import OleFileIO_PL + +As of version 0.30, the code has been changed to be compatible with Python 3.x. As a consequence, compatibility with +Python 2.5 or older is not provided anymore. However, a copy of OleFileIO_PL v0.26 (with some backported enhancements) +is available as olefile2.py. When importing the olefile package, it falls back automatically to olefile2 if running on +Python 2.5 or older. This is implemented in olefile/__init__.py. (new in v0.40) + +If you think olefile should stay compatible with Python 2.5 or older, please [contact me](http://decalage.info/contact). 
+ + +## Test if a file is an OLE container + +Use **isOleFile** to check if the first bytes of the file contain the Magic for OLE files, before opening it. isOleFile +returns True if it is an OLE file, False otherwise (new in v0.16). + + :::python + assert olefile.isOleFile('myfile.doc') + +The argument of isOleFile can be (new in v0.41): + +- the path of the file to open on disk (bytes or unicode string smaller than 1536 bytes), +- or a bytes string containing the file in memory. (bytes string longer than 1535 bytes), +- or a file-like object (with read and seek methods). + +## Open an OLE file from disk + +Create an **OleFileIO** object with the file path as parameter: + + :::python + ole = olefile.OleFileIO('myfile.doc') + +## Open an OLE file from a bytes string + +This is useful if the file is already stored in memory as a bytes string. + + :::python + ole = olefile.OleFileIO(s) + +Note: olefile checks the size of the string provided as argument to determine if it is a file path or the content of an +OLE file. An OLE file cannot be smaller than 1536 bytes. If the string is larger than 1535 bytes, then it is expected to +contain an OLE file, otherwise it is expected to be a file path. + +(new in v0.41) + + +## Open an OLE file from a file-like object + +This is useful if the file is not on disk but only available as a file-like object (with read, seek and tell methods). + + :::python + ole = olefile.OleFileIO(f) + +If the file-like object does not have seek or tell methods, the easiest solution is to read the file entirely in +a bytes string before parsing: + + :::python + data = f.read() + ole = olefile.OleFileIO(data) + + +## How to handle malformed OLE files + +By default, the parser is configured to be as robust and permissive as possible, allowing to parse most malformed OLE files. Only fatal errors will raise an exception. 
It is possible to tell the parser to be more strict in order to raise exceptions for files that do not fully conform to the OLE specifications, using the raise_defect option (new in v0.14): + + :::python + ole = olefile.OleFileIO('myfile.doc', raise_defects=olefile.DEFECT_INCORRECT) + +When the parsing is done, the list of non-fatal issues detected is available as a list in the parsing_issues attribute of the OleFileIO object (new in 0.25): + + :::python + print('Non-fatal issues raised during parsing:') + if ole.parsing_issues: + for exctype, msg in ole.parsing_issues: + print('- %s: %s' % (exctype.__name__, msg)) + else: + print('None') + + +## Open an OLE file in write mode + +Before using the write features, the OLE file must be opened in read/write mode: + + :::python + ole = olefile.OleFileIO('test.doc', write_mode=True) + +(new in v0.40) + +The code for write features is new and it has not been thoroughly tested yet. See [issue #6](https://bitbucket.org/decalage/olefileio_pl/issue/6/improve-olefileio_pl-to-write-ole-files) for the roadmap and the implementation status. If you encounter any issue, please send me your [feedback](http://www.decalage.info/en/contact) or [report issues](https://bitbucket.org/decalage/olefileio_pl/issues?status=new&status=open). + + +## Syntax for stream and storage paths + +Two different syntaxes are allowed for methods that need or return the path of streams and storages: + +1) Either a **list of strings** including all the storages from the root up to the stream/storage name. For example a +stream called "WordDocument" at the root will have ['WordDocument'] as full path. A stream called "ThisDocument" +located in the storage "Macros/VBA" will be ['Macros', 'VBA', 'ThisDocument']. This is the original syntax from PIL. +While hard to read and not very convenient, this syntax works in all cases. + +2) Or a **single string with slashes** to separate storage and stream names (similar to the Unix path syntax). 
+The previous examples would be 'WordDocument' and 'Macros/VBA/ThisDocument'. This syntax is easier, but may fail if a +stream or storage name contains a slash (which is normally not allowed, according to the Microsoft specifications [MS-CFB]). (new in v0.15) + +Both are case-insensitive. + +Switching between the two is easy: + + :::python + slash_path = '/'.join(list_path) + list_path = slash_path.split('/') + +**Encoding**: + +- Stream and Storage names are stored in Unicode format in OLE files, which means they may contain special characters + (e.g. Greek, Cyrillic, Japanese, etc) that applications must support to avoid exceptions. +- **On Python 2.x**, all stream and storage paths are handled by olefile in bytes strings, using the **UTF-8 encoding** + by default. If you need to use Unicode instead, add the option **path_encoding=None** when creating the OleFileIO + object. This is new in v0.42. Olefile was using the Latin-1 encoding until v0.41, therefore special characters were + not supported. +- **On Python 3.x**, all stream and storage paths are handled by olefile in unicode strings, without encoding. + +## Get the list of streams + +listdir() returns a list of all the streams contained in the OLE file, including those stored in storages. +Each stream is listed itself as a list, as described above. 
+ + :::python + print(ole.listdir()) + +Sample result: + + :::python + [['\x01CompObj'], ['\x05DocumentSummaryInformation'], ['\x05SummaryInformation'] + , ['1Table'], ['Macros', 'PROJECT'], ['Macros', 'PROJECTwm'], ['Macros', 'VBA', + 'Module1'], ['Macros', 'VBA', 'ThisDocument'], ['Macros', 'VBA', '_VBA_PROJECT'] + , ['Macros', 'VBA', 'dir'], ['ObjectPool'], ['WordDocument']] + +As an option it is possible to choose if storages should also be listed, with or without streams (new in v0.26): + + :::python + ole.listdir (streams=False, storages=True) + + +## Test if known streams/storages exist: + +exists(path) checks if a given stream or storage exists in the OLE file (new in v0.16). The provided path is case-insensitive. + + :::python + if ole.exists('worddocument'): + print("This is a Word document.") + if ole.exists('macros/vba'): + print("This document seems to contain VBA macros.") + + +## Read data from a stream + +openstream(path) opens a stream as a file-like object. The provided path is case-insensitive. + +The following example extracts the "Pictures" stream from a PPT file: + + :::python + pics = ole.openstream('Pictures') + data = pics.read() + + +## Get information about a stream/storage + +Several methods can provide the size, type and timestamps of a given stream/storage: + +get_size(path) returns the size of a stream in bytes (new in v0.16): + + :::python + s = ole.get_size('WordDocument') + +get_type(path) returns the type of a stream/storage, as one of the following constants: STGTY\_STREAM for a stream, STGTY\_STORAGE for a storage, STGTY\_ROOT for the root entry, and False for a non existing path (new in v0.15). + + :::python + t = ole.get_type('WordDocument') + +get\_ctime(path) and get\_mtime(path) return the creation and modification timestamps of a stream/storage, as a Python datetime object with UTC timezone. 
Please note that these timestamps are only present if the application that created the OLE file explicitly stored them, which is rarely the case. When not present, these methods return None (new in v0.26). + + :::python + c = ole.get_ctime('WordDocument') + m = ole.get_mtime('WordDocument') + +The root storage is a special case: You can get its creation and modification timestamps using the OleFileIO.root attribute (new in v0.26): + + :::python + c = ole.root.getctime() + m = ole.root.getmtime() + +Note: all these methods are case-insensitive. + +## Overwriting a sector + +The write_sect method can overwrite any sector of the file. If the provided data is smaller than the sector size (normally 512 bytes, sometimes 4KB), data is padded with null characters. (new in v0.40) + +Here is an example: + + :::python + ole.write_sect(0x17, b'TEST') + +Note: following the [MS-CFB specifications](http://msdn.microsoft.com/en-us/library/dd942138.aspx), sector 0 is actually the second sector of the file. You may use -1 as index to write the first sector. + + +## Overwriting a stream + +The write_stream method can overwrite an existing stream in the file. The new stream data must be the exact same size as the existing one. For now, write_stream can only write streams of 4KB or larger (stored in the main FAT). + +For example, you may change text in a MS Word document: + + :::python + ole = olefile.OleFileIO('test.doc', write_mode=True) + data = ole.openstream('WordDocument').read() + data = data.replace(b'foo', b'bar') + ole.write_stream('WordDocument', data) + ole.close() + +(new in v0.40) + + + +## Extract metadata + +get_metadata() will check if standard property streams exist, parse all the properties they contain, and return an OleMetadata object with the found properties as attributes (new in v0.24). 
+ + :::python + meta = ole.get_metadata() + print('Author:', meta.author) + print('Title:', meta.title) + print('Creation date:', meta.create_time) + # print all metadata: + meta.dump() + +Available attributes include: + + :::text + codepage, title, subject, author, keywords, comments, template, + last_saved_by, revision_number, total_edit_time, last_printed, create_time, + last_saved_time, num_pages, num_words, num_chars, thumbnail, + creating_application, security, codepage_doc, category, presentation_target, + bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips, + scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty, + chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed, + version, dig_sig, content_type, content_status, language, doc_version + +See the source code of the OleMetadata class for more information. + + +## Parse a property stream + +get\_properties(path) can be used to parse any property stream that is not handled by get\_metadata. It returns a dictionary indexed by integers. Each integer is the index of the property, pointing to its value. For example in the standard property stream '\x05SummaryInformation', the document title is property #2, and the subject is #3. + + :::python + p = ole.getproperties('specialprops') + +By default as in the original PIL version, timestamp properties are converted into a number of seconds since Jan 1,1601. With the option convert\_time, you can obtain more convenient Python datetime objects (UTC timezone). 
If some time properties should not be converted (such as total editing time in '\x05SummaryInformation'), the list of indexes can be passed as no_conversion (new in v0.25): + + :::python + p = ole.getproperties('specialprops', convert_time=True, no_conversion=[10]) + + +## Close the OLE file + +Unless your application is a simple script that terminates after processing an OLE file, do not forget to close each OleFileIO object after parsing to close the file on disk. (new in v0.22) + + :::python + ole.close() + +## Use olefile as a script for testing/debugging + +olefile can also be used as a script from the command-line to display the structure of an OLE file and its metadata, for example: + + :::text + olefile.py myfile.doc + +You can use the option -c to check that all streams can be read fully, and -d to generate very verbose debugging information. + +-------------------------------------------------------------------------- + +olefile documentation +--------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- [[OLE_Overview]] +- [[API]] and Usage diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,28 @@ + + + + + + + + + +

How to Suggest Improvements, Report Issues or Contribute

+

This is a personal open-source project, developed in my spare time. Any contribution, suggestion, feedback or bug report is welcome.

+

To suggest improvements, report a bug or any issue, please use the issue reporting page, providing all the information and files to reproduce the problem.

+

If possible, please attach the debugging output of olefile. To generate it, launch the following command:

+
    olefile.py -d -c file >debug.txt 
+

You may also contact the author directly to provide feedback.

+

The code is available in a Mercurial repository on Bitbucket. You may use it to submit enhancements using forks and pull requests.

+
+

olefile documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.md remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.md --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Contribute.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,28 @@ +How to Suggest Improvements, Report Issues or Contribute +======================================================== + +This is a personal open-source project, developed on my spare time. Any contribution, suggestion, feedback or bug report is welcome. + +To **suggest improvements, report a bug or any issue**, please use the [issue reporting page](https://bitbucket.org/decalage/olefileio_pl/issues?status=new&status=open), providing all the information and files to reproduce the problem. + +If possible please join the debugging output of olefile. For this, launch the following command : + + :::text + olefile.py -d -c file >debug.txt + + +You may also [contact the author](http://decalage.info/contact) directly to **provide feedback**. + +The code is available in [a Mercurial repository on Bitbucket](https://bitbucket.org/decalage/olefileio_pl). You may use it to **submit enhancements** using forks and pull requests. 
+ +-------------------------------------------------------------------------- + +olefile documentation +--------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- [[OLE_Overview]] +- [[API]] and Usage diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,62 @@ + + + + + + + + + +

olefile v0.42 documentation

+

This is the home page of the documentation for olefile. The latest version can be found online, otherwise a copy is provided in the doc subfolder of the package.

+

olefile is a Python package to parse, read and write Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc.
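All of the formats listed above share the same container layout, and can be recognized by a common signature before being handed to a parser. As a stdlib-only illustration (this is not part of olefile's API), every OLE2 file begins with the same 8 bytes:

```python
# All OLE2 / Compound File Binary files begin with the same 8-byte
# signature. Stdlib-only sketch, not part of olefile's API:
OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"

def looks_like_ole(data: bytes) -> bool:
    """Return True if the byte string starts with the OLE2 signature."""
    return data[:len(OLE_MAGIC)] == OLE_MAGIC
```

A ZIP-based .docx, by contrast, starts with b"PK", so it is not an OLE2 file.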

+

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the author - Repository - Updates on Twitter

+

Documentation pages

+ +

Features

  • Parse, read and write any OLE file such as Microsoft Office 97-2003 legacy document formats (Word .doc, Excel .xls, PowerPoint .ppt, Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook messages, StickyNotes, Zeiss AxioVision ZVI files, Olympus FluoView OIB files, etc.
  • List all the streams and storages contained in an OLE file
  • Open streams as files
  • Parse and read property streams, containing metadata of the file
  • Portable, pure Python module, no dependency

olefile can be used as an independent module or with PIL/Pillow.
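As a minimal sketch of standalone use, drawing on the API documented below (listdir, get_size, openstream); the `stream_path` helper is illustrative only, and olefile itself is a third-party install (pip install olefile):

```python
def stream_path(entry):
    # listdir() returns each stream path as a list of components;
    # join them with slashes for display (a syntax olefile also accepts)
    return "/".join(entry)

def dump_streams(path):
    import olefile  # third-party dependency: pip install olefile
    ole = olefile.OleFileIO(path)
    try:
        for entry in ole.listdir(streams=True, storages=False):
            print(stream_path(entry), ole.get_size(entry), "bytes")
    finally:
        ole.close()  # release the underlying file object
```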

+

olefile is mostly meant for developers. If you are looking for tools to analyze OLE files or to extract data (especially for security purposes such as malware analysis and forensics), then please also check my python-oletools, which are built upon olefile and provide a higher-level interface.

+

History

+

olefile is based on the OleFileIO module from PIL, the excellent Python Imaging Library, created and maintained by Fredrik Lundh. The olefile API is still compatible with PIL, but since 2005 I have improved the internal implementation significantly, with new features, bugfixes and a more robust design. From 2005 to 2014 the project was called OleFileIO_PL, and in 2014 I changed its name to olefile to celebrate its 9 years and its new write features.

+

As far as I know, this module is the most complete and robust Python implementation to read MS OLE2 files, portable across several operating systems. (Please tell me if you know of other similar Python modules.)

+

Since 2014 olefile/OleFileIO_PL has been integrated into Pillow, the friendly fork of PIL. olefile will continue to be improved as a separate project, and new versions will be merged into Pillow regularly.

+

Main improvements over the original version of OleFileIO in PIL:

  • Compatible with Python 3.x and 2.6+
  • Many bug fixes
  • Support for files larger than 6.8MB
  • Support for 64-bit platforms and big-endian CPUs
  • Robust: many checks to detect malformed files
  • Runtime option to choose if malformed files should be parsed or raise exceptions
  • Improved API
  • Metadata extraction, stream/storage timestamps (e.g. for document forensics)
  • Can open file-like objects
  • Added setup.py and install.bat to ease installation
  • More convenient slash-based syntax for stream paths
  • Write features

olefile documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.md remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.md --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Home.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,94 @@ +olefile v0.42 documentation +=========================== + +This is the home page of the documentation for olefile. The latest version can be found +[online](https://bitbucket.org/decalage/olefileio_pl/wiki), otherwise a copy is provided in the doc subfolder of the package. + +[olefile](http://www.decalage.info/olefile) is a Python package to parse, read and write +[Microsoft OLE2 files](http://en.wikipedia.org/wiki/Compound_File_Binary_Format) +(also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft +Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file +formats, McAfee antivirus quarantine files, etc. 
+ + +**Quick links:** +[Home page](http://www.decalage.info/olefile) - +[Download/Install](https://bitbucket.org/decalage/olefileio_pl/wiki/Install) - +[Documentation](https://bitbucket.org/decalage/olefileio_pl/wiki) - +[Report Issues/Suggestions/Questions](https://bitbucket.org/decalage/olefileio_pl/issues?status=new&status=open) - +[Contact the author](http://decalage.info/contact) - +[Repository](https://bitbucket.org/decalage/olefileio_pl) - +[Updates on Twitter](https://twitter.com/decalage2) + +Documentation pages +------------------- + +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- [[OLE_Overview]] +- [[API]] and Usage + + +Features +-------- + +- Parse, read and write any OLE file such as Microsoft Office 97-2003 legacy document formats (Word .doc, Excel .xls, + PowerPoint .ppt, Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook messages, StickyNotes, Zeiss + AxioVision ZVI files, Olympus FluoView OIB files, etc +- List all the streams and storages contained in an OLE file +- Open streams as files +- Parse and read property streams, containing metadata of the file +- Portable, pure Python module, no dependency + +olefile can be used as an independent module or with PIL/Pillow. + +olefile is mostly meant for developers. If you are looking for tools to analyze OLE files or to extract data +(especially for security purposes such as malware analysis and forensics), then please also check my +[python-oletools](http://www.decalage.info/python/oletools), which are built upon olefile and provide a higher-level +interface. + + +History +------- + +olefile is based on the OleFileIO module from [PIL](http://www.pythonware.com/products/pil/index.htm), the excellent +Python Imaging Library, created and maintained by Fredrik Lundh. The olefile API is still compatible with PIL, but +since 2005 I have improved the internal implementation significantly, with new features, bugfixes and a more robust +design. 
From 2005 to 2014 the project was called OleFileIO_PL, and in 2014 I changed its name to olefile to celebrate +its 9 years and its new write features. + +As far as I know, this module is the most complete and robust Python implementation to read MS OLE2 files, portable on +several operating systems. (please tell me if you know other similar Python modules) + +Since 2014 olefile/OleFileIO_PL has been integrated into [Pillow](http://python-imaging.github.io/), the friendly fork +of PIL. olefile will continue to be improved as a separate project, and new versions will be merged into Pillow regularly. + +Main improvements over the original version of OleFileIO in PIL: +---------------------------------------------------------------- + +- Compatible with Python 3.x and 2.6+ +- Many bug fixes +- Support for files larger than 6.8MB +- Support for 64 bits platforms and big-endian CPUs +- Robust: many checks to detect malformed files +- Runtime option to choose if malformed files should be parsed or raise exceptions +- Improved API +- Metadata extraction, stream/storage timestamps (e.g. for document forensics) +- Can open file-like objects +- Added setup.py and install.bat to ease installation +- More convenient slash-based syntax for stream paths +- Write features + + +-------------------------------------------------------------------------- + +olefile documentation +--------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- [[OLE_Overview]] +- [[API]] and Usage diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,30 @@ + + + + + + + + + +

How to Download and Install olefile

+

Pre-requisites

+

olefile requires Python 2.6, 2.7 or 3.x.

+

For Python 2.5 and older, olefile falls back to an older version (based on OleFileIO_PL 0.26) which might not contain all the enhancements implemented in olefile.

+

Download and Install

+

To use olefile with other Python applications or your own scripts, the simplest solution is to run pip install olefile or easy_install olefile to download and install the package in one go. pip has been included in the standard Python distribution since v2.7.9.

+

To update olefile if a previous version is already installed, run pip install -U olefile.

+

Otherwise you may download/extract the zip archive in a temporary directory and run python setup.py install.

+

On Windows you may simply double-click on install.bat.

+
+

olefile documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.md remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.md --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/Install.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,37 @@ +How to Download and Install olefile +=================================== + +Pre-requisites +-------------- + +olefile requires Python 2.6, 2.7 or 3.x. + +For Python 2.5 and older, olefile falls back to an older version (based on OleFileIO_PL 0.26) which might not contain +all the enhancements implemented in olefile. + + +Download and Install +-------------------- + +To use olefile with other Python applications or your own scripts, the simplest solution is to run **pip install olefile** +or **easy_install olefile**, to download and install the package in one go. Pip is part of the standard Python +distribution since v2.7.9. + +To update olefile if a previous version is already installed, run **pip install -U olefile**. + +Otherwise you may download/extract the [zip archive](https://bitbucket.org/decalage/olefileio_pl/downloads) in a +temporary directory and run **python setup.py install**. + +On Windows you may simply double-click on **install.bat**. 
+ +-------------------------------------------------------------------------- + +olefile documentation +--------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- [[OLE_Overview]] +- [[API]] and Usage diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,40 @@ + + + + + + + + + +

License for olefile

+

olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info)

+

All rights reserved.

+

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+
+

olefile is based on source code from the OleFileIO module of the Python Imaging Library (PIL) published by Fredrik Lundh under the following license:

+

The Python Imaging Library (PIL) is

  • Copyright (c) 1997-2005 by Secret Labs AB
  • Copyright (c) 1995-2005 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions:

+

Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of Secret Labs AB or the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission.

+

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

+
+

olefile documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.md remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.md --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/License.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,54 @@ +License for olefile +=================== + +olefile (formerly OleFileIO_PL) is copyright (c) 2005-2015 Philippe Lagadec ([http://www.decalage.info](http://www.decalage.info)) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +---------- + +olefile is based on source code from the OleFileIO module of the Python Imaging Library (PIL) published by Fredrik Lundh under the following license: + +The Python Imaging Library (PIL) is + +- Copyright (c) 1997-2005 by Secret Labs AB +- Copyright (c) 1995-2005 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of Secret Labs AB or the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +-------------------------------------------------------------------------- + +olefile documentation +--------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- [[OLE_Overview]] +- [[API]] and Usage diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,31 @@ + + + + + + + + + +

About the structure of OLE files

+

This page is part of the documentation for olefile. It provides a brief overview of the structure of Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc.

+

An OLE file can be seen as a mini file system or a Zip archive: it contains streams of data that look like files embedded within the OLE file. Each stream has a name. For example, the main stream of a MS Word document containing its text is named "WordDocument".

+

An OLE file can also contain storages. A storage is a folder that contains streams or other storages. For example, a MS Word document with VBA macros has a storage called "Macros".

+

Special streams can contain properties. A property is a specific value that can be used to store information such as the metadata of a document (title, author, creation date, etc.). Property stream names usually start with the character '\x05'.
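To make the '\x05' convention concrete, here is a stdlib-only sketch using hypothetical stream names, as listdir() might report them for a Word document (not output from a real file):

```python
# Hypothetical top-level stream names for a Word document;
# the '\x05' prefix marks property streams:
names = ["WordDocument", "\x05SummaryInformation",
         "\x05DocumentSummaryInformation", "Macros"]

# Select the property streams by their control-character prefix:
property_streams = [n for n in names if n.startswith("\x05")]
```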

+

For example, a typical MS Word document may look like this:

[Image: OLE_VBA_sample.png — tree of storages and streams in a sample Word document with VBA macros]

Go to the API page to see how to use all olefile features to parse OLE files.

+
+

olefile documentation

+ + + diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.md remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.md --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.md 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_Overview.md 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,29 @@ +About the structure of OLE files +================================ + +This page is part of the documentation for [olefile](https://bitbucket.org/decalage/olefileio_pl/wiki). It provides a brief overview of the structure of [Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format)](http://en.wikipedia.org/wiki/Compound_File_Binary_Format), such as Microsoft Office 97-2003 documents, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc. + +An OLE file can be seen as a mini file system or a Zip archive: It contains **streams** of data that look like files embedded within the OLE file. Each stream has a name. For example, the main stream of a MS Word document containing its text is named "WordDocument". + +An OLE file can also contain **storages**. A storage is a folder that contains streams or other storages. For example, a MS Word document with VBA macros has a storage called "Macros". + +Special streams can contain **properties**. A property is a specific value that can be used to store information such as the metadata of a document (title, author, creation date, etc). Property stream names usually start with the character '\x05'. + +For example, a typical MS Word document may look like this: + +![](OLE_VBA_sample.png) + +Go to the [[API]] page to see how to use all olefile features to parse OLE files. 
+ + +-------------------------------------------------------------------------- + +olefile documentation +--------------------- + +- [[Home]] +- [[License]] +- [[Install]] +- [[Contribute]], Suggest Improvements or Report Issues +- [[OLE_Overview]] +- [[API]] and Usage Binary files /tmp/tmpjjl7AX/dVIwszLzOn/remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_VBA_sample.png and /tmp/tmpjjl7AX/NgkrW0XO1N/remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/doc/OLE_VBA_sample.png differ diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/__init__.py remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/__init__.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/__init__.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,28 @@ +#!/usr/local/bin/python +# -*- coding: latin-1 -*- +""" +olefile (formerly OleFileIO_PL) + +Module to read/write Microsoft OLE2 files (also called Structured Storage or +Microsoft Compound Document File Format), such as Microsoft Office 97-2003 +documents, Image Composer and FlashPix files, Outlook messages, ... +This version is compatible with Python 2.6+ and 3.x + +Project website: http://www.decalage.info/olefile + +olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info) + +olefile is based on the OleFileIO module from the PIL library v1.1.6 +See: http://www.pythonware.com/products/pil/index.htm + +The Python Imaging Library (PIL) is + Copyright (c) 1997-2005 by Secret Labs AB + Copyright (c) 1995-2005 by Fredrik Lundh + +See source code and LICENSE.txt for information on usage and redistribution. 
+""" + +# first try to import olefile for Python 2.6+/3.x +from .olefile import * +# import metadata not covered by *: +from .olefile import __version__, __author__, __date__ diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/LICENSE.txt remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/LICENSE.txt --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,56 @@ +LICENSE for the olefile package: + +olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec +(http://www.decalage.info) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +---------- + +olefile is based on source code from the OleFileIO module of the Python +Imaging Library (PIL) published by Fredrik Lundh under the following license: + +The Python Imaging Library (PIL) is +- Copyright (c) 1997-2005 by Secret Labs AB +- Copyright (c) 1995-2005 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its associated +documentation, you agree that you have read, understood, and will comply with +the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its +associated documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appears in all copies, and that both +that copyright notice and this permission notice appear in supporting +documentation, and that the name of Secret Labs AB or the author not be used +in advertising or publicity pertaining to distribution of the software without +specific, written prior permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN +NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, +INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,432 @@ + + +Python: module olefile + + + + +
 
+ 
olefile (version 0.42, 2015-01-24)
index
.\olefile.py
+

# olefile (formerly OleFileIO_PL) version 0.42 2015-01-24
+#
+# Module to read/write Microsoft OLE2 files (also called Structured Storage or
+# Microsoft Compound Document File Format), such as Microsoft Office 97-2003
+# documents, Image Composer and FlashPix files, Outlook messages, ...
+# This version is compatible with Python 2.6+ and 3.x
+#
+# Project website: http://www.decalage.info/olefile
+#
+# olefile is copyright (c) 2005-2015 Philippe Lagadec (http://www.decalage.info)
+#
+# olefile is based on the OleFileIO module from the PIL library v1.1.6
+# See: http://www.pythonware.com/products/pil/index.htm
+#
+# The Python Imaging Library (PIL) is
+# Copyright (c) 1997-2005 by Secret Labs AB
+# Copyright (c) 1995-2005 by Fredrik Lundh
+#
+# See source code and LICENSE.txt for information on usage and redistribution.

+

+ + + + + +
 
+Modules
       
array    datetime    io    os    struct    sys

+ + + + + +
 
+Classes
       
+
OleFileIO
OleMetadata
+

+ + + + + + + +
 
+class OleFileIO
   OLE container object

+This class encapsulates the interface to an OLE 2 structured
+storage file.  Use the listdir and openstream methods to
+access the contents of this file.

+Object names are given as a list of strings, one for each subentry
+level.  The root entry should be omitted.  For example, the following
+code extracts all image streams from a Microsoft Image Composer file::

+    ole = OleFileIO("fan.mic")

+    for entry in ole.listdir():
+        if entry[1:2] == ["Image"]:
+            fin = ole.openstream(entry)
+            fout = open(entry[0], "wb")
+            while True:
+                s = fin.read(8192)
+                if not s:
+                    break
+                fout.write(s)

+You can use the viewer application provided with the Python Imaging
+Library to view the resulting files (which happens to be standard
+TIFF files).
 
 Methods defined here:
+
__init__(self, filename=None, raise_defects=40, write_mode=False, debug=False, path_encoding='utf-8')
Constructor for the OleFileIO class.

+:param filename: file to open.

+    - if filename is a string smaller than 1536 bytes, it is the path
+      of the file to open. (bytes or unicode string)
+    - if filename is a string longer than 1535 bytes, it is parsed
+      as the content of an OLE file in memory. (bytes type only)
+    - if filename is a file-like object (with read, seek and tell methods),
+      it is parsed as-is.

+:param raise_defects: minimal level for defects to be raised as exceptions.
+    (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
+    security-oriented application, see source code for details)

+:param write_mode: bool, if True the file is opened in read/write mode instead
+    of read-only by default.

+:param debug: bool, set debug mode

+:param path_encoding: None or str, name of the codec to use for path
+    names (streams and storages), or None for Unicode.
+    Unicode by default on Python 3+, UTF-8 on Python 2.x.
+    (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41)
+ +
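The filename dispatch rule described above can be summarized by a small helper; this is purely illustrative (it is not olefile's code, and the function name is hypothetical):

```python
def classify_filename_arg(filename):
    """Mirror the dispatch rule of the OleFileIO constructor (sketch)."""
    if hasattr(filename, "read"):
        return "file-like object"       # parsed as-is
    if isinstance(filename, bytes) and len(filename) >= 1536:
        return "in-memory OLE content"  # long bytes string: file content
    return "path"                       # shorter string: path on disk
```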
close(self)
close the OLE file, to release the file object
+ +
dumpdirectory(self)
Dump directory (for debugging only)
+ +
dumpfat(self, fat, firstindex=0)
Displays a part of FAT in human-readable form for debugging purpose
+ +
dumpsect(self, sector, firstindex=0)
Displays a sector in a human-readable form, for debugging purpose.
+ +
exists(self, filename)
Test if given filename exists as a stream or a storage in the OLE
+container.
+Note: filename is case-insensitive.

+:param filename: path of stream in storage tree. (see openstream for syntax)
+:returns: True if object exist, else False.
+ +
get_metadata(self)
Parse standard properties streams, return an OleMetadata object
+containing all the available metadata.
+(also stored in the metadata attribute of the OleFileIO object)

+new in version 0.25
+ +
get_rootentry_name(self)
Return root entry name. Should usually be 'Root Entry' or 'R' in most
+implementations.
+ +
get_size(self, filename)
Return size of a stream in the OLE container, in bytes.

+:param filename: path of stream in storage tree (see openstream for syntax)
+:returns: size in bytes (long integer)
+:exception IOError: if file not found
+:exception TypeError: if this is not a stream.
+ +
get_type(self, filename)
Test if given filename exists as a stream or a storage in the OLE
+container, and return its type.

+:param filename: path of stream in storage tree. (see openstream for syntax)
+:returns: False if object does not exist, its entry type (>0) otherwise:

+    - STGTY_STREAM: a stream
+    - STGTY_STORAGE: a storage
+    - STGTY_ROOT: the root entry
+ +
getctime(self, filename)
Return creation time of a stream/storage.

+:param filename: path of stream/storage in storage tree. (see openstream for
+    syntax)
+:returns: None if creation time is null, a python datetime object
+    otherwise (UTC timezone)

+new in version 0.26
+ +
getmtime(self, filename)
Return modification time of a stream/storage.

+:param filename: path of stream/storage in storage tree. (see openstream for
+    syntax)
+:returns: None if modification time is null, a python datetime object
+    otherwise (UTC timezone)

+new in version 0.26
+ +
getproperties(self, filename, convert_time=False, no_conversion=None)
Return properties described in substream.

+:param filename: path of stream in storage tree (see openstream for syntax)
+:param convert_time: bool, if True timestamps will be converted to Python datetime
+:param no_conversion: None or list of int, timestamps not to be converted
+    (for example total editing time is not a real timestamp)

+:returns: a dictionary of values indexed by id (integer)
+ +
getsect(self, sect)
Read given sector from file on disk.

+:param sect: int, sector index
+:returns: a string containing the sector data.
+ +
listdir(self, streams=True, storages=False)
Return a list of streams and/or storages stored in this file

+:param streams: bool, include streams if True (True by default) - new in v0.26
+:param storages: bool, include storages if True (False by default) - new in v0.26
+    (note: the root storage is never included)
+:returns: list of stream and/or storage paths
+ +
loaddirectory(self, sect)
Load the directory.

+:param sect: sector index of directory stream.
+ +
loadfat(self, header)
Load the FAT table.
+ +
loadfat_sect(self, sect)
Adds the indexes of the given sector to the FAT

+:param sect: string containing the first FAT sector, or array of long integers
+:returns: index of last FAT sector.
+ +
loadminifat(self)
Load the MiniFAT table.
+ +
open(self, filename, write_mode=False)
Open an OLE2 file in read-only or read/write mode.
+Read and parse the header, FAT and directory.

+:param filename: string-like or file-like object, OLE file to parse

+    - if filename is a string smaller than 1536 bytes, it is the path
+      of the file to open. (bytes or unicode string)
+    - if filename is a string longer than 1535 bytes, it is parsed
+      as the content of an OLE file in memory. (bytes type only)
+    - if filename is a file-like object (with read, seek and tell methods),
+      it is parsed as-is.

+:param write_mode: bool, if True the file is opened in read/write mode instead
+    of read-only by default. (ignored if filename is not a path)
+ +
openstream(self, filename)
Open a stream as a read-only file object (BytesIO).
+Note: filename is case-insensitive.

+:param filename: path of stream in storage tree (except root entry), either:

+    - a string using Unix path syntax, for example:
+      'storage_1/storage_1.2/stream'
+    - or a list of storage filenames, path to the desired stream/storage.
+      Example: ['storage_1', 'storage_1.2', 'stream']

+:returns: file object (read-only)
+:exception IOError: if filename not found, or if this is not a stream.
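The two equivalent path forms described above can be converted into each other with trivial helpers (hypothetical names, sketched here for illustration only; olefile accepts both forms directly):

```python
def path_str2list(path):
    # split Unix-style path syntax into storage components, ignoring empty parts
    return [part for part in path.split('/') if part]

def path_list2str(parts):
    # join storage components back into Unix-style path syntax
    return '/'.join(parts)
```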
+ +
sect2array(self, sect)
Convert a sector to an array of 32-bit unsigned integers,
+swapping bytes on big-endian CPUs such as PowerPC (old Macs)
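On CPython this conversion amounts to reading the sector as little-endian 32-bit values; a minimal sketch, assuming a platform where array type 'I' is 4 bytes wide (true on CPython):

```python
import array
import sys

def sect2array_sketch(sect):
    # interpret the raw sector bytes as 32-bit unsigned integers
    a = array.array('I', sect)
    if sys.byteorder == 'big':
        # OLE files store integers little-endian on disk
        a.byteswap()
    return a
```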
+ +
write_sect(self, sect, data, padding='\x00')
Write given sector to file on disk.

+:param sect: int, sector index
+:param data: bytes, sector data
+:param padding: single byte, padding character if data < sector size
+ +
write_stream(self, stream_name, data)
Write a stream to disk. For now, it is only possible to replace an
+existing stream with data of the same size.

+:param stream_name: path of stream in storage tree (except root entry), either:

+    - a string using Unix path syntax, for example:
+      'storage_1/storage_1.2/stream'
+    - or a list of storage filenames, path to the desired stream/storage.
+      Example: ['storage_1', 'storage_1.2', 'stream']

+:param data: bytes, data to be written, must be the same size as the original
+    stream.
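The same-size restriction can be sketched as a simple guard (a hypothetical helper illustrating the documented constraint, not olefile's actual implementation):

```python
def check_same_size(original, data):
    # write_stream only replaces a stream with data of identical length
    if len(data) != len(original):
        raise ValueError('data must be the same size as the existing stream')
    return len(data)
```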
+ +

+class OleMetadata
   class to parse and store metadata from standard properties of OLE files.

+Available attributes:
+codepage, title, subject, author, keywords, comments, template,
+last_saved_by, revision_number, total_edit_time, last_printed, create_time,
+last_saved_time, num_pages, num_words, num_chars, thumbnail,
+creating_application, security, codepage_doc, category, presentation_target,
+bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
+scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
+chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
+version, dig_sig, content_type, content_status, language, doc_version

+Note: an attribute is set to None when not present in the properties of the
+OLE file.

+References for SummaryInformation stream:
+- http://msdn.microsoft.com/en-us/library/dd942545.aspx
+- http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
+- http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
+- http://msdn.microsoft.com/en-us/library/aa372045.aspx
+- http://sedna-soft.de/summary-information-stream/
+- http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

+References for DocumentSummaryInformation stream:
+- http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
+- http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
+- http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

+new in version 0.25
 
 Methods defined here:
+
__init__(self)
Constructor for OleMetadata
+All attributes are set to None by default
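The constructor's behaviour can be sketched as initializing every known attribute name to None (illustrative subset of the attribute lists above; `MetaSketch` is a hypothetical name):

```python
# subset of SUMMARY_ATTRIBS, for illustration only
SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords']

class MetaSketch:
    def __init__(self):
        # every property defaults to None until parsed from the file
        for name in SUMMARY_ATTRIBS:
            setattr(self, name, None)
```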
+ +
dump(self)
Dump all metadata, for debugging purposes.
+ +
parse_properties(self, olefile)
Parse standard properties of an OLE file, from the streams
+"SummaryInformation" and "DocumentSummaryInformation",
+if present.
+Properties are converted to strings, integers or python datetime objects.
+If a property is not present, its value is set to None.
+ +
+Data and other attributes defined here:
+
DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', 'slides', 'notes', 'hidden_slides', 'mm_clips', 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', 'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc', 'link_base', ...]
+ +
SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', 'template', 'last_saved_by', 'revision_number', 'total_edit_time', 'last_printed', 'create_time', 'last_saved_time', 'num_pages', 'num_words', 'num_chars', 'thumbnail', 'creating_application', 'security']
+ +

+Functions
       
debug = debug_pass(msg)
+
debug_pass(msg)
+
debug_print(msg)
+
filetime2datetime(filetime)
convert FILETIME (64 bits int) to Python datetime.datetime
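The conversion is a fixed-epoch offset; a self-contained sketch consistent with the FILETIME definition (100-nanosecond intervals since 1601-01-01 UTC):

```python
import datetime

def filetime2datetime_sketch(filetime):
    # FILETIME counts 100-ns ticks since the Windows epoch (1601-01-01)
    epoch = datetime.datetime(1601, 1, 1)
    # integer-divide by 10 to convert 100-ns ticks to microseconds
    return epoch + datetime.timedelta(microseconds=filetime // 10)
```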
+
i16(c, o=0)
Converts a 2-byte (16-bit) string to an integer.

+:param c: string containing bytes to convert
+:param o: offset of bytes to convert in string
+
i32(c, o=0)
Converts a 4-byte (32-bit) string to an integer.

+:param c: string containing bytes to convert
+:param o: offset of bytes to convert in string
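These helpers are equivalent to unpacking little-endian unsigned integers; a sketch using the standard struct module (hypothetical `_sketch` names to avoid shadowing the real functions):

```python
import struct

def i16_sketch(c, o=0):
    # 16-bit little-endian unsigned integer at offset o
    return struct.unpack_from('<H', c, o)[0]

def i32_sketch(c, o=0):
    # 32-bit little-endian unsigned integer at offset o
    return struct.unpack_from('<I', c, o)[0]
```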
+
i8(c)
# version for Python 2.x
+
isOleFile(filename)
Test if a file is an OLE container (according to the magic bytes in its header).

+:param filename: string-like or file-like object, OLE file to parse

+    - if filename is a string smaller than 1536 bytes, it is the path
+      of the file to open. (bytes or unicode string)
+    - if filename is a string longer than 1535 bytes, it is parsed
+      as the content of an OLE file in memory. (bytes type only)
+    - if filename is a file-like object (with read and seek methods),
+      it is parsed as-is.

+:returns: True if OLE, False otherwise.
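For an in-memory buffer, the check reduces to comparing the first eight bytes against the OLE2 magic number (the same MAGIC value listed in the Data section of this page); a minimal sketch:

```python
MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'  # OLE2 / Compound File signature

def looks_like_ole(data):
    # magic-bytes test on an in-memory buffer
    return data[:len(MAGIC)] == MAGIC
```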
+
set_debug_mode(debug_mode)
Set debug mode on or off, to control display of debugging messages.
+:param debug_mode: True or False
+

+Data
       DEBUG_MODE = False
+DEFAULT_PATH_ENCODING = 'utf-8'
+DEFECT_FATAL = 40
+DEFECT_INCORRECT = 30
+DEFECT_POTENTIAL = 20
+DEFECT_UNSURE = 10
+DIFSECT = 4294967292L
+ENDOFCHAIN = 4294967294L
+FATSECT = 4294967293L
+FREESECT = 4294967295L
+KEEP_UNICODE_NAMES = True
+MAGIC = '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
+MAXREGSECT = 4294967290L
+MAXREGSID = 4294967290L
+MINIMAL_OLEFILE_SIZE = 1536
+NOSTREAM = 4294967295L
+STGTY_EMPTY = 0
+STGTY_LOCKBYTES = 3
+STGTY_PROPERTY = 4
+STGTY_ROOT = 5
+STGTY_STORAGE = 1
+STGTY_STREAM = 2
+UINT32 = 'L'
+VT = {0: 'VT_EMPTY', 1: 'VT_NULL', 2: 'VT_I2', 3: 'VT_I4', 4: 'VT_R4', 5: 'VT_R8', 6: 'VT_CY', 7: 'VT_DATE', 8: 'VT_BSTR', 9: 'VT_DISPATCH', ...}
+VT_BLOB = 65
+VT_BLOB_OBJECT = 70
+VT_BOOL = 11
+VT_BSTR = 8
+VT_CARRAY = 28
+VT_CF = 71
+VT_CLSID = 72
+VT_CY = 6
+VT_DATE = 7
+VT_DECIMAL = 14
+VT_DISPATCH = 9
+VT_EMPTY = 0
+VT_ERROR = 10
+VT_FILETIME = 64
+VT_HRESULT = 25
+VT_I1 = 16
+VT_I2 = 2
+VT_I4 = 3
+VT_I8 = 20
+VT_INT = 22
+VT_LPSTR = 30
+VT_LPWSTR = 31
+VT_NULL = 1
+VT_PTR = 26
+VT_R4 = 4
+VT_R8 = 5
+VT_SAFEARRAY = 27
+VT_STORAGE = 67
+VT_STORED_OBJECT = 69
+VT_STREAM = 66
+VT_STREAMED_OBJECT = 68
+VT_UI1 = 17
+VT_UI2 = 18
+VT_UI4 = 19
+VT_UI8 = 21
+VT_UINT = 23
+VT_UNKNOWN = 13
+VT_USERDEFINED = 29
+VT_VARIANT = 12
+VT_VECTOR = 4096
+VT_VOID = 24
+WORD_CLSID = '00020900-0000-0000-C000-000000000046'
+__author__ = 'Philippe Lagadec'
+__date__ = '2015-01-24'
+__version__ = '0.42'
+keyword = 'VT_UNKNOWN'
+print_function = _Feature((2, 6, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0), 65536)
+var = 13

+Author
       Philippe Lagadec
+ \ No newline at end of file diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.py remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/olefile.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,2448 @@ +#!/usr/bin/env python + +# olefile (formerly OleFileIO_PL) +# +# Module to read/write Microsoft OLE2 files (also called Structured Storage or +# Microsoft Compound Document File Format), such as Microsoft Office 97-2003 +# documents, Image Composer and FlashPix files, Outlook messages, ... +# This version is compatible with Python 2.6+ and 3.x +# +# Project website: http://www.decalage.info/olefile +# +# olefile is copyright (c) 2005-2016 Philippe Lagadec (http://www.decalage.info) +# +# olefile is based on the OleFileIO module from the PIL library v1.1.6 +# See: http://www.pythonware.com/products/pil/index.htm +# +# The Python Imaging Library (PIL) is +# Copyright (c) 1997-2005 by Secret Labs AB +# Copyright (c) 1995-2005 by Fredrik Lundh +# +# See source code and LICENSE.txt for information on usage and redistribution. + + +# Since OleFileIO_PL v0.30, only Python 2.6+ and 3.x is supported +# This import enables print() as a function rather than a keyword +# (main requirement to be compatible with Python 3.x) +# The comment on the line below should be printed on Python 2.5 or older: +from __future__ import print_function # This version of olefile requires Python 2.6+ or 3.x. + + +__author__ = "Philippe Lagadec" +__date__ = "2016-04-26" +__version__ = '0.44' + +#--- LICENSE ------------------------------------------------------------------ + +# olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec +# (http://www.decalage.info) +# +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +# ---------- +# PIL License: +# +# olefile is based on source code from the OleFileIO module of the Python +# Imaging Library (PIL) published by Fredrik Lundh under the following license: + +# The Python Imaging Library (PIL) is +# Copyright (c) 1997-2005 by Secret Labs AB +# Copyright (c) 1995-2005 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its associated +# documentation, you agree that you have read, understood, and will comply with +# the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and its +# associated documentation for any purpose and without fee is hereby granted, +# provided that the above copyright notice appears in all copies, and that both +# that copyright notice and this permission notice appear in supporting +# documentation, and that the name of Secret Labs AB or the author(s) not be used +# in advertising or publicity pertaining to distribution of the software +# without specific, written prior permission. +# +# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS +# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. +# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL, +# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +# PERFORMANCE OF THIS SOFTWARE. 
+ +#----------------------------------------------------------------------------- +# CHANGELOG: (only olefile/OleFileIO_PL changes compared to PIL 1.1.6) +# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility +# (all changes flagged with [PL]) +# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise +# exceptions in OleStream.__init__() +# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat) +# - added some constants +# - added header values checks +# - added some docstrings +# - getsect: bugfix in case sectors >512 bytes +# - getsect: added conformity checks +# - DEBUG_MODE constant to activate debug display +# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments +# - updated license +# - converted tabs to 4 spaces +# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity +# - improved _unicode() to use Python 2.x unicode support +# - fixed bug in OleDirectoryEntry +# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops +# - fixed OleStream which didn't check stream size +# - added/improved many docstrings and comments +# - moved helper functions _unicode and _clsid out of +# OleFileIO class +# - improved OleFileIO._find() to add Unix path syntax +# - OleFileIO._find() is now case-insensitive +# - added get_type() and get_rootentry_name() +# - rewritten loaddirectory and OleDirectoryEntry +# 2007-11-27 v0.16 PL: - added OleDirectoryEntry.kids_dict +# - added detection of duplicate filenames in storages +# - added detection of duplicate references to streams +# - added get_size() and exists() to OleDirectoryEntry +# - added isOleFile to check header before parsing +# - added __all__ list to control public keywords in pydoc +# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory +# - improved _unicode(), added workarounds for Python <2.3 +# - added set_debug_mode and -d option to set debug mode +# - fixed bugs in OleFileIO.open and OleDirectoryEntry +# - 
added safety check in main for large or binary +# properties +# - allow size>0 for storages for some implementations +# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and +# streams +# - added option '-c' in main to check all streams +# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms +# (thanks to Ben G. and Martijn for reporting the bug) +# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str +# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs +# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn +# (https://bitbucket.org/decalage/olefileio_pl/issue/7) +# - added close method to OleFileIO (fixed issue #2) +# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr) +# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python +# datetime +# - main: displays properties with date format +# - new class OleMetadata to parse standard properties +# - added get_metadata method +# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata +# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps +# - OleMetaData: total_edit_time is now a number of seconds, +# not a timestamp +# - getproperties: added support for VT_BOOL, VT_INT, V_UINT +# - getproperties: filter out null chars from strings +# - getproperties: raise non-fatal defects instead of +# exceptions when properties cannot be parsed properly +# 2013-05-27 PL: - getproperties: improved exception handling +# - _raise_defect: added option to set exception type +# - all non-fatal issues are now recorded, and displayed +# when run as a script +# 2013-07-11 v0.26 PL: - added methods to get modification and creation times +# of a directory entry or a storage/stream +# - fixed parsing of direntry timestamps +# 2013-07-24 PL: - new options in listdir to list storages and/or streams +# 2014-02-04 v0.30 PL: - upgraded code to support Python 
3.x by Martin Panter +# - several fixes for Python 2.6 (xrange, MAGIC) +# - reused i32 from Pillow's _binary +# 2014-07-18 v0.31 - preliminary support for 4K sectors +# 2014-07-27 v0.31 PL: - a few improvements in OleFileIO.open (header parsing) +# - Fixed loadfat for large files with 4K sectors (issue #3) +# 2014-07-30 v0.32 PL: - added write_sect to write sectors to disk +# - added write_mode option to OleFileIO.__init__ and open +# 2014-07-31 PL: - fixed padding in write_sect for Python 3, added checks +# - added write_stream to write a stream to disk +# 2014-09-26 v0.40 PL: - renamed OleFileIO_PL to olefile +# 2014-11-09 NE: - added support for Jython (Niko Ehrenfeuchter) +# 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE +# data in a string buffer and file-like objects. +# 2014-11-21 PL: - updated comments according to Pillow's commits +# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1 +# to UTF-8 on Python 2.x (Unicode on Python 3.x) +# - added path_encoding option to override the default +# - fixed a bug in _list when a storage is empty +# 2015-04-17 v0.43 PL: - slight changes in OleDirectoryEntry +# 2015-10-19 - fixed issue #26 in OleFileIO.getproperties +# (using id and type as local variable names) +# 2015-10-29 - replaced debug() with proper logging +# - use optparse to handle command line options +# - improved attribute names in OleFileIO class +# 2015-11-05 - fixed issue #27 by correcting the MiniFAT sector +# cutoff size if invalid. 
+# 2016-02-02 - logging is disabled by default +# 2016-04-26 v0.44 PL: - added enable_logging +# - renamed _OleDirectoryEntry and _OleStream without '_' +# - in OleStream use _raise_defect instead of exceptions +# 2016-04-27 - added support for incomplete streams and incorrect +# directory entries (to read malformed documents) +# 2016-05-04 - fixed slight bug in OleStream + +#----------------------------------------------------------------------------- +# TODO (for version 1.0): +# + get rid of print statements, to simplify Python 2.x and 3.x support +# + add is_stream and is_storage +# + remove leading and trailing slashes where a path is used +# + add functions path_list2str and path_str2list +# + fix how all the methods handle unicode str and/or bytes as arguments +# + add path attrib to _OleDirEntry, set it once and for all in init or +# append_kids (then listdir/_list can be simplified) +# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ... +# - add underscore to each private method, to avoid their display in +# pydoc/epydoc documentation - Remove it for classes to be documented +# - replace all raised exceptions with _raise_defect (at least in OleFileIO) +# - merge code from OleStream and OleFileIO.getsect to read sectors +# (maybe add a class for FAT and MiniFAT ?) +# - add method to check all streams (follow sectors chains without storing all +# stream in memory, and report anomalies) +# - use OleDirectoryEntry.kids_dict to improve _find and _list ? 
+# - fix Unicode names handling (find some way to stay compatible with Py1.5.2) +# => if possible avoid converting names to Latin-1 +# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop) +# - rewrite OleFileIO.getproperties +# - improve docstrings to show more sample uses +# - see also original notes and FIXME below +# - remove all obsolete FIXMEs +# - OleMetadata: fix version attrib according to +# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx + +# IDEAS: +# - in OleFileIO._open and OleStream, use size=None instead of 0x7FFFFFFF for +# streams with unknown size +# - use arrays of int instead of long integers for FAT/MiniFAT, to improve +# performance and reduce memory usage ? (possible issue with values >2^31) +# - provide tests with unittest (may need write support to create samples) +# - move all debug code (and maybe dump methods) to a separate module, with +# a class which inherits OleFileIO ? +# - fix docstrings to follow epydoc format +# - add support for big endian byte order ? +# - create a simple OLE explorer with wxPython + +# FUTURE EVOLUTIONS to add write support: +# see issue #6 on Bitbucket: +# https://bitbucket.org/decalage/olefileio_pl/issue/6/improve-olefileio_pl-to-write-ole-files + +#----------------------------------------------------------------------------- +# NOTES from PIL 1.1.6: + +# History: +# 1997-01-20 fl Created +# 1997-01-22 fl Fixed 64-bit portability quirk +# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle) +# 2004-02-29 fl Changed long hex constants to signed integers +# +# Notes: +# FIXME: sort out sign problem (eliminate long hex constants) +# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"] +# FIXME: provide a glob mechanism function (using fnmatchcase) +# +# Literature: +# +# "FlashPix Format Specification, Appendix A", Kodak and Microsoft, +# September 1996. 
+# +# Quotes: +# +# "If this document and functionality of the Software conflict, +# the actual functionality of the Software represents the correct +# functionality" -- Microsoft, in the OLE format specification + +#------------------------------------------------------------------------------ + + +import io +import sys +import struct, array, os.path, datetime, logging + +#=== COMPATIBILITY WORKAROUNDS ================================================ + +#[PL] Define explicitly the public API to avoid private objects in pydoc: +#TODO: add more +# __all__ = ['OleFileIO', 'isOleFile', 'MAGIC'] + +# For Python 3.x, need to redefine long as int: +if str is not bytes: + long = int + +# Need to make sure we use xrange both on Python 2 and 3.x: +try: + # on Python 2 we need xrange: + iterrange = xrange +except: + # no xrange, for Python 3 it was renamed as range: + iterrange = range + +#[PL] workaround to fix an issue with array item size on 64 bits systems: +if array.array('L').itemsize == 4: + # on 32 bits platforms, long integers in an array are 32 bits: + UINT32 = 'L' +elif array.array('I').itemsize == 4: + # on 64 bits platforms, integers in an array are 32 bits: + UINT32 = 'I' +elif array.array('i').itemsize == 4: + # On 64 bit Jython, signed integers ('i') are the only way to store our 32 + # bit values in an array in a *somewhat* reasonable way, as the otherwise + # perfectly suited 'H' (unsigned int, 32 bits) results in a completely + # unusable behaviour. This is most likely caused by the fact that Java + # doesn't have unsigned values, and thus Jython's "array" implementation, + # which is based on "jarray", doesn't have them either. + # NOTE: to trick Jython into converting the values it would normally + # interpret as "signed" into "unsigned", a binary-and operation with + # 0xFFFFFFFF can be used. This way it is possible to use the same comparing + # operations on all platforms / implementations. 
The corresponding code + # lines are flagged with a 'JYTHON-WORKAROUND' tag below. + UINT32 = 'i' +else: + raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...') + + +#[PL] These workarounds were inspired from the Path module +# (see http://www.jorendorff.com/articles/python/path/) +#TODO: test with old Python versions + +# Pre-2.3 workaround for basestring. +try: + basestring +except NameError: + try: + # is Unicode supported (Python >2.0 or >1.6 ?) + basestring = (str, unicode) + except NameError: + basestring = str + +#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode +# if False (default PIL behaviour), all filenames are converted to Latin-1. +KEEP_UNICODE_NAMES = True + +if sys.version_info[0] < 3: + # On Python 2.x, the default encoding for path names is UTF-8: + DEFAULT_PATH_ENCODING = 'utf-8' +else: + # On Python 3.x, the default encoding for path names is Unicode (None): + DEFAULT_PATH_ENCODING = None + + +# === LOGGING ================================================================= + +class NullHandler(logging.Handler): + """ + Log Handler without output, to avoid printing messages if logging is not + configured by the main application. + Python 2.7 has logging.NullHandler, but this is necessary for 2.6: + see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library + """ + def emit(self, record): + pass + +def get_logger(name, level=logging.CRITICAL+1): + """ + Create a suitable logger object for this module. + The goal is not to change settings of the root logger, to avoid getting + other modules' logs on the screen. + If a logger exists with same name, reuse it. (Else it would have duplicate + handlers and messages would be doubled.) + The level is set to CRITICAL+1 by default, to avoid any logging. 
+ """ + # First, test if there is already a logger with the same name, else it + # will generate duplicate messages (due to duplicate handlers): + if name in logging.Logger.manager.loggerDict: + #NOTE: another less intrusive but more "hackish" solution would be to + # use getLogger then test if its effective level is not default. + logger = logging.getLogger(name) + # make sure level is OK: + logger.setLevel(level) + return logger + # get a new logger: + logger = logging.getLogger(name) + # only add a NullHandler for this logger, it is up to the application + # to configure its own logging: + logger.addHandler(NullHandler()) + logger.setLevel(level) + return logger + + +# a global logger object used for debugging: +log = get_logger('olefile') + + +def enable_logging(): + """ + Enable logging for this module (disabled by default). + This will set the module-specific logger level to NOTSET, which + means the main application controls the actual logging level. + """ + log.setLevel(logging.NOTSET) + + +#=== CONSTANTS =============================================================== + +# magic bytes that should be at the beginning of every OLE file: +MAGIC = b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' + +#[PL]: added constants for Sector IDs (from AAF specifications) +MAXREGSECT = 0xFFFFFFFA # (-6) maximum SECT +DIFSECT = 0xFFFFFFFC # (-4) denotes a DIFAT sector in a FAT +FATSECT = 0xFFFFFFFD # (-3) denotes a FAT sector in a FAT +ENDOFCHAIN = 0xFFFFFFFE # (-2) end of a virtual stream chain +FREESECT = 0xFFFFFFFF # (-1) unallocated sector + +#[PL]: added constants for Directory Entry IDs (from AAF specifications) +MAXREGSID = 0xFFFFFFFA # (-6) maximum directory entry ID +NOSTREAM = 0xFFFFFFFF # (-1) unallocated directory entry + +#[PL] object types in storage (from AAF specifications) +STGTY_EMPTY = 0 # empty directory entry (according to OpenOffice.org doc) +STGTY_STORAGE = 1 # element is a storage object +STGTY_STREAM = 2 # element is a stream object +STGTY_LOCKBYTES = 3 # 
element is an ILockBytes object +STGTY_PROPERTY = 4 # element is an IPropertyStorage object +STGTY_ROOT = 5 # element is a root storage + +# Unknown size for a stream (used by OleStream): +UNKNOWN_SIZE = 0x7FFFFFFF + +# +# -------------------------------------------------------------------- +# property types + +VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6; +VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11; +VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17; +VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23; +VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28; +VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64; +VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68; +VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72; +VT_VECTOR=0x1000; + +# map property id to name (for debugging purposes) + +VT = {} +for keyword, var in list(vars().items()): + if keyword[:3] == "VT_": + VT[var] = keyword + +# +# -------------------------------------------------------------------- +# Some common document types (root.clsid fields) + +WORD_CLSID = "00020900-0000-0000-C000-000000000046" +#TODO: check Excel, PPT, ... 
+ +#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect() +DEFECT_UNSURE = 10 # a case which looks weird, but not sure it's a defect +DEFECT_POTENTIAL = 20 # a potential defect +DEFECT_INCORRECT = 30 # an error according to specifications, but parsing + # can go on +DEFECT_FATAL = 40 # an error which cannot be ignored, parsing is + # impossible + +# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes +# (this is used in isOleFile and OleFile.open) +MINIMAL_OLEFILE_SIZE = 1536 + +#[PL] add useful constants to __all__: +# for key in list(vars().keys()): +# if key.startswith('STGTY_') or key.startswith('DEFECT_'): +# __all__.append(key) + + +#=== FUNCTIONS =============================================================== + +def isOleFile (filename): + """ + Test if a file is an OLE container (according to the magic bytes in its header). + + :param filename: string-like or file-like object, OLE file to parse + + - if filename is a string smaller than 1536 bytes, it is the path + of the file to open. (bytes or unicode string) + - if filename is a string longer than 1535 bytes, it is parsed + as the content of an OLE file in memory. (bytes type only) + - if filename is a file-like object (with read and seek methods), + it is parsed as-is. + + :returns: True if OLE, False otherwise. 
+ """ + # check if filename is a string-like or file-like object: + if hasattr(filename, 'read'): + # file-like object: use it directly + header = filename.read(len(MAGIC)) + # just in case, seek back to start of file: + filename.seek(0) + elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE: + # filename is a bytes string containing the OLE file to be parsed: + header = filename[:len(MAGIC)] + else: + # string-like object: filename of file on disk + header = open(filename, 'rb').read(len(MAGIC)) + if header == MAGIC: + return True + else: + return False + + +if bytes is str: + # version for Python 2.x + def i8(c): + return ord(c) +else: + # version for Python 3.x + def i8(c): + return c if c.__class__ is int else c[0] + + +#TODO: replace i16 and i32 with more readable struct.unpack equivalent? + +def i16(c, o = 0): + """ + Converts a 2-bytes (16 bits) string to an integer. + + :param c: string containing bytes to convert + :param o: offset of bytes to convert in string + """ + return i8(c[o]) | (i8(c[o+1])<<8) + + +def i32(c, o = 0): + """ + Converts a 4-bytes (32 bits) string to an integer. + + :param c: string containing bytes to convert + :param o: offset of bytes to convert in string + """ +## return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24)) +## # [PL]: added int() because "<<" gives long int since Python 2.4 + # copied from Pillow's _binary: + return i8(c[o]) | (i8(c[o+1])<<8) | (i8(c[o+2])<<16) | (i8(c[o+3])<<24) + + +def _clsid(clsid): + """ + Converts a CLSID to a human-readable string. + + :param clsid: string of length 16. + """ + assert len(clsid) == 16 + # if clsid is only made of null bytes, return an empty string: + # (PL: why not simply return the string with zeroes?) 
+ if not clsid.strip(b"\0"): + return "" + return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) % + ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) + + tuple(map(i8, clsid[8:16])))) + + + +def filetime2datetime(filetime): + """ + convert FILETIME (64 bits int) to Python datetime.datetime + """ + # TODO: manage exception when microseconds is too large + # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/ + _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0) + #log.debug('timedelta days=%d' % (filetime//(10*1000000*3600*24))) + return _FILETIME_null_date + datetime.timedelta(microseconds=filetime//10) + + + +#=== CLASSES ================================================================== + +class OleMetadata: + """ + class to parse and store metadata from standard properties of OLE files. + + Available attributes: + codepage, title, subject, author, keywords, comments, template, + last_saved_by, revision_number, total_edit_time, last_printed, create_time, + last_saved_time, num_pages, num_words, num_chars, thumbnail, + creating_application, security, codepage_doc, category, presentation_target, + bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips, + scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty, + chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed, + version, dig_sig, content_type, content_status, language, doc_version + + Note: an attribute is set to None when not present in the properties of the + OLE file. 
+ + References for SummaryInformation stream: + - http://msdn.microsoft.com/en-us/library/dd942545.aspx + - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx + - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx + - http://msdn.microsoft.com/en-us/library/aa372045.aspx + - http://sedna-soft.de/summary-information-stream/ + - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html + + References for DocumentSummaryInformation stream: + - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx + - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx + - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html + + new in version 0.25 + """ + + # attribute names for SummaryInformation stream properties: + # (ordered by property id, starting at 1) + SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords', 'comments', + 'template', 'last_saved_by', 'revision_number', 'total_edit_time', + 'last_printed', 'create_time', 'last_saved_time', 'num_pages', + 'num_words', 'num_chars', 'thumbnail', 'creating_application', + 'security'] + + # attribute names for DocumentSummaryInformation stream properties: + # (ordered by property id, starting at 1) + DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target', 'bytes', 'lines', 'paragraphs', + 'slides', 'notes', 'hidden_slides', 'mm_clips', + 'scale_crop', 'heading_pairs', 'titles_of_parts', 'manager', + 'company', 'links_dirty', 'chars_with_spaces', 'unused', 'shared_doc', + 'link_base', 'hlinks', 'hlinks_changed', 'version', 'dig_sig', + 'content_type', 'content_status', 'language', 'doc_version'] + + def __init__(self): + """ + Constructor for OleMetadata + All attributes are set to None by default + """ + # properties from SummaryInformation stream + self.codepage = None + self.title = None + self.subject = None + self.author = None + self.keywords = None + 
self.comments = None + self.template = None + self.last_saved_by = None + self.revision_number = None + self.total_edit_time = None + self.last_printed = None + self.create_time = None + self.last_saved_time = None + self.num_pages = None + self.num_words = None + self.num_chars = None + self.thumbnail = None + self.creating_application = None + self.security = None + # properties from DocumentSummaryInformation stream + self.codepage_doc = None + self.category = None + self.presentation_target = None + self.bytes = None + self.lines = None + self.paragraphs = None + self.slides = None + self.notes = None + self.hidden_slides = None + self.mm_clips = None + self.scale_crop = None + self.heading_pairs = None + self.titles_of_parts = None + self.manager = None + self.company = None + self.links_dirty = None + self.chars_with_spaces = None + self.unused = None + self.shared_doc = None + self.link_base = None + self.hlinks = None + self.hlinks_changed = None + self.version = None + self.dig_sig = None + self.content_type = None + self.content_status = None + self.language = None + self.doc_version = None + + + def parse_properties(self, olefile): + """ + Parse standard properties of an OLE file, from the streams + "\x05SummaryInformation" and "\x05DocumentSummaryInformation", + if present. + Properties are converted to strings, integers or python datetime objects. + If a property is not present, its value is set to None. 
+
+        """
+        # first set all attributes to None:
+        for attrib in (self.SUMMARY_ATTRIBS + self.DOCSUM_ATTRIBS):
+            setattr(self, attrib, None)
+        if olefile.exists("\x05SummaryInformation"):
+            # get properties from the stream:
+            # (converting timestamps to python datetime, except total_edit_time,
+            # which is property #10)
+            props = olefile.getproperties("\x05SummaryInformation",
+                convert_time=True, no_conversion=[10])
+            # store them into this object's attributes:
+            for i in range(len(self.SUMMARY_ATTRIBS)):
+                # ids for standard properties start at 0x01, until 0x13
+                value = props.get(i+1, None)
+                setattr(self, self.SUMMARY_ATTRIBS[i], value)
+        if olefile.exists("\x05DocumentSummaryInformation"):
+            # get properties from the stream:
+            props = olefile.getproperties("\x05DocumentSummaryInformation",
+                convert_time=True)
+            # store them into this object's attributes:
+            for i in range(len(self.DOCSUM_ATTRIBS)):
+                # ids for standard properties also start at 0x01
+                value = props.get(i+1, None)
+                setattr(self, self.DOCSUM_ATTRIBS[i], value)
+
+    def dump(self):
+        """
+        Dump all metadata, for debugging purposes.
+        """
+        print('Properties from SummaryInformation stream:')
+        for prop in self.SUMMARY_ATTRIBS:
+            value = getattr(self, prop)
+            print('- %s: %s' % (prop, repr(value)))
+        print('Properties from DocumentSummaryInformation stream:')
+        for prop in self.DOCSUM_ATTRIBS:
+            value = getattr(self, prop)
+            print('- %s: %s' % (prop, repr(value)))
+
+
+#--- OleStream ---------------------------------------------------------------
+
+class OleStream(io.BytesIO):
+    """
+    OLE2 Stream
+
+    Returns a read-only file object which can be used to read
+    the contents of an OLE stream (instance of the BytesIO class).
+    To open a stream, use the openstream method in the OleFile class.
+
+    This function can be used with either ordinary streams,
+    or ministreams, depending on the offset, sectorsize, and
+    fat table arguments.
+ + Attributes: + + - size: actual size of data stream, after it was opened. + """ + # FIXME: should store the list of sects obtained by following + # the fat chain, and load new sectors on demand instead of + # loading it all in one go. + + def __init__(self, fp, sect, size, offset, sectorsize, fat, filesize, olefileio): + """ + Constructor for OleStream class. + + :param fp: file object, the OLE container or the MiniFAT stream + :param sect: sector index of first sector in the stream + :param size: total size of the stream + :param offset: offset in bytes for the first FAT or MiniFAT sector + :param sectorsize: size of one sector + :param fat: array/list of sector indexes (FAT or MiniFAT) + :param filesize: size of OLE file (for debugging) + :param olefileio: OleFileIO object containing this stream + :returns: a BytesIO instance containing the OLE stream + """ + log.debug('OleStream.__init__:') + log.debug(' sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s' + %(sect,sect,size,offset,sectorsize,len(fat), repr(fp))) + self.ole = olefileio + #[PL] To detect malformed documents with FAT loops, we compute the + # expected number of sectors in the stream: + unknown_size = False + if size == UNKNOWN_SIZE: + # this is the case when called from OleFileIO._open(), and stream + # size is not known in advance (for example when reading the + # Directory stream). 
Then we can only guess maximum size: + size = len(fat)*sectorsize + # and we keep a record that size was unknown: + unknown_size = True + log.debug(' stream with UNKNOWN SIZE') + nb_sectors = (size + (sectorsize-1)) // sectorsize + log.debug('nb_sectors = %d' % nb_sectors) + # This number should (at least) be less than the total number of + # sectors in the given FAT: + if nb_sectors > len(fat): + self.ole._raise_defect(DEFECT_INCORRECT, 'malformed OLE document, stream too large') + # optimization(?): data is first a list of strings, and join() is called + # at the end to concatenate all in one string. + # (this may not be really useful with recent Python versions) + data = [] + # if size is zero, then first sector index should be ENDOFCHAIN: + if size == 0 and sect != ENDOFCHAIN: + log.debug('size == 0 and sect != ENDOFCHAIN:') + self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE sector index for empty stream') + #[PL] A fixed-length for loop is used instead of an undefined while + # loop to avoid DoS attacks: + for i in range(nb_sectors): + log.debug('Reading stream sector[%d] = %Xh' % (i, sect)) + # Sector index may be ENDOFCHAIN, but only if size was unknown + if sect == ENDOFCHAIN: + if unknown_size: + log.debug('Reached ENDOFCHAIN sector for stream with unknown size') + break + else: + # else this means that the stream is smaller than declared: + log.debug('sect=ENDOFCHAIN before expected size') + self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE stream') + # sector index should be within FAT: + if sect<0 or sect>=len(fat): + log.debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat))) + log.debug('i=%d / nb_sectors=%d' %(i, nb_sectors)) +## tmp_data = b"".join(data) +## f = open('test_debug.bin', 'wb') +## f.write(tmp_data) +## f.close() +## log.debug('data read so far: %d bytes' % len(tmp_data)) + self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range') + # stop reading here if the exception is ignored: + 
break
+            #TODO: merge this code with OleFileIO.getsect() ?
+            #TODO: check if this works with 4K sectors:
+            try:
+                fp.seek(offset + sectorsize * sect)
+            except:
+                log.debug('sect=%d, seek=%d, filesize=%d' %
+                    (sect, offset+sectorsize*sect, filesize))
+                self.ole._raise_defect(DEFECT_INCORRECT, 'OLE sector index out of range')
+                # stop reading here if the exception is ignored:
+                break
+            sector_data = fp.read(sectorsize)
+            # [PL] check if there was enough data:
+            # Note: if sector is the last of the file, sometimes it is not a
+            # complete sector (of 512 or 4K), so we may read less than
+            # sectorsize.
+            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
+                log.debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
+                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
+                log.debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
+                self.ole._raise_defect(DEFECT_INCORRECT, 'incomplete OLE sector')
+            data.append(sector_data)
+            # jump to next sector in the FAT:
+            try:
+                sect = fat[sect] & 0xFFFFFFFF  # JYTHON-WORKAROUND
+            except IndexError:
+                # [PL] if pointer is out of the FAT an exception is raised
+                self.ole._raise_defect(DEFECT_INCORRECT, 'incorrect OLE FAT, sector index out of range')
+                # stop reading here if the exception is ignored:
+                break
+        #[PL] Last sector should be an "end of chain" marker:
+        # if sect != ENDOFCHAIN:
+        #     raise IOError('incorrect last sector index in OLE stream')
+        data = b"".join(data)
+        # Data is truncated to the actual stream size:
+        if len(data) >= size:
+            log.debug('Read data of length %d, truncated to stream size %d' % (len(data), size))
+            data = data[:size]
+            # actual stream size is stored for future use:
+            self.size = size
+        elif unknown_size:
+            # actual stream size was not known, now we know the size of read
+            # data:
+            log.debug('Read data of length %d, the stream size was unknown' % len(data))
+            self.size = len(data)
+        else:
+            # read data is less than expected:
+            log.debug('Read data of 
length %d, less than expected stream size %d' % (len(data), size)) + # TODO: provide details in exception message + self.ole._raise_defect(DEFECT_INCORRECT, 'OLE stream size is less than declared') + self.size = len(data) + # when all data is read in memory, BytesIO constructor is called + io.BytesIO.__init__(self, data) + # Then the OleStream object can be used as a read-only file object. + + +#--- OleDirectoryEntry ------------------------------------------------------- + +class OleDirectoryEntry: + + """ + OLE2 Directory Entry + """ + #[PL] parsing code moved from OleFileIO.loaddirectory + + # struct to parse directory entries: + # <: little-endian byte order, standard sizes + # (note: this should guarantee that Q returns a 64 bits int) + # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes + # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 + # B: uint8, dir entry type (between 0 and 5) + # B: uint8, color: 0=black, 1=red + # I: uint32, index of left child node in the red-black tree, NOSTREAM if none + # I: uint32, index of right child node in the red-black tree, NOSTREAM if none + # I: uint32, index of child root node if it is a storage, else NOSTREAM + # 16s: CLSID, unique identifier (only used if it is a storage) + # I: uint32, user flags + # Q (was 8s): uint64, creation timestamp or zero + # Q (was 8s): uint64, modification timestamp or zero + # I: uint32, SID of first sector if stream or ministream, SID of 1st sector + # of stream containing ministreams if root entry, 0 otherwise + # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise + # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise + STRUCT_DIRENTRY = '<64sHBBIII16sIQQIII' + # size of a directory entry: 128 bytes + DIRENTRY_SIZE = 128 + assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE + + + def __init__(self, entry, sid, olefile): + """ + Constructor for an OleDirectoryEntry 
object. + Parses a 128-bytes entry from the OLE Directory stream. + + :param entry : string (must be 128 bytes long) + :param sid : index of this directory entry in the OLE file directory + :param olefile: OleFileIO containing this directory entry + """ + self.sid = sid + # ref to olefile is stored for future use + self.olefile = olefile + # kids is a list of children entries, if this entry is a storage: + # (list of OleDirectoryEntry objects) + self.kids = [] + # kids_dict is a dictionary of children entries, indexed by their + # name in lowercase: used to quickly find an entry, and to detect + # duplicates + self.kids_dict = {} + # flag used to detect if the entry is referenced more than once in + # directory: + self.used = False + # decode DirEntry + ( + self.name_raw, # 64s: string containing entry name in unicode UTF-16 (max 31 chars) + null char = 64 bytes + self.namelength, # H: uint16, number of bytes used in name buffer, including null = (len+1)*2 + self.entry_type, + self.color, + self.sid_left, + self.sid_right, + self.sid_child, + clsid, + self.dwUserFlags, + self.createTime, + self.modifyTime, + self.isectStart, + self.sizeLow, + self.sizeHigh + ) = struct.unpack(OleDirectoryEntry.STRUCT_DIRENTRY, entry) + if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]: + olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type') + # only first directory entry can (and should) be root: + if self.entry_type == STGTY_ROOT and sid != 0: + olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry') + if sid == 0 and self.entry_type != STGTY_ROOT: + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry') + #log.debug(struct.unpack(fmt_entry, entry[:len_entry])) + # name should be at most 31 unicode characters + null character, + # so 64 bytes in total (31*2 + 2): + if self.namelength>64: + olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length >64 bytes') + # if exception not raised, 
namelength is set to the maximum value: + self.namelength = 64 + # only characters without ending null char are kept: + self.name_utf16 = self.name_raw[:(self.namelength-2)] + #TODO: check if the name is actually followed by a null unicode character ([MS-CFB] 2.6.1) + #TODO: check if the name does not contain forbidden characters: + # [MS-CFB] 2.6.1: "The following characters are illegal and MUST NOT be part of the name: '/', '\', ':', '!'." + # name is converted from UTF-16LE to the path encoding specified in the OleFileIO: + self.name = olefile._decode_utf16_str(self.name_utf16) + + log.debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name))) + log.debug(' - type: %d' % self.entry_type) + log.debug(' - sect: %Xh' % self.isectStart) + log.debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left, + self.sid_right, self.sid_child)) + + # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes + # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1 + # or some other value so it cannot be raised as a defect in general: + if olefile.sectorsize == 512: + if self.sizeHigh != 0 and self.sizeHigh != 0xFFFFFFFF: + log.debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' % + (olefile.sectorsize, self.sizeLow, self.sizeHigh, self.sizeHigh)) + olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size') + self.size = self.sizeLow + else: + self.size = self.sizeLow + (long(self.sizeHigh)<<32) + log.debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, self.sizeLow, self.sizeHigh)) + + self.clsid = _clsid(clsid) + # a storage should have a null size, BUT some implementations such as + # Word 8 for Mac seem to allow non-null values => Potential defect: + if self.entry_type == STGTY_STORAGE and self.size != 0: + olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0') + # check if stream is not already referenced elsewhere: + if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0: + if self.size < 
olefile.minisectorcutoff \ + and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT + # ministream object + minifat = True + else: + minifat = False + olefile._check_duplicate_stream(self.isectStart, minifat) + + + + def build_storage_tree(self): + """ + Read and build the red-black tree attached to this OleDirectoryEntry + object, if it is a storage. + Note that this method builds a tree of all subentries, so it should + only be called for the root object once. + """ + log.debug('build_storage_tree: SID=%d - %s - sid_child=%d' + % (self.sid, repr(self.name), self.sid_child)) + if self.sid_child != NOSTREAM: + # if child SID is not NOSTREAM, then this entry is a storage. + # Let's walk through the tree of children to fill the kids list: + self.append_kids(self.sid_child) + + # Note from OpenOffice documentation: the safest way is to + # recreate the tree because some implementations may store broken + # red-black trees... + + # in the OLE file, entries are sorted on (length, name). + # for convenience, we sort them on name instead: + # (see rich comparison methods in this class) + self.kids.sort() + + + def append_kids(self, child_sid): + """ + Walk through red-black tree of children of this directory entry to add + all of them to the kids list. (recursive method) + + :param child_sid : index of child directory entry to use, or None when called + first time for the root. (only used during recursion) + """ + log.debug('append_kids: child_sid=%d' % child_sid) + #[PL] this method was added to use simple recursion instead of a complex + # algorithm. 
+ # if this is not a storage or a leaf of the tree, nothing to do: + if child_sid == NOSTREAM: + return + # check if child SID is in the proper range: + if child_sid<0 or child_sid>=len(self.olefile.direntries): + self.olefile._raise_defect(DEFECT_INCORRECT, 'OLE DirEntry index out of range') + else: + # get child direntry: + child = self.olefile._load_direntry(child_sid) #direntries[child_sid] + log.debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d' + % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child)) + # the directory entries are organized as a red-black tree. + # (cf. Wikipedia for details) + # First walk through left side of the tree: + self.append_kids(child.sid_left) + # Check if its name is not already used (case-insensitive): + name_lower = child.name.lower() + if name_lower in self.kids_dict: + self.olefile._raise_defect(DEFECT_INCORRECT, + "Duplicate filename in OLE storage") + # Then the child_sid OleDirectoryEntry object is appended to the + # kids list and dictionary: + self.kids.append(child) + self.kids_dict[name_lower] = child + # Check if kid was not already referenced in a storage: + if child.used: + self.olefile._raise_defect(DEFECT_INCORRECT, + 'OLE Entry referenced more than once') + child.used = True + # Finally walk through right side of the tree: + self.append_kids(child.sid_right) + # Afterwards build kid's own tree if it's also a storage: + child.build_storage_tree() + + + def __eq__(self, other): + "Compare entries by name" + return self.name == other.name + + def __lt__(self, other): + "Compare entries by name" + return self.name < other.name + + def __ne__(self, other): + return not self.__eq__(other) + + def __le__(self, other): + return self.__eq__(other) or self.__lt__(other) + + # Reflected __lt__() and __le__() will be used for __gt__() and __ge__() + + #TODO: replace by the same function as MS implementation ? 
+
+    # (order by name length first, then case-insensitive order)
+
+
+    def dump(self, tab = 0):
+        "Dump this entry, and all its subentries (for debug purposes only)"
+        TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
+                 "(property)", "(root)"]
+        print(" "*tab + repr(self.name), TYPES[self.entry_type], end=' ')
+        if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
+            print(self.size, "bytes", end=' ')
+        print()
+        if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
+            print(" "*tab + "{%s}" % self.clsid)
+
+        for kid in self.kids:
+            kid.dump(tab + 2)
+
+
+    def getmtime(self):
+        """
+        Return modification time of a directory entry.
+
+        :returns: None if modification time is null, a python datetime object
+            otherwise (UTC timezone)
+
+        new in version 0.26
+        """
+        if self.modifyTime == 0:
+            return None
+        return filetime2datetime(self.modifyTime)
+
+
+    def getctime(self):
+        """
+        Return creation time of a directory entry.
+
+        :returns: None if creation time is null, a python datetime object
+            otherwise (UTC timezone)
+
+        new in version 0.26
+        """
+        if self.createTime == 0:
+            return None
+        return filetime2datetime(self.createTime)
+
+
+#--- OleFileIO ----------------------------------------------------------------
+
+class OleFileIO:
+    """
+    OLE container object
+
+    This class encapsulates the interface to an OLE 2 structured
+    storage file. Use the listdir and openstream methods to
+    access the contents of this file.
+
+    Object names are given as a list of strings, one for each subentry
+    level. The root entry should be omitted. 
For example, the following + code extracts all image streams from a Microsoft Image Composer file:: + + ole = OleFileIO("fan.mic") + + for entry in ole.listdir(): + if entry[1:2] == "Image": + fin = ole.openstream(entry) + fout = open(entry[0:1], "wb") + while True: + s = fin.read(8192) + if not s: + break + fout.write(s) + + You can use the viewer application provided with the Python Imaging + Library to view the resulting files (which happens to be standard + TIFF files). + """ + + def __init__(self, filename=None, raise_defects=DEFECT_FATAL, + write_mode=False, debug=False, path_encoding=DEFAULT_PATH_ENCODING): + """ + Constructor for the OleFileIO class. + + :param filename: file to open. + + - if filename is a string smaller than 1536 bytes, it is the path + of the file to open. (bytes or unicode string) + - if filename is a string longer than 1535 bytes, it is parsed + as the content of an OLE file in memory. (bytes type only) + - if filename is a file-like object (with read, seek and tell methods), + it is parsed as-is. + + :param raise_defects: minimal level for defects to be raised as exceptions. + (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a + security-oriented application, see source code for details) + + :param write_mode: bool, if True the file is opened in read/write mode instead + of read-only by default. + + :param debug: bool, set debug mode (deprecated, not used anymore) + + :param path_encoding: None or str, name of the codec to use for path + names (streams and storages), or None for Unicode. + Unicode by default on Python 3+, UTF-8 on Python 2.x. 
+ (new in olefile 0.42, was hardcoded to Latin-1 until olefile v0.41) + """ + # minimal level for defects to be raised as exceptions: + self._raise_defects_level = raise_defects + # list of defects/issues not raised as exceptions: + # tuples of (exception type, message) + self.parsing_issues = [] + self.write_mode = write_mode + self.path_encoding = path_encoding + self._filesize = None + self.fp = None + if filename: + self.open(filename, write_mode=write_mode) + + + def _raise_defect(self, defect_level, message, exception_type=IOError): + """ + This method should be called for any defect found during file parsing. + It may raise an IOError exception according to the minimal level chosen + for the OleFileIO object. + + :param defect_level: defect level, possible values are: + + - DEFECT_UNSURE : a case which looks weird, but not sure it's a defect + - DEFECT_POTENTIAL : a potential defect + - DEFECT_INCORRECT : an error according to specifications, but parsing can go on + - DEFECT_FATAL : an error which cannot be ignored, parsing is impossible + + :param message: string describing the defect, used with raised exception. + :param exception_type: exception class to be raised, IOError by default + """ + # added by [PL] + if defect_level >= self._raise_defects_level: + log.error(message) + raise exception_type(message) + else: + # just record the issue, no exception raised: + self.parsing_issues.append((exception_type, message)) + log.warning(message) + + + def _decode_utf16_str(self, utf16_str, errors='replace'): + """ + Decode a string encoded in UTF-16 LE format, as found in the OLE + directory or in property streams. Return a string encoded + according to the path_encoding specified for the OleFileIO object. 
+ + :param utf16_str: bytes string encoded in UTF-16 LE format + :param errors: str, see python documentation for str.decode() + :return: str, encoded according to path_encoding + """ + unicode_str = utf16_str.decode('UTF-16LE', errors) + if self.path_encoding: + # an encoding has been specified for path names: + return unicode_str.encode(self.path_encoding, errors) + else: + # path_encoding=None, return the Unicode string as-is: + return unicode_str + + + def open(self, filename, write_mode=False): + """ + Open an OLE2 file in read-only or read/write mode. + Read and parse the header, FAT and directory. + + :param filename: string-like or file-like object, OLE file to parse + + - if filename is a string smaller than 1536 bytes, it is the path + of the file to open. (bytes or unicode string) + - if filename is a string longer than 1535 bytes, it is parsed + as the content of an OLE file in memory. (bytes type only) + - if filename is a file-like object (with read, seek and tell methods), + it is parsed as-is. + + :param write_mode: bool, if True the file is opened in read/write mode instead + of read-only by default. (ignored if filename is not a path) + """ + self.write_mode = write_mode + #[PL] check if filename is a string-like or file-like object: + # (it is better to check for a read() method) + if hasattr(filename, 'read'): + #TODO: also check seek and tell methods? 
+ # file-like object: use it directly + self.fp = filename + elif isinstance(filename, bytes) and len(filename) >= MINIMAL_OLEFILE_SIZE: + # filename is a bytes string containing the OLE file to be parsed: + # convert it to BytesIO + self.fp = io.BytesIO(filename) + else: + # string-like object: filename of file on disk + if self.write_mode: + # open file in mode 'read with update, binary' + # According to https://docs.python.org/2/library/functions.html#open + # 'w' would truncate the file, 'a' may only append on some Unixes + mode = 'r+b' + else: + # read-only mode by default + mode = 'rb' + self.fp = open(filename, mode) + # obtain the filesize by using seek and tell, which should work on most + # file-like objects: + #TODO: do it above, using getsize with filename when possible? + #TODO: fix code to fail with clear exception when filesize cannot be obtained + filesize=0 + self.fp.seek(0, os.SEEK_END) + try: + filesize = self.fp.tell() + finally: + self.fp.seek(0) + self._filesize = filesize + log.debug('File size: %d bytes (%Xh)' % (self._filesize, self._filesize)) + + # lists of streams in FAT and MiniFAT, to detect duplicate references + # (list of indexes of first sectors of each stream) + self._used_streams_fat = [] + self._used_streams_minifat = [] + + header = self.fp.read(512) + + if len(header) != 512 or header[:8] != MAGIC: + log.debug('Magic = %r instead of %r' % (header[:8], MAGIC)) + self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file") + + # [PL] header structure according to AAF specifications: + ##Header + ##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)] + ##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, + ## // 0x1a, 0xe1} for current version + ##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/ + ## // GetClassFile uses root directory class id) + ##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is + ## // written by reference implementation + 
##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for + ## // 512-byte sectors, 4 for 4 KB sectors + ##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering + ##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two; + ## // typically 9 indicating 512-byte sectors + ##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two; + ## // typically 6 indicating 64-byte mini-sectors + ##USHORT _usReserved; // [22H,02] reserved, must be zero + ##ULONG _ulReserved1; // [24H,04] reserved, must be zero + ##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors, + ## // number of SECTs in directory chain for 4 KB + ## // sectors + ##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain + ##SECT _sectDirStart; // [30H,04] first SECT in the directory chain + ##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must + ## // be zero. The reference implementation + ## // does not support transactions + ##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream; + ## // typically 4096 bytes + ##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain + ##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain + ##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain + ##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain + ##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors + ##}; + + # [PL] header decoding: + # '<' indicates little-endian byte ordering for Intel (cf. 
struct module help) + fmt_header = '<8s16sHHHHHHLLLLLLLLLL' + header_size = struct.calcsize(fmt_header) + log.debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) ) + header1 = header[:header_size] + ( + self.header_signature, + self.header_clsid, + self.minor_version, + self.dll_version, + self.byte_order, + self.sector_shift, + self.mini_sector_shift, + self.reserved1, + self.reserved2, + self.num_dir_sectors, + self.num_fat_sectors, + self.first_dir_sector, + self.transaction_signature_number, + self.mini_stream_cutoff_size, + self.first_mini_fat_sector, + self.num_mini_fat_sectors, + self.first_difat_sector, + self.num_difat_sectors + ) = struct.unpack(fmt_header, header1) + log.debug( struct.unpack(fmt_header, header1)) + + if self.header_signature != MAGIC: + # OLE signature should always be present + self._raise_defect(DEFECT_FATAL, "incorrect OLE signature") + if self.header_clsid != bytearray(16): + # according to AAF specs, CLSID should always be zero + self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header") + log.debug( "Minor Version = %d" % self.minor_version ) + # TODO: according to MS-CFB, minor version should be 0x003E + log.debug( "DLL Version = %d (expected: 3 or 4)" % self.dll_version ) + if self.dll_version not in [3, 4]: + # version 3: usual format, 512 bytes per sector + # version 4: large format, 4K per sector + self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header") + log.debug( "Byte Order = %X (expected: FFFE)" % self.byte_order ) + if self.byte_order != 0xFFFE: + # For now only common little-endian documents are handled correctly + self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header") + # TODO: add big-endian support for documents created on Mac ? + # But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE. 
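The header decoding above can be exercised on a synthetic header. This is a standalone sketch using the same `<8s16sHHHHHHLLLLLLLLLL` layout; `parse_ole_header`, `FIELD_NAMES` and the sample field values are illustrative, not part of olefile itself.

```python
import struct

# Little-endian layout of the fixed 76-byte OLE2 header, matching the
# AAF/[MS-CFB] structure commented above.
OLE_HEADER_FMT = '<8s16sHHHHHHLLLLLLLLLL'
MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
FIELD_NAMES = (
    'signature', 'clsid', 'minor_version', 'dll_version', 'byte_order',
    'sector_shift', 'mini_sector_shift', 'reserved1', 'reserved2',
    'num_dir_sectors', 'num_fat_sectors', 'first_dir_sector',
    'transaction_signature', 'mini_stream_cutoff', 'first_mini_fat_sector',
    'num_mini_fat_sectors', 'first_difat_sector', 'num_difat_sectors')

def parse_ole_header(header):
    """Unpack the fixed part of an OLE2 header into a dict of fields."""
    size = struct.calcsize(OLE_HEADER_FMT)  # 76 bytes
    return dict(zip(FIELD_NAMES, struct.unpack(OLE_HEADER_FMT, header[:size])))

# Synthetic header with typical version-3 values (512-byte sectors):
hdr = MAGIC + bytes(16) + struct.pack(
    '<HHHHHHLLLLLLLLLL',
    0x3E, 3, 0xFFFE, 9, 6, 0,               # versions, byte order, shifts, reserved
    0, 0, 1, 1, 0, 0x1000, 2, 1, 0xFFFFFFFE, 0)
fields = parse_ole_header(hdr.ljust(512, b'\x00'))
```

Note that `'<'` disables struct padding, so the fixed header is exactly 8+16+6×2+10×4 = 76 bytes, leaving 109 DIFAT entries in the remainder of a 512-byte sector.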
+ self.sector_size = 2**self.sector_shift + log.debug( "Sector Size = %d bytes (expected: 512 or 4096)" % self.sector_size ) + if self.sector_size not in [512, 4096]: + self._raise_defect(DEFECT_INCORRECT, "incorrect sector_size in OLE header") + if (self.dll_version==3 and self.sector_size!=512) \ + or (self.dll_version==4 and self.sector_size!=4096): + self._raise_defect(DEFECT_INCORRECT, "sector_size does not match DllVersion in OLE header") + self.mini_sector_size = 2**self.mini_sector_shift + log.debug( "MiniFAT Sector Size = %d bytes (expected: 64)" % self.mini_sector_size ) + if self.mini_sector_size not in [64]: + self._raise_defect(DEFECT_INCORRECT, "incorrect mini_sector_size in OLE header") + if self.reserved1 != 0 or self.reserved2 != 0: + self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)") + log.debug( "Number of Directory sectors = %d" % self.num_dir_sectors ) + # Number of directory sectors (only allowed if DllVersion != 3) + if self.sector_size==512 and self.num_dir_sectors!=0: + self._raise_defect(DEFECT_INCORRECT, "incorrect number of directory sectors in OLE header") + log.debug( "Number of FAT sectors = %d" % self.num_fat_sectors ) + # num_fat_sectors = number of FAT sectors in the file + log.debug( "First Directory sector = %Xh" % self.first_dir_sector ) + # first_dir_sector = 1st sector containing the directory + log.debug( "Transaction Signature Number = %d" % self.transaction_signature_number ) + # Signature should be zero, BUT some implementations do not follow this + # rule => only a potential defect: + # (according to MS-CFB, may be != 0 for applications supporting file + # transactions) + if self.transaction_signature_number != 0: + self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (transaction_signature_number>0)") + log.debug( "Mini Stream cutoff size = %Xh (expected: 1000h)" % self.mini_stream_cutoff_size ) + # MS-CFB: This integer field MUST be set to 0x00001000. 
This field + # specifies the maximum size of a user-defined data stream allocated + # from the mini FAT and mini stream, and that cutoff is 4096 bytes. + # Any user-defined data stream larger than or equal to this cutoff size + # must be allocated as normal sectors from the FAT. + if self.mini_stream_cutoff_size != 0x1000: + self._raise_defect(DEFECT_INCORRECT, "incorrect mini_stream_cutoff_size in OLE header") + # if no exception is raised, the cutoff size is fixed to 0x1000 + log.warning('Fixing the mini_stream_cutoff_size to 4096 (mandatory value) instead of %d' % + self.mini_stream_cutoff_size) + self.mini_stream_cutoff_size = 0x1000 + # TODO: check if these values are OK + log.debug( "First MiniFAT sector = %Xh" % self.first_mini_fat_sector ) + log.debug( "Number of MiniFAT sectors = %d" % self.num_mini_fat_sectors ) + log.debug( "First DIFAT sector = %Xh" % self.first_difat_sector ) + log.debug( "Number of DIFAT sectors = %d" % self.num_difat_sectors ) + + # calculate the number of sectors in the file + # (-1 because header doesn't count) + self.nb_sect = ( (filesize + self.sector_size-1) // self.sector_size) - 1 + log.debug( "Maximum number of sectors in the file: %d (%Xh)" % (self.nb_sect, self.nb_sect)) + #TODO: change this test, because an OLE file MAY contain other data + # after the last sector. + + # file clsid + self.header_clsid = _clsid(header[8:24]) + + #TODO: remove redundant attributes, and fix the code which uses them? 
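The sector arithmetic above can be restated in isolation: the count is a ceiling division of the file size minus one, because the 512-byte header does not count as a sector, and (per [MS-CFB], as used later by getsect) sector n starts at byte offset (n+1) × sector size. The helper names in this sketch are illustrative, not olefile API:

```python
def sector_offset(sect, sector_size=512):
    """Byte offset of sector `sect`: the header fills the first
    sector_size bytes, so sector #0 starts at offset sector_size."""
    return (sect + 1) * sector_size

def max_sector_count(filesize, sector_size=512):
    """Maximum number of sectors: ceiling division of the file size,
    minus 1 because the header does not count as a sector."""
    return (filesize + sector_size - 1) // sector_size - 1
```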
+ self.sectorsize = self.sector_size #1 << i16(header, 30) + self.minisectorsize = self.mini_sector_size #1 << i16(header, 32) + self.minisectorcutoff = self.mini_stream_cutoff_size # i32(header, 56) + + # check known streams for duplicate references (these are always in FAT, + # never in MiniFAT): + self._check_duplicate_stream(self.first_dir_sector) + # check MiniFAT only if it is not empty: + if self.num_mini_fat_sectors: + self._check_duplicate_stream(self.first_mini_fat_sector) + # check DIFAT only if it is not empty: + if self.num_difat_sectors: + self._check_duplicate_stream(self.first_difat_sector) + + # Load file allocation tables + self.loadfat(header) + # Load directory. This sets both the direntries list (ordered by sid) + # and the root (ordered by hierarchy) members. + self.loaddirectory(self.first_dir_sector) + self.ministream = None + self.minifatsect = self.first_mini_fat_sector + + + def close(self): + """ + close the OLE file, to release the file object + """ + self.fp.close() + + + def _check_duplicate_stream(self, first_sect, minifat=False): + """ + Checks if a stream has not been already referenced elsewhere. + This method should only be called once for each known stream, and only + if stream size is not null. + + :param first_sect: int, index of first sector of the stream in FAT + :param minifat: bool, if True, stream is located in the MiniFAT, else in the FAT + """ + if minifat: + log.debug('_check_duplicate_stream: sect=%Xh in MiniFAT' % first_sect) + used_streams = self._used_streams_minifat + else: + log.debug('_check_duplicate_stream: sect=%Xh in FAT' % first_sect) + # some values can be safely ignored (not a real stream): + if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT): + return + used_streams = self._used_streams_fat + #TODO: would it be more efficient using a dict or hash values, instead + # of a list of long ? 
+ if first_sect in used_streams: + self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice') + else: + used_streams.append(first_sect) + + + def dumpfat(self, fat, firstindex=0): + """ + Display a part of FAT in human-readable form for debugging purposes + """ + # dictionary to convert special FAT values in human-readable strings + VPL = 8 # values per line (8+1 * 8+1 = 81) + fatnames = { + FREESECT: "..free..", + ENDOFCHAIN: "[ END. ]", + FATSECT: "FATSECT ", + DIFSECT: "DIFSECT " + } + nbsect = len(fat) + nlines = (nbsect+VPL-1)//VPL + print("index", end=" ") + for i in range(VPL): + print("%8X" % i, end=" ") + print() + for l in range(nlines): + index = l*VPL + print("%6X:" % (firstindex+index), end=" ") + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = fat[i] + aux = sect & 0xFFFFFFFF # JYTHON-WORKAROUND + if aux in fatnames: + name = fatnames[aux] + else: + if sect == i+1: + name = " --->" + else: + name = "%8X" % sect + print(name, end=" ") + print() + + + def dumpsect(self, sector, firstindex=0): + """ + Display a sector in a human-readable form, for debugging purposes + """ + VPL=8 # number of values per line (8+1 * 8+1 = 81) + tab = array.array(UINT32, sector) + if sys.byteorder == 'big': + tab.byteswap() + nbsect = len(tab) + nlines = (nbsect+VPL-1)//VPL + print("index", end=" ") + for i in range(VPL): + print("%8X" % i, end=" ") + print() + for l in range(nlines): + index = l*VPL + print("%6X:" % (firstindex+index), end=" ") + for i in range(index, index+VPL): + if i>=nbsect: + break + sect = tab[i] + name = "%8X" % sect + print(name, end=" ") + print() + + def sect2array(self, sect): + """ + convert a sector to an array of 32 bits unsigned integers, + swapping bytes on big endian CPUs such as PowerPC (old Macs) + """ + a = array.array(UINT32, sect) + # if CPU is big endian, swap bytes: + if sys.byteorder == 'big': + a.byteswap() + return a + + + def loadfat_sect(self, sect): + """ + Adds the indexes of the given sector to the 
FAT + + :param sect: string containing the first FAT sector, or array of long integers + :returns: index of last FAT sector. + """ + # a FAT sector is an array of ulong integers. + if isinstance(sect, array.array): + # if sect is already an array it is directly used + fat1 = sect + else: + # if it's a raw sector, it is parsed in an array + fat1 = self.sect2array(sect) + # Display the sector contents only if the logging level is debug: + if log.isEnabledFor(logging.DEBUG): + self.dumpsect(sect) + # The FAT is a sector chain starting at the first index of itself. + # initialize isect, just in case: + isect = None + for isect in fat1: + isect = isect & 0xFFFFFFFF # JYTHON-WORKAROUND + log.debug("isect = %X" % isect) + if isect == ENDOFCHAIN or isect == FREESECT: + # the end of the sector chain has been reached + log.debug("found end of sector chain") + break + # read the FAT sector + s = self.getsect(isect) + # parse it as an array of 32 bits integers, and add it to the + # global FAT array + nextfat = self.sect2array(s) + self.fat = self.fat + nextfat + return isect + + + def loadfat(self, header): + """ + Load the FAT table. + """ + # The 1st sector of the file contains sector numbers for the first 109 + # FAT sectors, right after the header which is 76 bytes long. + # (always 109, whatever the sector size: 512 bytes = 76+4*109) + # Additional sectors are described by DIF blocks + + log.debug('Loading the FAT table, starting with the 1st sector after the header') + sect = header[76:512] + log.debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) ) + #fat = [] + # [PL] FAT is an array of 32 bits unsigned ints, it's more effective + # to use an array than a list in Python. 
+ # It's initialized as empty first: + self.fat = array.array(UINT32) + self.loadfat_sect(sect) + #self.dumpfat(self.fat) +## for i in range(0, len(sect), 4): +## ix = i32(sect, i) +## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF: +## if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF: +## break +## s = self.getsect(ix) +## #fat = fat + [i32(s, i) for i in range(0, len(s), 4)] +## fat = fat + array.array(UINT32, s) + if self.num_difat_sectors != 0: + log.debug('DIFAT is used, because file size > 6.8MB.') + # [PL] There's a DIFAT because file is larger than 6.8MB + # some checks just in case: + if self.num_fat_sectors <= 109: + # there must be at least 109 blocks in header and the rest in + # DIFAT, so number of sectors must be >109. + self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors') + if self.first_difat_sector >= self.nb_sect: + # initial DIFAT block index must be valid + self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range') + log.debug( "DIFAT analysis..." 
) + # We compute the necessary number of DIFAT sectors : + # Number of pointers per DIFAT sector = (sectorsize/4)-1 + # (-1 because the last pointer is the next DIFAT sector number) + nb_difat_sectors = (self.sectorsize//4)-1 + # (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next DIFAT sector) + nb_difat = (self.num_fat_sectors-109 + nb_difat_sectors-1)//nb_difat_sectors + log.debug( "nb_difat = %d" % nb_difat ) + if self.num_difat_sectors != nb_difat: + raise IOError('incorrect DIFAT') + isect_difat = self.first_difat_sector + for i in iterrange(nb_difat): + log.debug( "DIFAT block %d, sector %X" % (i, isect_difat) ) + #TODO: check if corresponding FAT SID = DIFSECT + sector_difat = self.getsect(isect_difat) + difat = self.sect2array(sector_difat) + # Display the sector contents only if the logging level is debug: + if log.isEnabledFor(logging.DEBUG): + self.dumpsect(sector_difat) + self.loadfat_sect(difat[:nb_difat_sectors]) + # last DIFAT pointer is next DIFAT sector: + isect_difat = difat[nb_difat_sectors] + log.debug( "next DIFAT sector: %X" % isect_difat ) + # checks: + if isect_difat not in [ENDOFCHAIN, FREESECT]: + # last DIFAT pointer value must be ENDOFCHAIN or FREESECT + raise IOError('incorrect end of DIFAT') +## if len(self.fat) != self.num_fat_sectors: +## # FAT should contain num_fat_sectors blocks +## print("FAT length: %d instead of %d" % (len(self.fat), self.num_fat_sectors)) +## raise IOError('incorrect DIFAT') + else: + log.debug('No DIFAT, because file size < 6.8MB.') + # since FAT is read from fixed-size sectors, it may contain more values + # than the actual number of sectors in the file. 
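The DIFAT sizing above (109 FAT sector numbers in the header, then sector_size/4 − 1 entries per DIFAT sector, the last slot chaining to the next DIFAT sector) can be checked standalone. `difat_sector_count` is an illustrative name, not part of olefile:

```python
def difat_sector_count(num_fat_sectors, sector_size=512):
    """Number of DIFAT sectors needed to reference num_fat_sectors FAT
    sectors: the header lists the first 109; each DIFAT sector then holds
    sector_size//4 - 1 entries plus one pointer to the next DIFAT sector."""
    if num_fat_sectors <= 109:
        return 0
    per_sector = sector_size // 4 - 1          # 127 for 512-byte sectors
    remaining = num_fat_sectors - 109
    return (remaining + per_sector - 1) // per_sector  # ceiling division
```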
+ # Keep only the relevant sector indexes: + if len(self.fat) > self.nb_sect: + log.debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect)) + self.fat = self.fat[:self.nb_sect] + log.debug('FAT references %d sectors / Maximum %d sectors in file' % (len(self.fat), self.nb_sect)) + # Display the FAT contents only if the logging level is debug: + if log.isEnabledFor(logging.DEBUG): + log.debug('\nFAT:') + self.dumpfat(self.fat) + + + def loadminifat(self): + """ + Load the MiniFAT table. + """ + # MiniFAT is stored in a standard sub-stream, pointed to by a header + # field. + # NOTE: there are two sizes to take into account for this stream: + # 1) Stream size is calculated according to the number of sectors + # declared in the OLE header. This allocated stream may be more than + # needed to store the actual sector indexes. + # (self.num_mini_fat_sectors is the number of sectors of size self.sector_size) + stream_size = self.num_mini_fat_sectors * self.sector_size + # 2) Actually used size is calculated by dividing the MiniStream size + # (given by root entry size) by the size of mini sectors, *4 for + # 32 bits indexes: + nb_minisectors = (self.root.size + self.mini_sector_size-1) // self.mini_sector_size + used_size = nb_minisectors * 4 + log.debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' % + (self.minifatsect, self.num_mini_fat_sectors, used_size, stream_size, nb_minisectors)) + if used_size > stream_size: + # This is not really a problem, but may indicate a wrong implementation: + self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT') + # In any case, first read stream_size: + s = self._open(self.minifatsect, stream_size, force_FAT=True).read() + #[PL] Old code replaced by an array: + #self.minifat = [i32(s, i) for i in range(0, len(s), 4)] + self.minifat = self.sect2array(s) + # Then shrink the array to used size, to avoid indexes out of MiniStream: + 
log.debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors)) + self.minifat = self.minifat[:nb_minisectors] + log.debug('loadminifat(): len=%d' % len(self.minifat)) + # Display the FAT contents only if the logging level is debug: + if log.isEnabledFor(logging.DEBUG): + log.debug('\nMiniFAT:') + self.dumpfat(self.minifat) + + def getsect(self, sect): + """ + Read given sector from file on disk. + + :param sect: int, sector index + :returns: a string containing the sector data. + """ + # From [MS-CFB]: A sector number can be converted into a byte offset + # into the file by using the following formula: + # (sector number + 1) x Sector Size. + # This implies that sector #0 of the file begins at byte offset Sector + # Size, not at 0. + + # [PL] the original code in PIL was wrong when sectors are 4KB instead of + # 512 bytes: + #self.fp.seek(512 + self.sectorsize * sect) + #[PL]: added safety checks: + #print("getsect(%X)" % sect) + try: + self.fp.seek(self.sectorsize * (sect+1)) + except: + log.debug('getsect(): sect=%X, seek=%d, filesize=%d' % + (sect, self.sectorsize*(sect+1), self._filesize)) + self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range') + sector = self.fp.read(self.sectorsize) + if len(sector) != self.sectorsize: + log.debug('getsect(): sect=%X, read=%d, sectorsize=%d' % + (sect, len(sector), self.sectorsize)) + self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector') + return sector + + + def write_sect(self, sect, data, padding=b'\x00'): + """ + Write given sector to file on disk. 
+
+        :param sect: int, sector index
+        :param data: bytes, sector data
+        :param padding: single byte, padding character if data < sector size
+        """
+        if not isinstance(data, bytes):
+            raise TypeError("write_sect: data must be a bytes string")
+        if not isinstance(padding, bytes) or len(padding)!=1:
+            raise TypeError("write_sect: padding must be a bytes string of 1 char")
+        #TODO: we could allow padding=None for no padding at all
+        try:
+            self.fp.seek(self.sectorsize * (sect+1))
+        except:
+            log.debug('write_sect(): sect=%X, seek=%d, filesize=%d' %
+                      (sect, self.sectorsize*(sect+1), self._filesize))
+            self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
+        if len(data) < self.sectorsize:
+            # add padding
+            data += padding * (self.sectorsize - len(data))
+        elif len(data) > self.sectorsize:
+            raise ValueError("Data is larger than sector size")
+        self.fp.write(data)
+
+
+    def loaddirectory(self, sect):
+        """
+        Load the directory.
+
+        :param sect: sector index of directory stream.
+        """
+        log.debug('Loading the Directory:')
+        # The directory is stored in a standard
+        # substream, independent of its size.
+
+        # open directory stream as a read-only file:
+        # (stream size is not known in advance)
+        self.directory_fp = self._open(sect)
+
+        #[PL] to detect malformed documents and avoid DoS attacks, the maximum
+        # number of directory entries can be calculated:
+        max_entries = self.directory_fp.size // 128
+        log.debug('loaddirectory: size=%d, max_entries=%d' %
+                  (self.directory_fp.size, max_entries))
+
+        # Create list of directory entries
+        #self.direntries = []
+        # We start with a list of "None" objects
+        self.direntries = [None] * max_entries
+##        for sid in iterrange(max_entries):
+##            entry = fp.read(128)
+##            if not entry:
+##                break
+##            self.direntries.append(OleDirectoryEntry(entry, sid, self))
+        # load root entry:
+        root_entry = self._load_direntry(0)
+        # Root entry is the first entry:
+        self.root = self.direntries[0]
+        # TODO: read ALL directory entries (ignore bad entries?)
+        # TODO: adapt build_storage_tree to avoid duplicate reads
+        # for i in range(1, max_entries):
+        #     self._load_direntry(i)
+        # read and build all storage trees, starting from the root:
+        self.root.build_storage_tree()
+
+
+    def _load_direntry (self, sid):
+        """
+        Load a directory entry from the directory.
+        This method should only be called once for each storage/stream when
+        loading the directory.
+
+        :param sid: index of storage/stream in the directory.
+        :returns: an OleDirectoryEntry object
+
+        :exception IOError: if the entry has already been referenced.
+ """ + # check if SID is OK: + if sid<0 or sid>=len(self.direntries): + self._raise_defect(DEFECT_FATAL, "OLE directory index out of range") + # check if entry was already referenced: + if self.direntries[sid] is not None: + self._raise_defect(DEFECT_INCORRECT, + "double reference for OLE stream/storage") + # if exception not raised, return the object + return self.direntries[sid] + self.directory_fp.seek(sid * 128) + entry = self.directory_fp.read(128) + self.direntries[sid] = OleDirectoryEntry(entry, sid, self) + return self.direntries[sid] + + + def dumpdirectory(self): + """ + Dump directory (for debugging only) + """ + self.root.dump() + + + def _open(self, start, size = UNKNOWN_SIZE, force_FAT=False): + """ + Open a stream, either in FAT or MiniFAT according to its size. + (openstream helper) + + :param start: index of first sector + :param size: size of stream (or nothing if size is unknown) + :param force_FAT: if False (default), stream will be opened in FAT or MiniFAT + according to size. If True, it will always be opened in FAT. 
+ """ + log.debug('OleFileIO.open(): sect=%Xh, size=%d, force_FAT=%s' % + (start, size, str(force_FAT))) + # stream size is compared to the mini_stream_cutoff_size threshold: + if size < self.minisectorcutoff and not force_FAT: + # ministream object + if not self.ministream: + # load MiniFAT if it wasn't already done: + self.loadminifat() + # The first sector index of the miniFAT stream is stored in the + # root directory entry: + size_ministream = self.root.size + log.debug('Opening MiniStream: sect=%Xh, size=%d' % + (self.root.isectStart, size_ministream)) + self.ministream = self._open(self.root.isectStart, + size_ministream, force_FAT=True) + return OleStream(fp=self.ministream, sect=start, size=size, + offset=0, sectorsize=self.minisectorsize, + fat=self.minifat, filesize=self.ministream.size, + olefileio=self) + else: + # standard stream + return OleStream(fp=self.fp, sect=start, size=size, + offset=self.sectorsize, + sectorsize=self.sectorsize, fat=self.fat, + filesize=self._filesize, + olefileio=self) + + + def _list(self, files, prefix, node, streams=True, storages=False): + """ + listdir helper + + :param files: list of files to fill in + :param prefix: current location in storage tree (list of names) + :param node: current node (OleDirectoryEntry object) + :param streams: bool, include streams if True (True by default) - new in v0.26 + :param storages: bool, include storages if True (False by default) - new in v0.26 + (note: the root storage is never included) + """ + prefix = prefix + [node.name] + for entry in node.kids: + if entry.entry_type == STGTY_STORAGE: + # this is a storage + if storages: + # add it to the list + files.append(prefix[1:] + [entry.name]) + # check its kids + self._list(files, prefix, entry, streams, storages) + elif entry.entry_type == STGTY_STREAM: + # this is a stream + if streams: + # add it to the list + files.append(prefix[1:] + [entry.name]) + else: + self._raise_defect(DEFECT_INCORRECT, 'The directory tree contains an 
entry which is not a stream nor a storage.') + + + def listdir(self, streams=True, storages=False): + """ + Return a list of streams and/or storages stored in this file + + :param streams: bool, include streams if True (True by default) - new in v0.26 + :param storages: bool, include storages if True (False by default) - new in v0.26 + (note: the root storage is never included) + :returns: list of stream and/or storage paths + """ + files = [] + self._list(files, [], self.root, streams, storages) + return files + + + def _find(self, filename): + """ + Returns directory entry of given filename. (openstream helper) + Note: this method is case-insensitive. + + :param filename: path of stream in storage tree (except root entry), either: + + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - or a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + + :returns: sid of requested filename + :exception IOError: if file not found + """ + + # if filename is a string instead of a list, split it on slashes to + # convert to a list: + if isinstance(filename, basestring): + filename = filename.split('/') + # walk across storage tree, following given path: + node = self.root + for name in filename: + for kid in node.kids: + if kid.name.lower() == name.lower(): + break + else: + raise IOError("file not found") + node = kid + return node.sid + + + def openstream(self, filename): + """ + Open a stream as a read-only file object (BytesIO). + Note: filename is case-insensitive. + + :param filename: path of stream in storage tree (except root entry), either: + + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - or a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + + :returns: file object (read-only) + :exception IOError: if filename not found, or if this is not a stream. 
+ """ + sid = self._find(filename) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + raise IOError("this file is not a stream") + return self._open(entry.isectStart, entry.size) + + + def write_stream(self, stream_name, data): + """ + Write a stream to disk. For now, it is only possible to replace an + existing stream by data of the same size. + + :param stream_name: path of stream in storage tree (except root entry), either: + + - a string using Unix path syntax, for example: + 'storage_1/storage_1.2/stream' + - or a list of storage filenames, path to the desired stream/storage. + Example: ['storage_1', 'storage_1.2', 'stream'] + + :param data: bytes, data to be written, must be the same size as the original + stream. + """ + if not isinstance(data, bytes): + raise TypeError("write_stream: data must be a bytes string") + sid = self._find(stream_name) + entry = self.direntries[sid] + if entry.entry_type != STGTY_STREAM: + raise IOError("this is not a stream") + size = entry.size + if size != len(data): + raise ValueError("write_stream: data must be the same size as the existing stream") + if size < self.minisectorcutoff: + raise NotImplementedError("Writing a stream in MiniFAT is not implemented yet") + sect = entry.isectStart + # number of sectors to write + nb_sectors = (size + (self.sectorsize-1)) // self.sectorsize + log.debug('nb_sectors = %d' % nb_sectors) + for i in range(nb_sectors): +## try: +## self.fp.seek(offset + self.sectorsize * sect) +## except: +## log.debug('sect=%d, seek=%d' % +## (sect, offset+self.sectorsize*sect)) +## raise IOError('OLE sector index out of range') + # extract one sector from data, the last one being smaller: + if i<(nb_sectors-1): + data_sector = data [i*self.sectorsize : (i+1)*self.sectorsize] + #TODO: comment this if it works + assert(len(data_sector)==self.sectorsize) + else: + data_sector = data [i*self.sectorsize:] + #TODO: comment this if it works + log.debug('write_stream: size=%d sectorsize=%d 
data_sector=%Xh size%%sectorsize=%d' + % (size, self.sectorsize, len(data_sector), size % self.sectorsize)) + assert(len(data_sector) % self.sectorsize==size % self.sectorsize) + self.write_sect(sect, data_sector) +## self.fp.write(data_sector) + # jump to next sector in the FAT: + try: + sect = self.fat[sect] + except IndexError: + # [PL] if pointer is out of the FAT an exception is raised + raise IOError('incorrect OLE FAT, sector index out of range') + #[PL] Last sector should be a "end of chain" marker: + if sect != ENDOFCHAIN: + raise IOError('incorrect last sector index in OLE stream') + + + def get_type(self, filename): + """ + Test if given filename exists as a stream or a storage in the OLE + container, and return its type. + + :param filename: path of stream in storage tree. (see openstream for syntax) + :returns: False if object does not exist, its entry type (>0) otherwise: + + - STGTY_STREAM: a stream + - STGTY_STORAGE: a storage + - STGTY_ROOT: the root entry + """ + try: + sid = self._find(filename) + entry = self.direntries[sid] + return entry.entry_type + except: + return False + + + def getmtime(self, filename): + """ + Return modification time of a stream/storage. + + :param filename: path of stream/storage in storage tree. (see openstream for + syntax) + :returns: None if modification time is null, a python datetime object + otherwise (UTC timezone) + + new in version 0.26 + """ + sid = self._find(filename) + entry = self.direntries[sid] + return entry.getmtime() + + + def getctime(self, filename): + """ + Return creation time of a stream/storage. + + :param filename: path of stream/storage in storage tree. 
(see openstream for
+            syntax)
+        :returns: None if creation time is null, a python datetime object
+            otherwise (UTC timezone)
+
+        new in version 0.26
+        """
+        sid = self._find(filename)
+        entry = self.direntries[sid]
+        return entry.getctime()
+
+
+    def exists(self, filename):
+        """
+        Test if given filename exists as a stream or a storage in the OLE
+        container.
+        Note: filename is case-insensitive.
+
+        :param filename: path of stream in storage tree. (see openstream for syntax)
+        :returns: True if the object exists, else False.
+        """
+        try:
+            sid = self._find(filename)
+            return True
+        except:
+            return False
+
+
+    def get_size(self, filename):
+        """
+        Return size of a stream in the OLE container, in bytes.
+
+        :param filename: path of stream in storage tree (see openstream for syntax)
+        :returns: size in bytes (long integer)
+        :exception IOError: if file not found
+        :exception TypeError: if this is not a stream.
+        """
+        sid = self._find(filename)
+        entry = self.direntries[sid]
+        if entry.entry_type != STGTY_STREAM:
+            #TODO: Should it return zero instead of raising an exception ?
+            raise TypeError('object is not an OLE stream')
+        return entry.size
+
+
+    def get_rootentry_name(self):
+        """
+        Return root entry name. Should usually be 'Root Entry' or 'R' in most
+        implementations.
+        """
+        return self.root.name
+
+
+    def getproperties(self, filename, convert_time=False, no_conversion=None):
+        """
+        Return properties described in substream.
+ + :param filename: path of stream in storage tree (see openstream for syntax) + :param convert_time: bool, if True timestamps will be converted to Python datetime + :param no_conversion: None or list of int, timestamps not to be converted + (for example total editing time is not a real timestamp) + + :returns: a dictionary of values indexed by id (integer) + """ + #REFERENCE: [MS-OLEPS] https://msdn.microsoft.com/en-us/library/dd942421.aspx + # make sure no_conversion is a list, just to simplify code below: + if no_conversion == None: + no_conversion = [] + # stream path as a string to report exceptions: + streampath = filename + if not isinstance(streampath, str): + streampath = '/'.join(streampath) + + fp = self.openstream(filename) + + data = {} + + try: + # header + s = fp.read(28) + clsid = _clsid(s[8:24]) + + # format id + s = fp.read(20) + fmtid = _clsid(s[:16]) + fp.seek(i32(s, 16)) + + # get section + s = b"****" + fp.read(i32(fp.read(4))-4) + # number of properties: + num_props = i32(s, 4) + except BaseException as exc: + # catch exception while parsing property header, and only raise + # a DEFECT_INCORRECT then return an empty dict, because this is not + # a fatal error when parsing the whole file + msg = 'Error while parsing properties header in stream %s: %s' % ( + repr(streampath), exc) + self._raise_defect(DEFECT_INCORRECT, msg, type(exc)) + return data + + for i in range(num_props): + property_id = 0 # just in case of an exception + try: + property_id = i32(s, 8+i*8) + offset = i32(s, 12+i*8) + property_type = i32(s, offset) + + log.debug('property id=%d: type=%d offset=%X' % (property_id, property_type, offset)) + + # test for common types first (should perhaps use + # a dictionary instead?) 
+ + if property_type == VT_I2: # 16-bit signed integer + value = i16(s, offset+4) + if value >= 32768: + value = value - 65536 + elif property_type == VT_UI2: # 2-byte unsigned integer + value = i16(s, offset+4) + elif property_type in (VT_I4, VT_INT, VT_ERROR): + # VT_I4: 32-bit signed integer + # VT_ERROR: HRESULT, similar to 32-bit signed integer, + # see http://msdn.microsoft.com/en-us/library/cc230330.aspx + value = i32(s, offset+4) + elif property_type in (VT_UI4, VT_UINT): # 4-byte unsigned integer + value = i32(s, offset+4) # FIXME + elif property_type in (VT_BSTR, VT_LPSTR): + # CodePageString, see http://msdn.microsoft.com/en-us/library/dd942354.aspx + # size is a 32 bits integer, including the null terminator, and + # possibly trailing or embedded null chars + #TODO: if codepage is unicode, the string should be converted as such + count = i32(s, offset+4) + value = s[offset+8:offset+8+count-1] + # remove all null chars: + value = value.replace(b'\x00', b'') + elif property_type == VT_BLOB: + # binary large object (BLOB) + # see http://msdn.microsoft.com/en-us/library/dd942282.aspx + count = i32(s, offset+4) + value = s[offset+8:offset+8+count] + elif property_type == VT_LPWSTR: + # UnicodeString + # see http://msdn.microsoft.com/en-us/library/dd942313.aspx + # "the string should NOT contain embedded or additional trailing + # null characters." + count = i32(s, offset+4) + value = self._decode_utf16_str(s[offset+8:offset+8+count*2]) + elif property_type == VT_FILETIME: + value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32) + # FILETIME is a 64-bit int: "number of 100ns periods + # since Jan 1,1601". 
+ if convert_time and property_id not in no_conversion:
+ log.debug('Converting property #%d to python datetime, value=%d=%fs'
+ %(property_id, value, float(value)/10000000))
+ # convert FILETIME to Python datetime.datetime
+ # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
+ _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
+ log.debug('timedelta days=%d' % (value//(10*1000000*3600*24)))
+ value = _FILETIME_null_date + datetime.timedelta(microseconds=value//10)
+ else:
+ # legacy code kept for backward compatibility: returns a
+ # number of seconds since Jan 1,1601
+ value = value // 10000000 # seconds
+ elif property_type == VT_UI1: # 1-byte unsigned integer
+ value = i8(s[offset+4])
+ elif property_type == VT_CLSID:
+ value = _clsid(s[offset+4:offset+20])
+ elif property_type == VT_CF:
+ # PropertyIdentifier or ClipboardData??
+ # see http://msdn.microsoft.com/en-us/library/dd941945.aspx
+ count = i32(s, offset+4)
+ value = s[offset+8:offset+8+count]
+ elif property_type == VT_BOOL:
+ # VARIANT_BOOL, 16-bit bool, 0x0000=False, 0xFFFF=True
+ # see http://msdn.microsoft.com/en-us/library/cc237864.aspx
+ value = bool(i16(s, offset+4))
+ else:
+ value = None # everything else yields "None"
+ log.debug('property id=%d: type=%d not implemented in parser yet' % (property_id, property_type))
+
+ # missing: VT_EMPTY, VT_NULL, VT_R4, VT_R8, VT_CY, VT_DATE,
+ # VT_DECIMAL, VT_I1, VT_I8, VT_UI8,
+ # see http://msdn.microsoft.com/en-us/library/dd942033.aspx
+
+ # FIXME: add support for VT_VECTOR
+ # VT_VECTOR is a 32-bit uint giving the number of items, followed by
+ # the items in sequence. The VT_VECTOR value is combined with the
+ # type of items, e.g. VT_VECTOR|VT_BSTR
+ # see http://msdn.microsoft.com/en-us/library/dd942011.aspx
+
+ #print("%08x" % property_id, repr(value), end=" ")
+ #print("(%s)" % VT[i32(s, offset) & 0xFFF])
+
+ data[property_id] = value
+ except BaseException as exc:
+ # catch exception while parsing each property, and only raise
+ # a DEFECT_INCORRECT, because parsing can go on
+ msg = 'Error while parsing property id %d in stream %s: %s' % (
+ property_id, repr(streampath), exc)
+ self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
+
+ return data
+
+ def get_metadata(self):
+ """
+ Parse standard properties streams, return an OleMetadata object
+ containing all the available metadata.
+ (also stored in the metadata attribute of the OleFileIO object)
+
+ new in version 0.25
+ """
+ self.metadata = OleMetadata()
+ self.metadata.parse_properties(self)
+ return self.metadata
+
+#
+# --------------------------------------------------------------------
+# This script can be used to dump the directory of any OLE2 structured
+# storage file.
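The FILETIME handling in getproperties above can be illustrated standalone. This is a minimal sketch, not part of the patch; the helper name `filetime_to_datetime` is hypothetical, but the arithmetic mirrors the code above (100 ns intervals since 1601-01-01, divided by 10 to get microseconds):

```python
import datetime

def filetime_to_datetime(filetime):
    """Convert a Windows FILETIME (count of 100 ns intervals since
    1601-01-01) to a Python datetime, as getproperties does."""
    _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
    # 1 FILETIME unit = 100 ns = 0.1 microsecond, hence the // 10
    return _FILETIME_null_date + datetime.timedelta(microseconds=filetime // 10)

# 116444736000000000 * 100 ns = 11644473600 s, the well-known offset
# between the FILETIME epoch (1601) and the Unix epoch (1970):
print(filetime_to_datetime(116444736000000000))  # 1970-01-01 00:00:00
```

The `no_conversion` parameter exists precisely because some property values stored as FILETIME (e.g. total editing time) are durations, not timestamps, and this conversion would be meaningless for them.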
+ +if __name__ == "__main__": + + import sys, optparse + + DEFAULT_LOG_LEVEL = "warning" # Default log level + LOG_LEVELS = { + 'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, + 'critical': logging.CRITICAL + } + + usage = 'usage: %prog [options] [filename2 ...]' + parser = optparse.OptionParser(usage=usage) + parser.add_option("-c", action="store_true", dest="check_streams", + help='check all streams (for debugging purposes)') + parser.add_option("-d", action="store_true", dest="debug_mode", + help='debug mode, shortcut for -l debug (displays a lot of debug information, for developers only)') + parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, + help="logging level debug/info/warning/error/critical (default=%default)") + + (options, args) = parser.parse_args() + + print('olefile version %s %s - http://www.decalage.info/en/olefile\n' % (__version__, __date__)) + + # Print help if no arguments are passed + if len(args) == 0: + print(__doc__) + parser.print_help() + sys.exit() + + if options.debug_mode: + options.loglevel = 'debug' + + # setup logging to the console + logging.basicConfig(level=LOG_LEVELS[options.loglevel], format='%(levelname)-8s %(message)s') + + # also enable the module's logger: + enable_logging() + + for filename in args: + try: + ole = OleFileIO(filename)#, raise_defects=DEFECT_INCORRECT) + print("-" * 68) + print(filename) + print("-" * 68) + ole.dumpdirectory() + for streamname in ole.listdir(): + if streamname[-1][0] == "\005": + print("%r: properties" % streamname) + try: + props = ole.getproperties(streamname, convert_time=True) + props = sorted(props.items()) + for k, v in props: + #[PL]: avoid to display too large or binary values: + if isinstance(v, (basestring, bytes)): + if len(v) > 50: + v = v[:50] + if isinstance(v, bytes): + # quick and dirty binary check: + for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20, + 
21,22,23,24,25,26,27,28,29,30,31): + if c in bytearray(v): + v = '(binary data)' + break + print(" ", k, v) + except: + log.exception('Error while parsing property stream %r' % streamname) + + if options.check_streams: + # Read all streams to check if there are errors: + print('\nChecking streams...') + for streamname in ole.listdir(): + # print name using repr() to convert binary chars to \xNN: + print('-', repr('/'.join(streamname)),'-', end=' ') + st_type = ole.get_type(streamname) + if st_type == STGTY_STREAM: + print('size %d' % ole.get_size(streamname)) + # just try to read stream in memory: + ole.openstream(streamname) + else: + print('NOT a stream : type=%d' % st_type) + print() + +## for streamname in ole.listdir(): +## # print name using repr() to convert binary chars to \xNN: +## print('-', repr('/'.join(streamname)),'-', end=' ') +## print(ole.getmtime(streamname)) +## print() + + print('Modification/Creation times of all directory entries:') + for entry in ole.direntries: + if entry is not None: + print('- %s: mtime=%s ctime=%s' % (entry.name, + entry.getmtime(), entry.getctime())) + print() + + # parse and display metadata: + try: + meta = ole.get_metadata() + meta.dump() + except: + log.exception('Error while parsing metadata') + print() + #[PL] Test a few new methods: + root = ole.get_rootentry_name() + print('Root entry name: "%s"' % root) + if ole.exists('worddocument'): + print("This is a Word document.") + print("type of stream 'WordDocument':", ole.get_type('worddocument')) + print("size :", ole.get_size('worddocument')) + if ole.exists('macros/vba'): + print("This document may contain VBA macros.") + + # print parsing issues: + print('\nNon-fatal issues raised during parsing:') + if ole.parsing_issues: + for exctype, msg in ole.parsing_issues: + print('- %s: %s' % (exctype.__name__, msg)) + else: + print('None') + except: + log.exception('Error while parsing file %r' % filename) + +# this code was developed while listening to The Wedding 
Present "Sea Monsters" diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.html remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.html --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.html 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.html 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,81 @@ +

olefile (formerly OleFileIO_PL)

olefile is a Python package to parse, read and write Microsoft OLE2 files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), such as Microsoft Office 97-2003 documents, vbaProject.bin in MS Office 2007+ files, Image Composer and FlashPix files, Outlook messages, StickyNotes, several Microscopy file formats, McAfee antivirus quarantine files, etc.

Quick links: Home page - Download/Install - Documentation - Report Issues/Suggestions/Questions - Contact the author - Repository - Updates on Twitter

News

Follow all updates and news on Twitter: https://twitter.com/decalage2

- 2016-02-02 v0.43: fixed issues #26 and #27, better handling of malformed files, use python logging.
- 2015-01-25 v0.42: improved handling of special characters in stream/storage names on Python 2.x (using UTF-8 instead of Latin-1), fixed bug in listdir with empty storages.
- 2014-11-25 v0.41: OleFileIO.open and isOleFile now support OLE files stored in byte strings, fixed installer for python 3, added support for Jython (Niko Ehrenfeuchter)
- 2014-10-01 v0.40: renamed OleFileIO_PL to olefile, added initial write support for streams >4K, updated doc and license, improved the setup script.
- 2014-07-27 v0.31: fixed support for large files with 4K sectors, thanks to Niko Ehrenfeuchter, Martijn Berger and Dave Jones. Added test scripts from Pillow (by hugovk). Fixed setup for Python 3 (Martin Panter)
- 2014-02-04 v0.30: now compatible with Python 3.x, thanks to Martin Panter who did most of the hard work.
- 2013-07-24 v0.26: added methods to parse stream/storage timestamps, improved listdir to include storages, fixed parsing of direntry timestamps
- 2013-05-27 v0.25: improved metadata extraction, properties parsing and exception handling, fixed issue #12
- 2013-05-07 v0.24: new features to extract metadata (get_metadata method and OleMetadata class), improved getproperties to convert timestamps to Python datetime
- 2012-10-09: published python-oletools, a package of analysis tools based on OleFileIO_PL
- 2012-09-11 v0.23: added support for file-like objects, fixed issue #8
- 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 (added close method)
- 2011-10-20: code hosted on bitbucket to ease contributions and bug tracking
- 2010-01-24 v0.21: fixed support for big-endian CPUs, such as PowerPC Macs.
- 2009-12-11 v0.20: small bugfix in OleFileIO.open when filename is not plain str.
- 2009-12-10 v0.19: fixed support for 64 bits platforms (thanks to Ben G. and Martijn for reporting the bug)
- see changelog in source code for more info.

Download/Install

If you have pip or setuptools installed (pip is included in Python 2.7.9+), you may simply run pip install olefile or easy_install olefile for the first installation.

To update olefile, run pip install -U olefile.

Otherwise, see https://bitbucket.org/decalage/olefileio_pl/wiki/Install

Features

- Parse, read and write any OLE file such as Microsoft Office 97-2003 legacy document formats (Word .doc, Excel .xls, PowerPoint .ppt, Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook messages, StickyNotes, Zeiss AxioVision ZVI files, Olympus FluoView OIB files, etc.
- List all the streams and storages contained in an OLE file
- Open streams as files
- Parse and read property streams, containing metadata of the file
- Portable, pure Python module, no dependency

olefile can be used as an independent package or with PIL/Pillow.

olefile is mostly meant for developers. If you are looking for tools to analyze OLE files or to extract data (especially for security purposes such as malware analysis and forensics), then please also check my python-oletools, which are built upon olefile and provide a higher-level interface.

History

olefile is based on the OleFileIO module from PIL, the excellent Python Imaging Library, created and maintained by Fredrik Lundh. The olefile API is still compatible with PIL, but since 2005 I have improved the internal implementation significantly, with new features, bugfixes and a more robust design. From 2005 to 2014 the project was called OleFileIO_PL, and in 2014 I changed its name to olefile to celebrate its 9 years and its new write features.

As far as I know, olefile is the most complete and robust Python implementation to read MS OLE2 files, portable on several operating systems. (please tell me if you know other similar Python modules)

Since 2014 olefile/OleFileIO_PL has been integrated into Pillow, the friendly fork of PIL. olefile will continue to be improved as a separate project, and new versions will be merged into Pillow regularly.

Main improvements over the original version of OleFileIO in PIL:

- Compatible with Python 3.x and 2.6+
- Many bug fixes
- Support for files larger than 6.8MB
- Support for 64 bits platforms and big-endian CPUs
- Robust: many checks to detect malformed files
- Runtime option to choose if malformed files should be parsed or raise exceptions
- Improved API
- Metadata extraction, stream/storage timestamps (e.g. for document forensics)
- Can open file-like objects
- Added setup.py and install.bat to ease installation
- More convenient slash-based syntax for stream paths
- Write features

Documentation

Please see the online documentation for more information, especially the OLE overview and the API page which describe how to use olefile in Python applications. A copy of the same documentation is also provided in the doc subfolder of the olefile package.

Real-life examples

A real-life example: using OleFileIO_PL for malware analysis and forensics.

See also this paper about python tools for forensics, which features olefile.

License

olefile (formerly OleFileIO_PL) is copyright (c) 2005-2016 Philippe Lagadec (http://www.decalage.info)

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------

olefile is based on source code from the OleFileIO module of the Python Imaging Library (PIL) published by Fredrik Lundh under the following license:

The Python Imaging Library (PIL) is

- Copyright (c) 1997-2005 by Secret Labs AB
- Copyright (c) 1995-2005 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of Secret Labs AB or the author not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.rst remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.rst --- remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.rst 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/olefile/README.rst 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,226 @@ +olefile (formerly OleFileIO\_PL) +================================ + +`olefile `__ is a Python package to +parse, read and write `Microsoft OLE2 +files `__ +(also called Structured Storage, Compound File Binary Format or Compound +Document File Format), such as Microsoft Office 97-2003 documents, +vbaProject.bin in MS Office 2007+ files, Image Composer and FlashPix +files, Outlook messages, StickyNotes, several Microscopy file formats, +McAfee antivirus quarantine files, etc. + +**Quick links:** `Home page `__ - +`Download/Install `__ +- `Documentation `__ - +`Report +Issues/Suggestions/Questions `__ +- `Contact the author `__ - +`Repository `__ - `Updates +on Twitter `__ + +News +---- + +Follow all updates and news on Twitter: https://twitter.com/decalage2 + +- **2016-02-02 v0.43**: fixed issues + `#26 `__ + and + `#27 `__, + better handling of malformed files, use python logging. +- 2015-01-25 v0.42: improved handling of special characters in + stream/storage names on Python 2.x (using UTF-8 instead of Latin-1), + fixed bug in listdir with empty storages. +- 2014-11-25 v0.41: OleFileIO.open and isOleFile now support OLE files + stored in byte strings, fixed installer for python 3, added support + for Jython (Niko Ehrenfeuchter) +- 2014-10-01 v0.40: renamed OleFileIO\_PL to olefile, added initial + write support for streams >4K, updated doc and license, improved the + setup script. +- 2014-07-27 v0.31: fixed support for large files with 4K sectors, + thanks to Niko Ehrenfeuchter, Martijn Berger and Dave Jones. Added + test scripts from Pillow (by hugovk). 
Fixed setup for Python 3 + (Martin Panter) +- 2014-02-04 v0.30: now compatible with Python 3.x, thanks to Martin + Panter who did most of the hard work. +- 2013-07-24 v0.26: added methods to parse stream/storage timestamps, + improved listdir to include storages, fixed parsing of direntry + timestamps +- 2013-05-27 v0.25: improved metadata extraction, properties parsing + and exception handling, fixed `issue + #12 `__ +- 2013-05-07 v0.24: new features to extract metadata (get\_metadata + method and OleMetadata class), improved getproperties to convert + timestamps to Python datetime +- 2012-10-09: published + `python-oletools `__, a + package of analysis tools based on OleFileIO\_PL +- 2012-09-11 v0.23: added support for file-like objects, fixed `issue + #8 `__ +- 2012-02-17 v0.22: fixed issues #7 (bug in getproperties) and #2 + (added close method) +- 2011-10-20: code hosted on bitbucket to ease contributions and bug + tracking +- 2010-01-24 v0.21: fixed support for big-endian CPUs, such as PowerPC + Macs. +- 2009-12-11 v0.20: small bugfix in OleFileIO.open when filename is not + plain str. +- 2009-12-10 v0.19: fixed support for 64 bits platforms (thanks to Ben + G. and Martijn for reporting the bug) +- see changelog in source code for more info. + +Download/Install +---------------- + +If you have pip or setuptools installed (pip is included in Python +2.7.9+), you may simply run **pip install olefile** or **easy\_install +olefile** for the first installation. + +To update olefile, run **pip install -U olefile**. 
+ +Otherwise, see https://bitbucket.org/decalage/olefileio\_pl/wiki/Install + +Features +-------- + +- Parse, read and write any OLE file such as Microsoft Office 97-2003 + legacy document formats (Word .doc, Excel .xls, PowerPoint .ppt, + Visio .vsd, Project .mpp), Image Composer and FlashPix files, Outlook + messages, StickyNotes, Zeiss AxioVision ZVI files, Olympus FluoView + OIB files, etc +- List all the streams and storages contained in an OLE file +- Open streams as files +- Parse and read property streams, containing metadata of the file +- Portable, pure Python module, no dependency + +olefile can be used as an independent package or with PIL/Pillow. + +olefile is mostly meant for developers. If you are looking for tools to +analyze OLE files or to extract data (especially for security purposes +such as malware analysis and forensics), then please also check my +`python-oletools `__, which +are built upon olefile and provide a higher-level interface. + +History +------- + +olefile is based on the OleFileIO module from +`PIL `__, the +excellent Python Imaging Library, created and maintained by Fredrik +Lundh. The olefile API is still compatible with PIL, but since 2005 I +have improved the internal implementation significantly, with new +features, bugfixes and a more robust design. From 2005 to 2014 the +project was called OleFileIO\_PL, and in 2014 I changed its name to +olefile to celebrate its 9 years and its new write features. + +As far as I know, olefile is the most complete and robust Python +implementation to read MS OLE2 files, portable on several operating +systems. (please tell me if you know other similar Python modules) + +Since 2014 olefile/OleFileIO\_PL has been integrated into +`Pillow `__, the friendly fork of PIL. +olefile will continue to be improved as a separate project, and new +versions will be merged into Pillow regularly. 
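The isOleFile check mentioned in the news above (and the first thing any OLE parser does) amounts to testing the fixed 8-byte signature at the start of every Compound File Binary file. A minimal standalone sketch, assuming only the documented [MS-CFB] magic bytes; the function name `looks_like_ole` is hypothetical and not part of the olefile API:

```python
# Magic bytes at the start of every OLE2 / Compound File Binary file,
# per [MS-CFB]: D0 CF 11 E0 A1 B1 1A E1
OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"

def looks_like_ole(data):
    """Rough equivalent of the first check olefile.isOleFile performs:
    the file (or byte string) must begin with the CFB signature."""
    return data[:8] == OLE_MAGIC

print(looks_like_ole(OLE_MAGIC + b"\x00" * 504))  # True
print(looks_like_ole(b"PK\x03\x04"))              # False (ZIP, e.g. a .docx)
```

This is why OOXML documents (.docx, .xlsx) are not OLE files themselves, even though they may embed one (vbaProject.bin).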
+ +Main improvements over the original version of OleFileIO in PIL: +---------------------------------------------------------------- + +- Compatible with Python 3.x and 2.6+ +- Many bug fixes +- Support for files larger than 6.8MB +- Support for 64 bits platforms and big-endian CPUs +- Robust: many checks to detect malformed files +- Runtime option to choose if malformed files should be parsed or raise + exceptions +- Improved API +- Metadata extraction, stream/storage timestamps (e.g. for document + forensics) +- Can open file-like objects +- Added setup.py and install.bat to ease installation +- More convenient slash-based syntax for stream paths +- Write features + +Documentation +------------- + +Please see the `online +documentation `__ for +more information, especially the `OLE +overview `__ +and the `API +page `__ which +describe how to use olefile in Python applications. A copy of the same +documentation is also provided in the doc subfolder of the olefile +package. + +Real-life examples +------------------ + +A real-life example: `using OleFileIO\_PL for malware analysis and +forensics `__. + +See also `this +paper `__ +about python tools for forensics, which features olefile. + +License +------- + +olefile (formerly OleFileIO\_PL) is copyright (c) 2005-2016 Philippe +Lagadec (http://www.decalage.info) + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------- + +olefile is based on source code from the OleFileIO module of the Python +Imaging Library (PIL) published by Fredrik Lundh under the following +license: + +The Python Imaging Library (PIL) is + +- Copyright (c) 1997-2005 by Secret Labs AB +- Copyright (c) 1995-2005 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its associated +documentation, you agree that you have read, understood, and will comply +with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its +associated documentation for any purpose and without fee is hereby +granted, provided that the above copyright notice appears in all copies, +and that both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Secret Labs AB or the +author not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR +ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER +RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF +CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/CHANGELOG remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/CHANGELOG --- remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/CHANGELOG 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/CHANGELOG 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,142 @@ +########## PrettyTable 0.7 - Feb 17, 2013 ########### + +* Improved Python 2 and 3 compatibility (2.4-3.2). +* Improved support for non-Latin characters. Table widths should + now be calculated correctly for tables with e.g. Japanese text. +* Table contents can now be read in from a .csv file +* Table contents can now be read in from a DB-API compatible cursor +* Table contents can now be read in from a string containing a + HTML table (thanks to Christoph Robbert for submitting this patch!) +* new valign attribute controls vertical alignment of text when + some cells in a row have multiple lines of text and others don't. + (thanks to Google Code user maartendb for submitting this patch!) +* hrules attribute can now be set to HEADER, which draws a rule only + under the header row +* new vrules attribute controls drawing of vertical rules and can + be set to FRAME, ALL or NONE +* new header_style attribute controls formatting of text in table + headers and can be set to "cap", "title", "upper", "lower" or None +* Fixed a simple bug regarding validation of max_width (thanks to + Anthony Toole for pointing out this bug and providing a patch). 
+* Fixed a simple bug regarding initialisation of int_format value + for new tables (thanks to Ingo Schmiegel for pointing out this + bug!) +* Fixed a bug regarding some constructor keywords, such as "border", + being ignored (thanks to Google Code user antonio.s.messina for + reporting this bug). + +########## PrettyTable 0.6 - May 5, 2012 ########## + +* Code is now simultaneously compatible with Python 2 and 3 +* Replaced all setter methods with managed attributes +* All styling options can now be set persistently as managed attributes +* Added "add_style" method to make setting style options easily +* Added "del_row", "clear_rows" and "clear" methods to facilitate + removal of data from table. +* Added "copy" method to facilitate cloning of a table. +* Removed caching functionality, which added complexity and fragility + for relatively little gain +* Removed methods that just printed strings produced by get_string and + get_html_string - just use inbuilt print! +* Improved unicode support (thanks to Google Code user ru.w31rd0 for + patch!) +* Added support for decimal and floating point number formatting + support (thanks to Google Code user willfurnass for the suggestion!) +* Added support for using a custom key sorting methods (thanks to + Google Code user amannijhawan for the suggestion!) +* Added support for line breaks in data (suggested and implemented by + Klein Stephane) +* Added support for max column widths (thanks to Tibor Arpas for the + suggestion!) +* Fixed table slicing +* Fixed bug where closing tags in HTML tables were not printed + (thanks to Google Code user kehander for reporting this bug!) +* Fixed HTML table sorting bug (thanks to Google Code user dougbeal + for reporting this bug!) +* Fixed bug whereby changing field_names did not recompute widths + (thanks to Google Code user denilsonsa for reporting this bug!) 
+
+########## PrettyTable 0.5 - May 26, 2009 ##########
+
+* Fixed a bug whereby printing with headers=False and border=False
+ would introduce an extraneous newline. Thanks to Alexander Lamaison
+ for reporting this bug.
+* When printing with headers=False, column widths will now be reduced
+ as appropriate in columns where the field name is wider than the
+ data. Thanks to Alexander Lamaison for suggesting this behaviour.
+* Support for Unicode has improved. Thanks to Chris Clark for
+ submitting this improvement.
+* The value of the "border" argument now correctly controls the
+ presence of a border when printing HTML tables with print_html or
+ get_html_string, instead of being incorrectly ignored. Thanks to
+ Chris Clark for fixing this.
+* The print_html and get_html_string methods now accept an
+ "attributes" argument which is a dictionary of name/value pairs to be
+ placed inside the <table> tag (so you can, e.g. set class, name or id
+ values in order to style your table with CSS). Thanks to Chris Clark
+ for submitting this feature.
+* The print_html and get_html_string methods now, by default, do their
+ best to match the various formatting options in their HTML output.
+ They use inline CSS to adjust the alignment of data in columns, the
+ padding widths of columns and in some cases the border settings. You
+ can give either method a "format=False" attribute to turn this
+ behaviour off if you want to do your own styling. With "format=False"
+ the methods print a "bare bones" table, similar to the default
+ behaviour in 0.4.
+
+########## PrettyTable 0.4 - May 13, 2009 ##########
+
+* Added "add_column" method to enable building tables up column-by-column.
+* Added "print_HTML" and "get_HTML_string" methods to enable HTML table
+ production.
+* Added "set_border_chars" method to enable control over characters used to
+ draw the table border.
+* Added "set_left_padding" and "set_right_padding" methods to allow
+ independent padding control for both sides of a column.
+* Added "sortby" option to enable column sorting.
+* Added "header" option to enable switching off field name printing at top of
+ table.
+* Modified "hrules" option to enable greater control over presence of
+ horizontal lines.
+* Added "border" option to enable switching off all line printing.
+
+Thanks to Tim Cera, Chris Clark, Alexander Lamaison for suggesting and helping
+to test many of the new features in this release.
+
+########## PrettyTable 0.3 - May 01, 2009 ##########
+
+* Added "padding_width" option to control the number of spaces between the
+ vertical line rules at the edges of a column and its content. This can be
+ set as a keyword argument to the constructor or after instantiation using
+ the "set_padding_width" method. The value is set to 1 by default. If your
+ table is too wide for a small screen with this value, setting it to 0 might
+ help you squeeze it in.
+
+Thanks to Chris Clark for contributing a patch against 0.2.1 to add this
+feature!
+
+########## PrettyTable 0.2.1 - April 29, 2009 ##########
+
+* Caching no longer breaks when using the "printt(fields=[...])" syntax. The
+ list of fields was not hashable and hence could not be used as a dictionary
+ key. I fixed this using the output of the "cPickle" module's "dumps"
+ function as the dictionary key instead.
+* Horizontal lines are now the appropriate length when the above syntax is
+ used.
+
+Thanks to Julien Koesten for reporting these bugs and testing the fixes almost
+immediately after the release of 0.2!
+
+########## PrettyTable 0.2 - April 29, 2009 ##########
+
+* Added "get_string" method.
+* Added "__str__" method (which just calls "get_string") to enable nice
+ "print x" syntax.
+* Can now pass field names as a constructor argument.
+* Return values of "get_string" are cached in a dictionary that is only
+  cleared after a call to "add_row" or something else which invalidates the
+  cache.
+
+########## PrettyTable 0.1 - February 26, 2009 #########
+
+* Original release
diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/COPYING remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/COPYING
--- remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/COPYING 1970-01-01 00:00:00.000000000 +0000
+++ remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/COPYING 2016-11-04 21:28:21.000000000 +0000
@@ -0,0 +1,30 @@
+# Copyright (c) 2009-2013 Luke Maurits
+# All rights reserved.
+# With contributions from:
+#  * Chris Clark
+#  * Christoph Robbert
+#  * Klein Stephane
+#  * "maartendb"
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/prettytable.py remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/prettytable.py
--- remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/prettytable.py 1970-01-01 00:00:00.000000000 +0000
+++ remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/prettytable.py 2016-11-04 21:28:21.000000000 +0000
@@ -0,0 +1,1475 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2009-2013, Luke Maurits
+# All rights reserved.
+# With contributions from:
+#  * Chris Clark
+#  * Klein Stephane
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+__version__ = "0.7.2"
+
+import copy
+import csv
+import random
+import re
+import sys
+import textwrap
+import itertools
+import unicodedata
+
+py3k = sys.version_info[0] >= 3
+if py3k:
+    unicode = str
+    basestring = str
+    itermap = map
+    iterzip = zip
+    uni_chr = chr
+    from html.parser import HTMLParser
+else:
+    itermap = itertools.imap
+    iterzip = itertools.izip
+    uni_chr = unichr
+    from HTMLParser import HTMLParser
+
+if py3k and sys.version_info[1] >= 2:
+    from html import escape
+else:
+    from cgi import escape
+
+# hrule styles
+FRAME = 0
+ALL = 1
+NONE = 2
+HEADER = 3
+
+# Table styles
+DEFAULT = 10
+MSWORD_FRIENDLY = 11
+PLAIN_COLUMNS = 12
+RANDOM = 20
+
+_re = re.compile("\033\[[0-9;]*m")
+
+def _get_size(text):
+    lines = text.split("\n")
+    height = len(lines)
+    width = max([_str_block_width(line) for line in lines])
+    return (width, height)
+
+class PrettyTable(object):
+
+    def __init__(self, field_names=None, **kwargs):
+
+        """Return a new PrettyTable instance
+
+        Arguments:
+
+        encoding - Unicode encoding scheme used to decode any encoded input
+        field_names - list or tuple of field names
+        fields - list or tuple of field names to include in displays
+        start - index of first data row to include in output
+        end - index of last data row to include in output PLUS ONE (list slice style)
+        header - print a header showing field names (True or False)
+        header_style - stylisation to apply to field names in header ("cap", "title", "upper", "lower" or None)
+        border - print a border around the table (True or False)
+        hrules - controls printing of horizontal rules after rows. Allowed values: FRAME, HEADER, ALL, NONE
+        vrules - controls printing of vertical rules between columns. Allowed values: FRAME, ALL, NONE
+        int_format - controls formatting of integer data
+        float_format - controls formatting of floating point data
+        padding_width - number of spaces on either side of column data (only used if left and right paddings are None)
+        left_padding_width - number of spaces on left hand side of column data
+        right_padding_width - number of spaces on right hand side of column data
+        vertical_char - single character string used to draw vertical lines
+        horizontal_char - single character string used to draw horizontal lines
+        junction_char - single character string used to draw line junctions
+        sortby - name of field to sort rows by
+        sort_key - sorting key function, applied to data points before sorting
+        valign - default valign for each row (None, "t", "m" or "b")
+        reversesort - True or False to sort in descending or ascending order"""
+
+        self.encoding = kwargs.get("encoding", "UTF-8")
+
+        # Data
+        self._field_names = []
+        self._align = {}
+        self._valign = {}
+        self._max_width = {}
+        self._rows = []
+        if field_names:
+            self.field_names = field_names
+        else:
+            self._widths = []
+
+        # Options
+        self._options = "start end fields header border sortby reversesort sort_key attributes format hrules vrules".split()
+        self._options.extend("int_format float_format padding_width left_padding_width right_padding_width".split())
+        self._options.extend("vertical_char horizontal_char junction_char header_style valign xhtml print_empty".split())
+        for option in self._options:
+            if option in kwargs:
+                self._validate_option(option, kwargs[option])
+            else:
+                kwargs[option] = None
+
+        self._start = kwargs["start"] or 0
+        self._end = kwargs["end"] or None
+        self._fields = kwargs["fields"] or None
+
+        if kwargs["header"] in (True, False):
+            self._header = kwargs["header"]
+        else:
+            self._header = True
+        self._header_style = kwargs["header_style"] or None
+        if kwargs["border"] in (True, False):
+            self._border = kwargs["border"]
+        else:
+            self._border = True
+        self._hrules = kwargs["hrules"] or FRAME
+        self._vrules = kwargs["vrules"] or ALL
+
+        self._sortby = kwargs["sortby"] or None
+        if kwargs["reversesort"] in (True, False):
+            self._reversesort = kwargs["reversesort"]
+        else:
+            self._reversesort = False
+        self._sort_key = kwargs["sort_key"] or (lambda x: x)
+
+        self._int_format = kwargs["int_format"] or {}
+        self._float_format = kwargs["float_format"] or {}
+        self._padding_width = kwargs["padding_width"] or 1
+        self._left_padding_width = kwargs["left_padding_width"] or None
+        self._right_padding_width = kwargs["right_padding_width"] or None
+
+        self._vertical_char = kwargs["vertical_char"] or self._unicode("|")
+        self._horizontal_char = kwargs["horizontal_char"] or self._unicode("-")
+        self._junction_char = kwargs["junction_char"] or self._unicode("+")
+
+        if kwargs["print_empty"] in (True, False):
+            self._print_empty = kwargs["print_empty"]
+        else:
+            self._print_empty = True
+        self._format = kwargs["format"] or False
+        self._xhtml = kwargs["xhtml"] or False
+        self._attributes = kwargs["attributes"] or {}
+
+    def _unicode(self, value):
+        if not isinstance(value, basestring):
+            value = str(value)
+        if not isinstance(value, unicode):
+            value = unicode(value, self.encoding, "strict")
+        return value
+
+    def _justify(self, text, width, align):
+        excess = width - _str_block_width(text)
+        if align == "l":
+            return text + excess * " "
+        elif align == "r":
+            return excess * " " + text
+        else:
+            if excess % 2:
+                # Uneven padding
+                # Put more space on right if text is of odd length...
+                if _str_block_width(text) % 2:
+                    return (excess//2)*" " + text + (excess//2 + 1)*" "
+                # and more space on left if text is of even length
+                else:
+                    return (excess//2 + 1)*" " + text + (excess//2)*" "
+                # Why distribute extra space this way? To match the behaviour of
+                # the inbuilt str.center() method.
+            else:
+                # Equal padding on either side
+                return (excess//2)*" " + text + (excess//2)*" "
+
+    def __getattr__(self, name):
+
+        if name == "rowcount":
+            return len(self._rows)
+        elif name == "colcount":
+            if self._field_names:
+                return len(self._field_names)
+            elif self._rows:
+                return len(self._rows[0])
+            else:
+                return 0
+        else:
+            raise AttributeError(name)
+
+    def __getitem__(self, index):
+
+        new = PrettyTable()
+        new.field_names = self.field_names
+        for attr in self._options:
+            setattr(new, "_"+attr, getattr(self, "_"+attr))
+        setattr(new, "_align", getattr(self, "_align"))
+        if isinstance(index, slice):
+            for row in self._rows[index]:
+                new.add_row(row)
+        elif isinstance(index, int):
+            new.add_row(self._rows[index])
+        else:
+            raise Exception("Index %s is invalid, must be an integer or slice" % str(index))
+        return new
+
+    if py3k:
+        def __str__(self):
+            return self.__unicode__()
+    else:
+        def __str__(self):
+            return self.__unicode__().encode(self.encoding)
+
+    def __unicode__(self):
+        return self.get_string()
+
+    ##############################
+    # ATTRIBUTE VALIDATORS       #
+    ##############################
+
+    # The method _validate_option is all that should be used elsewhere in the code base to validate options.
+    # It will call the appropriate validation method for that option. The individual validation methods should
+    # never need to be called directly (although nothing bad will happen if they *are*).
+    # Validation happens in TWO places.
+    # Firstly, in the property setters defined in the ATTRIBUTE MANAGEMENT section.
+    # Secondly, in the _get_options method, where keyword arguments are mixed with persistent settings
+
+    def _validate_option(self, option, val):
+        if option in ("field_names"):
+            self._validate_field_names(val)
+        elif option in ("start", "end", "max_width", "padding_width", "left_padding_width", "right_padding_width", "format"):
+            self._validate_nonnegative_int(option, val)
+        elif option in ("sortby"):
+            self._validate_field_name(option, val)
+        elif option in ("sort_key"):
+            self._validate_function(option, val)
+        elif option in ("hrules"):
+            self._validate_hrules(option, val)
+        elif option in ("vrules"):
+            self._validate_vrules(option, val)
+        elif option in ("fields"):
+            self._validate_all_field_names(option, val)
+        elif option in ("header", "border", "reversesort", "xhtml", "print_empty"):
+            self._validate_true_or_false(option, val)
+        elif option in ("header_style"):
+            self._validate_header_style(val)
+        elif option in ("int_format"):
+            self._validate_int_format(option, val)
+        elif option in ("float_format"):
+            self._validate_float_format(option, val)
+        elif option in ("vertical_char", "horizontal_char", "junction_char"):
+            self._validate_single_char(option, val)
+        elif option in ("attributes"):
+            self._validate_attributes(option, val)
+        else:
+            raise Exception("Unrecognised option: %s!" % option)
+
+    def _validate_field_names(self, val):
+        # Check for appropriate length
+        if self._field_names:
+            try:
+                assert len(val) == len(self._field_names)
+            except AssertionError:
+                raise Exception("Field name list has incorrect number of values, (actual) %d!=%d (expected)" % (len(val), len(self._field_names)))
+        if self._rows:
+            try:
+                assert len(val) == len(self._rows[0])
+            except AssertionError:
+                raise Exception("Field name list has incorrect number of values, (actual) %d!=%d (expected)" % (len(val), len(self._rows[0])))
+        # Check for uniqueness
+        try:
+            assert len(val) == len(set(val))
+        except AssertionError:
+            raise Exception("Field names must be unique!")
+
+    def _validate_header_style(self, val):
+        try:
+            assert val in ("cap", "title", "upper", "lower", None)
+        except AssertionError:
+            raise Exception("Invalid header style, use cap, title, upper, lower or None!")
+
+    def _validate_align(self, val):
+        try:
+            assert val in ["l","c","r"]
+        except AssertionError:
+            raise Exception("Alignment %s is invalid, use l, c or r!" % val)
+
+    def _validate_valign(self, val):
+        try:
+            assert val in ["t","m","b",None]
+        except AssertionError:
+            raise Exception("Alignment %s is invalid, use t, m, b or None!" % val)
+
+    def _validate_nonnegative_int(self, name, val):
+        try:
+            assert int(val) >= 0
+        except AssertionError:
+            raise Exception("Invalid value for %s: %s!" % (name, self._unicode(val)))
+
+    def _validate_true_or_false(self, name, val):
+        try:
+            assert val in (True, False)
+        except AssertionError:
+            raise Exception("Invalid value for %s! Must be True or False." % name)
+
+    def _validate_int_format(self, name, val):
+        if val == "":
+            return
+        try:
+            assert type(val) in (str, unicode)
+            assert val.isdigit()
+        except AssertionError:
+            raise Exception("Invalid value for %s! Must be an integer format string." % name)
+
+    def _validate_float_format(self, name, val):
+        if val == "":
+            return
+        try:
+            assert type(val) in (str, unicode)
+            assert "." in val
+            bits = val.split(".")
+            assert len(bits) <= 2
+            assert bits[0] == "" or bits[0].isdigit()
+            assert bits[1] == "" or bits[1].isdigit()
+        except AssertionError:
+            raise Exception("Invalid value for %s! Must be a float format string." % name)
+
+    def _validate_function(self, name, val):
+        try:
+            assert hasattr(val, "__call__")
+        except AssertionError:
+            raise Exception("Invalid value for %s! Must be a function." % name)
+
+    def _validate_hrules(self, name, val):
+        try:
+            assert val in (ALL, FRAME, HEADER, NONE)
+        except AssertionError:
+            raise Exception("Invalid value for %s! Must be ALL, FRAME, HEADER or NONE." % name)
+
+    def _validate_vrules(self, name, val):
+        try:
+            assert val in (ALL, FRAME, NONE)
+        except AssertionError:
+            raise Exception("Invalid value for %s! Must be ALL, FRAME, or NONE." % name)
+
+    def _validate_field_name(self, name, val):
+        try:
+            assert (val in self._field_names) or (val is None)
+        except AssertionError:
+            raise Exception("Invalid field name: %s!" % val)
+
+    def _validate_all_field_names(self, name, val):
+        try:
+            for x in val:
+                self._validate_field_name(name, x)
+        except AssertionError:
+            raise Exception("fields must be a sequence of field names!")
+
+    def _validate_single_char(self, name, val):
+        try:
+            assert _str_block_width(val) == 1
+        except AssertionError:
+            raise Exception("Invalid value for %s! Must be a string of length 1." % name)
+
+    def _validate_attributes(self, name, val):
+        try:
+            assert isinstance(val, dict)
+        except AssertionError:
+            raise Exception("attributes must be a dictionary of name/value pairs!")
+
+    ##############################
+    # ATTRIBUTE MANAGEMENT       #
+    ##############################
+
+    def _get_field_names(self):
+        return self._field_names
+        """The names of the fields
+
+        Arguments:
+
+        fields - list or tuple of field names"""
+    def _set_field_names(self, val):
+        val = [self._unicode(x) for x in val]
+        self._validate_option("field_names", val)
+        if self._field_names:
+            old_names = self._field_names[:]
+        self._field_names = val
+        if self._align and old_names:
+            for old_name, new_name in zip(old_names, val):
+                self._align[new_name] = self._align[old_name]
+            for old_name in old_names:
+                if old_name not in self._align:
+                    self._align.pop(old_name)
+        else:
+            for field in self._field_names:
+                self._align[field] = "c"
+        if self._valign and old_names:
+            for old_name, new_name in zip(old_names, val):
+                self._valign[new_name] = self._valign[old_name]
+            for old_name in old_names:
+                if old_name not in self._valign:
+                    self._valign.pop(old_name)
+        else:
+            for field in self._field_names:
+                self._valign[field] = "t"
+    field_names = property(_get_field_names, _set_field_names)
+
+    def _get_align(self):
+        return self._align
+    def _set_align(self, val):
+        self._validate_align(val)
+        for field in self._field_names:
+            self._align[field] = val
+    align = property(_get_align, _set_align)
+
+    def _get_valign(self):
+        return self._valign
+    def _set_valign(self, val):
+        self._validate_valign(val)
+        for field in self._field_names:
+            self._valign[field] = val
+    valign = property(_get_valign, _set_valign)
+
+    def _get_max_width(self):
+        return self._max_width
+    def _set_max_width(self, val):
+        self._validate_option("max_width", val)
+        for field in self._field_names:
+            self._max_width[field] = val
+    max_width = property(_get_max_width, _set_max_width)
+
+    def _get_fields(self):
+        """List or tuple of field names to include in displays
+
+        Arguments:
+
+        fields - list or tuple of field names to include in displays"""
+        return self._fields
+    def _set_fields(self, val):
+        self._validate_option("fields", val)
+        self._fields = val
+    fields = property(_get_fields, _set_fields)
+
+    def _get_start(self):
+        """Start index of the range of rows to print
+
+        Arguments:
+
+        start - index of first data row to include in output"""
+        return self._start
+
+    def _set_start(self, val):
+        self._validate_option("start", val)
+        self._start = val
+    start = property(_get_start, _set_start)
+
+    def _get_end(self):
+        """End index of the range of rows to print
+
+        Arguments:
+
+        end - index of last data row to include in output PLUS ONE (list slice style)"""
+        return self._end
+    def _set_end(self, val):
+        self._validate_option("end", val)
+        self._end = val
+    end = property(_get_end, _set_end)
+
+    def _get_sortby(self):
+        """Name of field by which to sort rows
+
+        Arguments:
+
+        sortby - field name to sort by"""
+        return self._sortby
+    def _set_sortby(self, val):
+        self._validate_option("sortby", val)
+        self._sortby = val
+    sortby = property(_get_sortby, _set_sortby)
+
+    def _get_reversesort(self):
+        """Controls direction of sorting (ascending vs descending)
+
+        Arguments:
+
+        reversesort - set to True to sort by descending order, or False to sort by ascending order"""
+        return self._reversesort
+    def _set_reversesort(self, val):
+        self._validate_option("reversesort", val)
+        self._reversesort = val
+    reversesort = property(_get_reversesort, _set_reversesort)
+
+    def _get_sort_key(self):
+        """Sorting key function, applied to data points before sorting
+
+        Arguments:
+
+        sort_key - a function which takes one argument and returns something to be sorted"""
+        return self._sort_key
+    def _set_sort_key(self, val):
+        self._validate_option("sort_key", val)
+        self._sort_key = val
+    sort_key = property(_get_sort_key, _set_sort_key)
+
+    def _get_header(self):
+        """Controls printing of table header with field names
+
+        Arguments:
+
+        header - print a header showing field names (True or False)"""
+        return self._header
+    def _set_header(self, val):
+        self._validate_option("header", val)
+        self._header = val
+    header = property(_get_header, _set_header)
+
+    def _get_header_style(self):
+        """Controls stylisation applied to field names in header
+
+        Arguments:
+
+        header_style - stylisation to apply to field names in header ("cap", "title", "upper", "lower" or None)"""
+        return self._header_style
+    def _set_header_style(self, val):
+        self._validate_header_style(val)
+        self._header_style = val
+    header_style = property(_get_header_style, _set_header_style)
+
+    def _get_border(self):
+        """Controls printing of border around table
+
+        Arguments:
+
+        border - print a border around the table (True or False)"""
+        return self._border
+    def _set_border(self, val):
+        self._validate_option("border", val)
+        self._border = val
+    border = property(_get_border, _set_border)
+
+    def _get_hrules(self):
+        """Controls printing of horizontal rules after rows
+
+        Arguments:
+
+        hrules - horizontal rules style. Allowed values: FRAME, ALL, HEADER, NONE"""
+        return self._hrules
+    def _set_hrules(self, val):
+        self._validate_option("hrules", val)
+        self._hrules = val
+    hrules = property(_get_hrules, _set_hrules)
+
+    def _get_vrules(self):
+        """Controls printing of vertical rules between columns
+
+        Arguments:
+
+        vrules - vertical rules style. Allowed values: FRAME, ALL, NONE"""
+        return self._vrules
+    def _set_vrules(self, val):
+        self._validate_option("vrules", val)
+        self._vrules = val
+    vrules = property(_get_vrules, _set_vrules)
+
+    def _get_int_format(self):
+        """Controls formatting of integer data
+        Arguments:
+
+        int_format - integer format string"""
+        return self._int_format
+    def _set_int_format(self, val):
+#        self._validate_option("int_format", val)
+        for field in self._field_names:
+            self._int_format[field] = val
+    int_format = property(_get_int_format, _set_int_format)
+
+    def _get_float_format(self):
+        """Controls formatting of floating point data
+        Arguments:
+
+        float_format - floating point format string"""
+        return self._float_format
+    def _set_float_format(self, val):
+#        self._validate_option("float_format", val)
+        for field in self._field_names:
+            self._float_format[field] = val
+    float_format = property(_get_float_format, _set_float_format)
+
+    def _get_padding_width(self):
+        """The number of empty spaces between a column's edge and its content
+
+        Arguments:
+
+        padding_width - number of spaces, must be a positive integer"""
+        return self._padding_width
+    def _set_padding_width(self, val):
+        self._validate_option("padding_width", val)
+        self._padding_width = val
+    padding_width = property(_get_padding_width, _set_padding_width)
+
+    def _get_left_padding_width(self):
+        """The number of empty spaces between a column's left edge and its content
+
+        Arguments:
+
+        left_padding - number of spaces, must be a positive integer"""
+        return self._left_padding_width
+    def _set_left_padding_width(self, val):
+        self._validate_option("left_padding_width", val)
+        self._left_padding_width = val
+    left_padding_width = property(_get_left_padding_width, _set_left_padding_width)
+
+    def _get_right_padding_width(self):
+        """The number of empty spaces between a column's right edge and its content
+
+        Arguments:
+
+        right_padding - number of spaces, must be a positive integer"""
+        return self._right_padding_width
+    def _set_right_padding_width(self, val):
+        self._validate_option("right_padding_width", val)
+        self._right_padding_width = val
+    right_padding_width = property(_get_right_padding_width, _set_right_padding_width)
+
+    def _get_vertical_char(self):
+        """The character used when printing table borders to draw vertical lines
+
+        Arguments:
+
+        vertical_char - single character string used to draw vertical lines"""
+        return self._vertical_char
+    def _set_vertical_char(self, val):
+        val = self._unicode(val)
+        self._validate_option("vertical_char", val)
+        self._vertical_char = val
+    vertical_char = property(_get_vertical_char, _set_vertical_char)
+
+    def _get_horizontal_char(self):
+        """The character used when printing table borders to draw horizontal lines
+
+        Arguments:
+
+        horizontal_char - single character string used to draw horizontal lines"""
+        return self._horizontal_char
+    def _set_horizontal_char(self, val):
+        val = self._unicode(val)
+        self._validate_option("horizontal_char", val)
+        self._horizontal_char = val
+    horizontal_char = property(_get_horizontal_char, _set_horizontal_char)
+
+    def _get_junction_char(self):
+        """The character used when printing table borders to draw line junctions
+
+        Arguments:
+
+        junction_char - single character string used to draw line junctions"""
+        return self._junction_char
+    def _set_junction_char(self, val):
+        val = self._unicode(val)
+        self._validate_option("junction_char", val)
+        self._junction_char = val
+    junction_char = property(_get_junction_char, _set_junction_char)
+
+    def _get_format(self):
+        """Controls whether or not HTML tables are formatted to match styling options
+
+        Arguments:
+
+        format - True or False"""
+        return self._format
+    def _set_format(self, val):
+        self._validate_option("format", val)
+        self._format = val
+    format = property(_get_format, _set_format)
+
+    def _get_print_empty(self):
+        """Controls whether or not empty tables produce a header and frame or just an empty string
+
+        Arguments:
+
+        print_empty - True or False"""
+        return self._print_empty
+    def _set_print_empty(self, val):
+        self._validate_option("print_empty", val)
+        self._print_empty = val
+    print_empty = property(_get_print_empty, _set_print_empty)
+
+    def _get_attributes(self):
+        """A dictionary of HTML attribute name/value pairs to be included in the <table> tag when printing HTML
+
+        Arguments:
+
+        attributes - dictionary of attributes"""
+        return self._attributes
+    def _set_attributes(self, val):
+        self._validate_option("attributes", val)
+        self._attributes = val
+    attributes = property(_get_attributes, _set_attributes)
+
+    ##############################
+    # OPTION MIXER               #
+    ##############################
+
+    def _get_options(self, kwargs):
+
+        options = {}
+        for option in self._options:
+            if option in kwargs:
+                self._validate_option(option, kwargs[option])
+                options[option] = kwargs[option]
+            else:
+                options[option] = getattr(self, "_"+option)
+        return options
+
+    ##############################
+    # PRESET STYLE LOGIC         #
+    ##############################
+
+    def set_style(self, style):
+
+        if style == DEFAULT:
+            self._set_default_style()
+        elif style == MSWORD_FRIENDLY:
+            self._set_msword_style()
+        elif style == PLAIN_COLUMNS:
+            self._set_columns_style()
+        elif style == RANDOM:
+            self._set_random_style()
+        else:
+            raise Exception("Invalid pre-set style!")
+
+    def _set_default_style(self):
+
+        self.header = True
+        self.border = True
+        self._hrules = FRAME
+        self._vrules = ALL
+        self.padding_width = 1
+        self.left_padding_width = 1
+        self.right_padding_width = 1
+        self.vertical_char = "|"
+        self.horizontal_char = "-"
+        self.junction_char = "+"
+
+    def _set_msword_style(self):
+
+        self.header = True
+        self.border = True
+        self._hrules = NONE
+        self.padding_width = 1
+        self.left_padding_width = 1
+        self.right_padding_width = 1
+        self.vertical_char = "|"
+
+    def _set_columns_style(self):
+
+        self.header = True
+        self.border = False
+        self.padding_width = 1
+        self.left_padding_width = 0
+        self.right_padding_width = 8
+
+    def _set_random_style(self):
+
+        # Just for fun!
+        self.header = random.choice((True, False))
+        self.border = random.choice((True, False))
+        self._hrules = random.choice((ALL, FRAME, HEADER, NONE))
+        self._vrules = random.choice((ALL, FRAME, NONE))
+        self.left_padding_width = random.randint(0,5)
+        self.right_padding_width = random.randint(0,5)
+        self.vertical_char = random.choice("~!@#$%^&*()_+|-=\{}[];':\",./;<>?")
+        self.horizontal_char = random.choice("~!@#$%^&*()_+|-=\{}[];':\",./;<>?")
+        self.junction_char = random.choice("~!@#$%^&*()_+|-=\{}[];':\",./;<>?")
+
+    ##############################
+    # DATA INPUT METHODS         #
+    ##############################
+
+    def add_row(self, row):
+
+        """Add a row to the table
+
+        Arguments:
+
+        row - row of data, should be a list with as many elements as the table
+        has fields"""
+
+        if self._field_names and len(row) != len(self._field_names):
+            raise Exception("Row has incorrect number of values, (actual) %d!=%d (expected)" %(len(row),len(self._field_names)))
+        if not self._field_names:
+            self.field_names = [("Field %d" % (n+1)) for n in range(0,len(row))]
+        self._rows.append(list(row))
+
+    def del_row(self, row_index):
+
+        """Delete a row from the table
+
+        Arguments:
+
+        row_index - The index of the row you want to delete. Indexing starts at 0."""
+
+        if row_index > len(self._rows)-1:
+            raise Exception("Can't delete row at index %d, table only has %d rows!" % (row_index, len(self._rows)))
+        del self._rows[row_index]
+
+    def add_column(self, fieldname, column, align="c", valign="t"):
+
+        """Add a column to the table.
+ + Arguments: + + fieldname - name of the field to contain the new column of data + column - column of data, should be a list with as many elements as the + table has rows + align - desired alignment for this column - "l" for left, "c" for centre and "r" for right + valign - desired vertical alignment for new columns - "t" for top, "m" for middle and "b" for bottom""" + + if len(self._rows) in (0, len(column)): + self._validate_align(align) + self._validate_valign(valign) + self._field_names.append(fieldname) + self._align[fieldname] = align + self._valign[fieldname] = valign + for i in range(0, len(column)): + if len(self._rows) < i+1: + self._rows.append([]) + self._rows[i].append(column[i]) + else: + raise Exception("Column length %d does not match number of rows %d!" % (len(column), len(self._rows))) + + def clear_rows(self): + + """Delete all rows from the table but keep the current field names""" + + self._rows = [] + + def clear(self): + + """Delete all rows and field names from the table, maintaining nothing but styling options""" + + self._rows = [] + self._field_names = [] + self._widths = [] + + ############################## + # MISC PUBLIC METHODS # + ############################## + + def copy(self): + return copy.deepcopy(self) + + ############################## + # MISC PRIVATE METHODS # + ############################## + + def _format_value(self, field, value): + if isinstance(value, int) and field in self._int_format: + value = self._unicode(("%%%sd" % self._int_format[field]) % value) + elif isinstance(value, float) and field in self._float_format: + value = self._unicode(("%%%sf" % self._float_format[field]) % value) + return self._unicode(value) + + def _compute_widths(self, rows, options): + if options["header"]: + widths = [_get_size(field)[0] for field in self._field_names] + else: + widths = len(self.field_names) * [0] + for row in rows: + for index, value in enumerate(row): + fieldname = self.field_names[index] + if fieldname in 
self.max_width: + widths[index] = max(widths[index], min(_get_size(value)[0], self.max_width[fieldname])) + else: + widths[index] = max(widths[index], _get_size(value)[0]) + self._widths = widths + + def _get_padding_widths(self, options): + + if options["left_padding_width"] is not None: + lpad = options["left_padding_width"] + else: + lpad = options["padding_width"] + if options["right_padding_width"] is not None: + rpad = options["right_padding_width"] + else: + rpad = options["padding_width"] + return lpad, rpad + + def _get_rows(self, options): + """Return only those data rows that should be printed, based on slicing and sorting. + + Arguments: + + options - dictionary of option settings.""" + + # Make a copy of only those rows in the slice range + rows = copy.deepcopy(self._rows[options["start"]:options["end"]]) + # Sort if necessary + if options["sortby"]: + sortindex = self._field_names.index(options["sortby"]) + # Decorate + rows = [[row[sortindex]]+row for row in rows] + # Sort + rows.sort(reverse=options["reversesort"], key=options["sort_key"]) + # Undecorate + rows = [row[1:] for row in rows] + return rows + + def _format_row(self, row, options): + return [self._format_value(field, value) for (field, value) in zip(self._field_names, row)] + + def _format_rows(self, rows, options): + return [self._format_row(row, options) for row in rows] + + ############################## + # PLAIN TEXT STRING METHODS # + ############################## + + def get_string(self, **kwargs): + + """Return string representation of table in current state. + + Arguments: + + start - index of first data row to include in output + end - index of last data row to include in output PLUS ONE (list slice style) + fields - names of fields (columns) to include + header - print a header showing field names (True or False) + border - print a border around the table (True or False) + hrules - controls printing of horizontal rules after rows. 
Allowed values: ALL, FRAME, HEADER, NONE
+        vrules - controls printing of vertical rules between columns. Allowed values: FRAME, ALL, NONE
+        int_format - controls formatting of integer data
+        float_format - controls formatting of floating point data
+        padding_width - number of spaces on either side of column data (only used if left and right paddings are None)
+        left_padding_width - number of spaces on left hand side of column data
+        right_padding_width - number of spaces on right hand side of column data
+        vertical_char - single character string used to draw vertical lines
+        horizontal_char - single character string used to draw horizontal lines
+        junction_char - single character string used to draw line junctions
+        sortby - name of field to sort rows by
+        sort_key - sorting key function, applied to data points before sorting
+        reversesort - True or False to sort in descending or ascending order
+        print_empty - if True, stringify just the header for an empty table, if False return an empty string"""
+
+        options = self._get_options(kwargs)
+
+        lines = []
+
+        # Don't think too hard about an empty table
+        # Is this the desired behaviour?  Maybe we should still print the header?
+        if self.rowcount == 0 and (not options["print_empty"] or not options["border"]):
+            return ""
+
+        # Get the rows we need to print, taking into account slicing, sorting, etc.
+ rows = self._get_rows(options) + + # Turn all data in all rows into Unicode, formatted as desired + formatted_rows = self._format_rows(rows, options) + + # Compute column widths + self._compute_widths(formatted_rows, options) + + # Add header or top of border + self._hrule = self._stringify_hrule(options) + if options["header"]: + lines.append(self._stringify_header(options)) + elif options["border"] and options["hrules"] in (ALL, FRAME): + lines.append(self._hrule) + + # Add rows + for row in formatted_rows: + lines.append(self._stringify_row(row, options)) + + # Add bottom of border + if options["border"] and options["hrules"] == FRAME: + lines.append(self._hrule) + + return self._unicode("\n").join(lines) + + def _stringify_hrule(self, options): + + if not options["border"]: + return "" + lpad, rpad = self._get_padding_widths(options) + if options['vrules'] in (ALL, FRAME): + bits = [options["junction_char"]] + else: + bits = [options["horizontal_char"]] + # For tables with no data or fieldnames + if not self._field_names: + bits.append(options["junction_char"]) + return "".join(bits) + for field, width in zip(self._field_names, self._widths): + if options["fields"] and field not in options["fields"]: + continue + bits.append((width+lpad+rpad)*options["horizontal_char"]) + if options['vrules'] == ALL: + bits.append(options["junction_char"]) + else: + bits.append(options["horizontal_char"]) + if options["vrules"] == FRAME: + bits.pop() + bits.append(options["junction_char"]) + return "".join(bits) + + def _stringify_header(self, options): + + bits = [] + lpad, rpad = self._get_padding_widths(options) + if options["border"]: + if options["hrules"] in (ALL, FRAME): + bits.append(self._hrule) + bits.append("\n") + if options["vrules"] in (ALL, FRAME): + bits.append(options["vertical_char"]) + else: + bits.append(" ") + # For tables with no data or field names + if not self._field_names: + if options["vrules"] in (ALL, FRAME): + 
bits.append(options["vertical_char"]) + else: + bits.append(" ") + for field, width, in zip(self._field_names, self._widths): + if options["fields"] and field not in options["fields"]: + continue + if self._header_style == "cap": + fieldname = field.capitalize() + elif self._header_style == "title": + fieldname = field.title() + elif self._header_style == "upper": + fieldname = field.upper() + elif self._header_style == "lower": + fieldname = field.lower() + else: + fieldname = field + bits.append(" " * lpad + self._justify(fieldname, width, self._align[field]) + " " * rpad) + if options["border"]: + if options["vrules"] == ALL: + bits.append(options["vertical_char"]) + else: + bits.append(" ") + # If vrules is FRAME, then we just appended a space at the end + # of the last field, when we really want a vertical character + if options["border"] and options["vrules"] == FRAME: + bits.pop() + bits.append(options["vertical_char"]) + if options["border"] and options["hrules"] != NONE: + bits.append("\n") + bits.append(self._hrule) + return "".join(bits) + + def _stringify_row(self, row, options): + + for index, field, value, width, in zip(range(0,len(row)), self._field_names, row, self._widths): + # Enforce max widths + lines = value.split("\n") + new_lines = [] + for line in lines: + if _str_block_width(line) > width: + line = textwrap.fill(line, width) + new_lines.append(line) + lines = new_lines + value = "\n".join(lines) + row[index] = value + + row_height = 0 + for c in row: + h = _get_size(c)[1] + if h > row_height: + row_height = h + + bits = [] + lpad, rpad = self._get_padding_widths(options) + for y in range(0, row_height): + bits.append([]) + if options["border"]: + if options["vrules"] in (ALL, FRAME): + bits[y].append(self.vertical_char) + else: + bits[y].append(" ") + + for field, value, width, in zip(self._field_names, row, self._widths): + + valign = self._valign[field] + lines = value.split("\n") + dHeight = row_height - len(lines) + if dHeight: + if 
valign == "m": + lines = [""] * int(dHeight / 2) + lines + [""] * (dHeight - int(dHeight / 2)) + elif valign == "b": + lines = [""] * dHeight + lines + else: + lines = lines + [""] * dHeight + + y = 0 + for l in lines: + if options["fields"] and field not in options["fields"]: + continue + + bits[y].append(" " * lpad + self._justify(l, width, self._align[field]) + " " * rpad) + if options["border"]: + if options["vrules"] == ALL: + bits[y].append(self.vertical_char) + else: + bits[y].append(" ") + y += 1 + + # If vrules is FRAME, then we just appended a space at the end + # of the last field, when we really want a vertical character + for y in range(0, row_height): + if options["border"] and options["vrules"] == FRAME: + bits[y].pop() + bits[y].append(options["vertical_char"]) + + if options["border"] and options["hrules"]== ALL: + bits[row_height-1].append("\n") + bits[row_height-1].append(self._hrule) + + for y in range(0, row_height): + bits[y] = "".join(bits[y]) + + return "\n".join(bits) + + ############################## + # HTML STRING METHODS # + ############################## + + def get_html_string(self, **kwargs): + + """Return string representation of HTML formatted version of table in current state. + + Arguments: + + start - index of first data row to include in output + end - index of last data row to include in output PLUS ONE (list slice style) + fields - names of fields (columns) to include + header - print a header showing field names (True or False) + border - print a border around the table (True or False) + hrules - controls printing of horizontal rules after rows. Allowed values: ALL, FRAME, HEADER, NONE + vrules - controls printing of vertical rules between columns. 
Allowed values: FRAME, ALL, NONE
+        int_format - controls formatting of integer data
+        float_format - controls formatting of floating point data
+        padding_width - number of spaces on either side of column data (only used if left and right paddings are None)
+        left_padding_width - number of spaces on left hand side of column data
+        right_padding_width - number of spaces on right hand side of column data
+        sortby - name of field to sort rows by
+        sort_key - sorting key function, applied to data points before sorting
+        attributes - dictionary of name/value pairs to include as HTML attributes in the <table> tag
+        xhtml - print <br/> tags if True, <br> tags if false"""
+
+        options = self._get_options(kwargs)
+
+        if options["format"]:
+            string = self._get_formatted_html_string(options)
+        else:
+            string = self._get_simple_html_string(options)
+
+        return string
+
+    def _get_simple_html_string(self, options):
+
+        lines = []
+        if options["xhtml"]:
+            linebreak = "<br/>"
+        else:
+            linebreak = "<br>"
+
+        open_tag = []
+        open_tag.append("<table")
+        if options["attributes"]:
+            for attr_name in options["attributes"]:
+                open_tag.append(" %s=\"%s\"" % (attr_name, options["attributes"][attr_name]))
+        open_tag.append(">")
+        lines.append("".join(open_tag))
+
+        # Headers
+        if options["header"]:
+            lines.append("    <tr>")
+            for field in self._field_names:
+                if options["fields"] and field not in options["fields"]:
+                    continue
+                lines.append("        <th>%s</th>" % escape(field).replace("\n", linebreak))
+            lines.append("    </tr>")
+
+        # Data
+        rows = self._get_rows(options)
+        formatted_rows = self._format_rows(rows, options)
+        for row in formatted_rows:
+            lines.append("    <tr>")
+            for field, datum in zip(self._field_names, row):
+                if options["fields"] and field not in options["fields"]:
+                    continue
+                lines.append("        <td>%s</td>" % escape(datum).replace("\n", linebreak))
+            lines.append("    </tr>")
+
+        lines.append("</table>")
+
+        return self._unicode("\n").join(lines)
+
+    def _get_formatted_html_string(self, options):
+
+        lines = []
+        lpad, rpad = self._get_padding_widths(options)
+        if options["xhtml"]:
+            linebreak = "<br/>"
+        else:
+            linebreak = "<br>"
+
+        open_tag = []
+        open_tag.append("<table")
+        if options["border"]:
+            if options["hrules"] == ALL and options["vrules"] == ALL:
+                open_tag.append(" frame=\"box\" rules=\"all\"")
+            elif options["hrules"] == FRAME and options["vrules"] == FRAME:
+                open_tag.append(" frame=\"box\"")
+            elif options["hrules"] == FRAME:
+                open_tag.append(" frame=\"hsides\"")
+            elif options["hrules"] == ALL:
+                open_tag.append(" frame=\"hsides\" rules=\"rows\"")
+            elif options["vrules"] == FRAME:
+                open_tag.append(" frame=\"vsides\"")
+            elif options["vrules"] == ALL:
+                open_tag.append(" frame=\"vsides\" rules=\"cols\"")
+        if options["attributes"]:
+            for attr_name in options["attributes"]:
+                open_tag.append(" %s=\"%s\"" % (attr_name, options["attributes"][attr_name]))
+        open_tag.append(">")
+        lines.append("".join(open_tag))
+
+        # Headers
+        if options["header"]:
+            lines.append("    <tr>")
+            for field in self._field_names:
+                if options["fields"] and field not in options["fields"]:
+                    continue
+                lines.append("        <th style=\"padding-left: %dem; padding-right: %dem; text-align: center\">%s</th>" % (lpad, rpad, escape(field).replace("\n", linebreak)))
+            lines.append("    </tr>")
+
+        # Data
+        rows = self._get_rows(options)
+        formatted_rows = self._format_rows(rows, options)
+        aligns = []
+        valigns = []
+        for field in self._field_names:
+            aligns.append({ "l" : "left", "r" : "right", "c" : "center" }[self._align[field]])
+            valigns.append({"t" : "top", "m" : "middle", "b" : "bottom"}[self._valign[field]])
+        for row in formatted_rows:
+            lines.append("    <tr>")
+            for field, datum, align, valign in zip(self._field_names, row, aligns, valigns):
+                if options["fields"] and field not in options["fields"]:
+                    continue
+                lines.append("        <td style=\"padding-left: %dem; padding-right: %dem; text-align: %s; vertical-align: %s\">%s</td>" % (lpad, rpad, align, valign, escape(datum).replace("\n", linebreak)))
+            lines.append("    </tr>")
+        lines.append("</table>")
+
+        return self._unicode("\n").join(lines)
+
+##############################
+# UNICODE WIDTH FUNCTIONS    #
+##############################
+
+def _char_block_width(char):
+    # Basic Latin, which is probably the most common case
+    #if char in xrange(0x0021, 0x007e):
+    #if char >= 0x0021 and char <= 0x007e:
+    if 0x0021 <= char <= 0x007e:
+        return 1
+    # Chinese, Japanese, Korean (common)
+    if 0x4e00 <= char <= 0x9fff:
+        return 2
+    # Hangul
+    if 0xac00 <= char <= 0xd7af:
+        return 2
+    # Combining?
+ if unicodedata.combining(uni_chr(char)): + return 0 + # Hiragana and Katakana + if 0x3040 <= char <= 0x309f or 0x30a0 <= char <= 0x30ff: + return 2 + # Full-width Latin characters + if 0xff01 <= char <= 0xff60: + return 2 + # CJK punctuation + if 0x3000 <= char <= 0x303e: + return 2 + # Backspace and delete + if char in (0x0008, 0x007f): + return -1 + # Other control characters + elif char in (0x0000, 0x001f): + return 0 + # Take a guess + return 1 + +def _str_block_width(val): + + return sum(itermap(_char_block_width, itermap(ord, _re.sub("", val)))) + +############################## +# TABLE FACTORIES # +############################## + +def from_csv(fp, field_names = None, **kwargs): + + dialect = csv.Sniffer().sniff(fp.read(1024)) + fp.seek(0) + reader = csv.reader(fp, dialect) + + table = PrettyTable(**kwargs) + if field_names: + table.field_names = field_names + else: + if py3k: + table.field_names = [x.strip() for x in next(reader)] + else: + table.field_names = [x.strip() for x in reader.next()] + + for row in reader: + table.add_row([x.strip() for x in row]) + + return table + +def from_db_cursor(cursor, **kwargs): + + if cursor.description: + table = PrettyTable(**kwargs) + table.field_names = [col[0] for col in cursor.description] + for row in cursor.fetchall(): + table.add_row(row) + return table + +class TableHandler(HTMLParser): + + def __init__(self, **kwargs): + HTMLParser.__init__(self) + self.kwargs = kwargs + self.tables = [] + self.last_row = [] + self.rows = [] + self.max_row_width = 0 + self.active = None + self.last_content = "" + self.is_last_row_header = False + + def handle_starttag(self,tag, attrs): + self.active = tag + if tag == "th": + self.is_last_row_header = True + + def handle_endtag(self,tag): + if tag in ["th", "td"]: + stripped_content = self.last_content.strip() + self.last_row.append(stripped_content) + if tag == "tr": + self.rows.append( + (self.last_row, self.is_last_row_header)) + self.max_row_width = 
max(self.max_row_width, len(self.last_row))
+            self.last_row = []
+            self.is_last_row_header = False
+        if tag == "table":
+            table = self.generate_table(self.rows)
+            self.tables.append(table)
+            self.rows = []
+        self.last_content = " "
+        self.active = None
+
+    def handle_data(self, data):
+        self.last_content += data
+
+    def generate_table(self, rows):
+        """
+        Generates from a list of rows a PrettyTable object.
+        """
+        table = PrettyTable(**self.kwargs)
+        for row in self.rows:
+            if len(row[0]) < self.max_row_width:
+                appends = self.max_row_width - len(row[0])
+                for i in range(1,appends):
+                    row[0].append("-")
+
+            if row[1] == True:
+                self.make_fields_unique(row[0])
+                table.field_names = row[0]
+            else:
+                table.add_row(row[0])
+        return table
+
+    def make_fields_unique(self, fields):
+        """
+        iterates over the row and make each field unique
+        """
+        for i in range(0, len(fields)):
+            for j in range(i+1, len(fields)):
+                if fields[i] == fields[j]:
+                    fields[j] += "'"
+
+def from_html(html_code, **kwargs):
+    """
+    Generates a list of PrettyTables from a string of HTML code. Each <table> in
+    the HTML becomes one PrettyTable object.
+    """
+
+    parser = TableHandler(**kwargs)
+    parser.feed(html_code)
+    return parser.tables
+
+def from_html_one(html_code, **kwargs):
+    """
+    Generates a PrettyTable from a string of HTML code which contains only a
+    single <table>
+    """
+
+    tables = from_html(html_code, **kwargs)
+    try:
+        assert len(tables) == 1
+    except AssertionError:
+        raise Exception("More than one <table> in provided HTML code! Use from_html instead.")
+    return tables[0]
+
+##############################
+# MAIN (TEST FUNCTION)       #
+##############################
+
+def main():
+
+    x = PrettyTable(["City name", "Area", "Population", "Annual Rainfall"])
+    x.sortby = "Population"
+    x.reversesort = True
+    x.int_format["Area"] = "04d"
+    x.float_format = "6.1f"
+    x.align["City name"] = "l" # Left align city names
+    x.add_row(["Adelaide", 1295, 1158259, 600.5])
+    x.add_row(["Brisbane", 5905, 1857594, 1146.4])
+    x.add_row(["Darwin", 112, 120900, 1714.7])
+    x.add_row(["Hobart", 1357, 205556, 619.5])
+    x.add_row(["Sydney", 2058, 4336374, 1214.8])
+    x.add_row(["Melbourne", 1566, 3806092, 646.9])
+    x.add_row(["Perth", 5386, 1554769, 869.4])
+    print(x)
+
+if __name__ == "__main__":
+    main()
diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/README remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/README
--- remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/README	1970-01-01 00:00:00.000000000 +0000
+++ remnux-oletools-0.51a/remnux-oletools/thirdparty/prettytable/README	2016-11-04 21:28:21.000000000 +0000
@@ -0,0 +1,498 @@
+TUTORIAL ON HOW TO USE THE PRETTYTABLE 0.6+ API
+
+*** This tutorial is distributed with PrettyTable and is meant to serve
+as a "quick start" guide for the lazy or impatient. It is not an
+exhaustive description of the whole API, and it is not guaranteed to be
+100% up to date. For more complete and up-to-date documentation, check the
+PrettyTable wiki at http://code.google.com/p/prettytable/w/list ***
+
+= Getting your data into (and out of) the table =
+
+Let's suppose you have a shiny new PrettyTable:
+
+from prettytable import PrettyTable
+x = PrettyTable()
+
+and you want to put some data into it. You have a few options.
+
+== Row by row ==
+
+You can add data one row at a time.
To do this you can set the field names
+first using the `field_names` attribute, and then add the rows one at a time
+using the `add_row` method:
+
+x.field_names = ["City name", "Area", "Population", "Annual Rainfall"]
+x.add_row(["Adelaide",1295, 1158259, 600.5])
+x.add_row(["Brisbane",5905, 1857594, 1146.4])
+x.add_row(["Darwin", 112, 120900, 1714.7])
+x.add_row(["Hobart", 1357, 205556, 619.5])
+x.add_row(["Sydney", 2058, 4336374, 1214.8])
+x.add_row(["Melbourne", 1566, 3806092, 646.9])
+x.add_row(["Perth", 5386, 1554769, 869.4])
+
+== Column by column ==
+
+You can add data one column at a time as well. To do this you use the
+`add_column` method, which takes two arguments - a string which is the name for
+the field the column you are adding corresponds to, and a list or tuple which
+contains the column data:
+
+x.add_column("City name",
+["Adelaide","Brisbane","Darwin","Hobart","Sydney","Melbourne","Perth"])
+x.add_column("Area", [1295, 5905, 112, 1357, 2058, 1566, 5386])
+x.add_column("Population", [1158259, 1857594, 120900, 205556, 4336374, 3806092,
+1554769])
+x.add_column("Annual Rainfall",[600.5, 1146.4, 1714.7, 619.5, 1214.8, 646.9,
+869.4])
+
+== Mixing and matching ==
+
+If you really want to, you can even mix and match `add_row` and `add_column`
+and build some of your table in one way and some of it in the other. There's a
+unit test which makes sure that doing things this way will always work out
+nicely as if you'd done it using just one of the two approaches. Tables built
+this way are kind of confusing for other people to read, though, so don't do
+this unless you have a good reason.
+
+== Importing data from a CSV file ==
+
+If you have your table data in a comma separated values file (.csv), you can
+read this data into a PrettyTable like this:
+
+from prettytable import from_csv
+fp = open("myfile.csv", "r")
+mytable = from_csv(fp)
+fp.close()
+
+== Importing data from a HTML string ==
+
+If you have a string containing a HTML <table>, you can read this data into a
+PrettyTable like this:
+
+from prettytable import from_html
+mytable = from_html(html_string)
+
+== Importing data from a database cursor ==
+
+If you have your table data in a database which you can access using a library
+which conforms to the Python DB-API (e.g. an SQLite database accessible using
+the sqlite module), then you can build a PrettyTable using a cursor object,
+like this:
+
+import sqlite3
+from prettytable import from_db_cursor
+
+connection = sqlite3.connect("mydb.db")
+cursor = connection.cursor()
+cursor.execute("SELECT field1, field2, field3 FROM my_table")
+mytable = from_db_cursor(cursor)
+
+== Getting data out ==
+
+There are three ways to get data out of a PrettyTable, in increasing order of
+completeness:
+
+ * The `del_row` method takes an integer index of a single row to delete.
+ * The `clear_rows` method takes no arguments and deletes all the rows in the
+table - but keeps the field names as they were so that you can repopulate
+it with the same kind of data.
+ * The `clear` method takes no arguments and deletes all rows and all field
+names. It's not quite the same as creating a fresh table instance, though -
+style related settings, discussed later, are maintained.
+
+= Displaying your table in ASCII form =
+
+PrettyTable's main goal is to let you print tables in an attractive ASCII form,
+like this:
+
++-----------+------+------------+-----------------+
+| City name | Area | Population | Annual Rainfall |
++-----------+------+------------+-----------------+
+| Adelaide  | 1295 |  1158259   |      600.5      |
+| Brisbane  | 5905 |  1857594   |     1146.4      |
+|  Darwin   | 112  |   120900   |     1714.7      |
+|  Hobart   | 1357 |   205556   |      619.5      |
+| Melbourne | 1566 |  3806092   |      646.9      |
+|   Perth   | 5386 |  1554769   |      869.4      |
+|  Sydney   | 2058 |  4336374   |     1214.8      |
++-----------+------+------------+-----------------+
+
+You can print tables like this to `stdout` or get string representations of
+them.
+
+== Printing ==
+
+To print a table in ASCII form, you can just do this:
+
+print x
+
+in Python 2.x or:
+
+print(x)
+
+in Python 3.x.
+
+The old x.printt() method from versions 0.5 and earlier has been removed.
+
+To pass options changing the look of the table, use the get_string() method
+documented below:
+
+print x.get_string()
+
+== Stringing ==
+
+If you don't want to actually print your table in ASCII form but just get a
+string containing what _would_ be printed if you use "print x", you can use
+the `get_string` method:
+
+mystring = x.get_string()
+
+This string is guaranteed to look exactly the same as what would be printed by
+doing "print x". You can now do all the usual things you can do with a
+string, like write your table to a file or insert it into a GUI.
+
+== Controlling which data gets displayed ==
+
+If you like, you can restrict the output of `print x` or `x.get_string` to
+only the fields or rows you like.
+
+The `fields` argument to these methods takes a list of field names to be
+printed:
+
+print x.get_string(fields=["City name", "Population"])
+
+gives:
+
++-----------+------------+
+| City name | Population |
++-----------+------------+
+| Adelaide  |  1158259   |
+| Brisbane  |  1857594   |
+|  Darwin   |   120900   |
+|  Hobart   |   205556   |
+| Melbourne |  3806092   |
+|   Perth   |  1554769   |
+|  Sydney   |  4336374   |
++-----------+------------+
+
+The `start` and `end` arguments take the index of the first and last row to
+print respectively. Note that the indexing works like Python list slicing - to
+print the 2nd, 3rd and 4th rows of the table, set `start` to 1 (the first row
+is row 0, so the second is row 1) and set `end` to 4 (the index of the 4th row,
+plus 1):
+
+print x.get_string(start=1,end=4)
+
+prints:
+
++-----------+------+------------+-----------------+
+| City name | Area | Population | Annual Rainfall |
++-----------+------+------------+-----------------+
+| Brisbane  | 5905 |  1857594   |     1146.4      |
+|  Darwin   | 112  |   120900   |     1714.7      |
+|  Hobart   | 1357 |   205556   |      619.5      |
++-----------+------+------------+-----------------+
+
+== Changing the alignment of columns ==
+
+By default, all columns in a table are centre aligned.
+
+=== All columns at once ===
+
+You can change the alignment of all the columns in a table at once by assigning
+a one character string to the `align` attribute. The allowed strings are "l",
+"r" and "c" for left, right and centre alignment, respectively:
+
+x.align = "r"
+print x
+
+gives:
+
++-----------+------+------------+-----------------+
+| City name | Area | Population | Annual Rainfall |
++-----------+------+------------+-----------------+
+|  Adelaide | 1295 |    1158259 |           600.5 |
+|  Brisbane | 5905 |    1857594 |          1146.4 |
+|    Darwin |  112 |     120900 |          1714.7 |
+|    Hobart | 1357 |     205556 |           619.5 |
+| Melbourne | 1566 |    3806092 |           646.9 |
+|     Perth | 5386 |    1554769 |           869.4 |
+|    Sydney | 2058 |    4336374 |          1214.8 |
++-----------+------+------------+-----------------+
+
+=== One column at a time ===
+
+You can also change the alignment of individual columns based on the
+corresponding field name by treating the `align` attribute as if it were a
+dictionary.
+
+x.align["City name"] = "l"
+x.align["Area"] = "c"
+x.align["Population"] = "r"
+x.align["Annual Rainfall"] = "c"
+print x
+
+gives:
+
++-----------+------+------------+-----------------+
+| City name | Area | Population | Annual Rainfall |
++-----------+------+------------+-----------------+
+| Adelaide  | 1295 |    1158259 |      600.5      |
+| Brisbane  | 5905 |    1857594 |     1146.4      |
+| Darwin    | 112  |     120900 |     1714.7      |
+| Hobart    | 1357 |     205556 |      619.5      |
+| Melbourne | 1566 |    3806092 |      646.9      |
+| Perth     | 5386 |    1554769 |      869.4      |
+| Sydney    | 2058 |    4336374 |     1214.8      |
++-----------+------+------------+-----------------+
+
+== Sorting your table by a field ==
+
+You can make sure that your ASCII tables are produced with the data sorted by
+one particular field by giving `get_string` a `sortby` keyword argument, which
+must be a string containing the name of one field.
+
+For example, to print the example table we built earlier of Australian capital
+city data, so that the most populated city comes last, we can do this:
+
+print x.get_string(sortby="Population")
+
+to get
+
++-----------+------+------------+-----------------+
+| City name | Area | Population | Annual Rainfall |
++-----------+------+------------+-----------------+
+| Darwin    | 112  |     120900 |     1714.7      |
+| Hobart    | 1357 |     205556 |      619.5      |
+| Adelaide  | 1295 |    1158259 |      600.5      |
+| Perth     | 5386 |    1554769 |      869.4      |
+| Brisbane  | 5905 |    1857594 |     1146.4      |
+| Melbourne | 1566 |    3806092 |      646.9      |
+| Sydney    | 2058 |    4336374 |     1214.8      |
++-----------+------+------------+-----------------+
+
+If we want the most populated city to come _first_, we can also give a
+`reversesort=True` argument.
+
+If you _always_ want your tables to be sorted in a certain way, you can make
+the setting long term like this:
+
+x.sortby = "Population"
+print x
+print x
+print x
+
+All three tables printed by this code will be sorted by population (you could
+do `x.reversesort = True` as well, if you wanted).
The behaviour will persist
+until you turn it off:
+
+x.sortby = None
+
+If you want to specify a custom sorting function, you can use the `sort_key`
+keyword argument. Pass this a function which accepts a single list of values
+(one per row) and returns a value for the row to be sorted by. If your table
+has n columns, each list will have n+1 elements. Each list corresponds to one
+row of the table. The first element will be whatever data is in the relevant
+row, in the column specified by the `sortby` argument. The remaining n
+elements are the data in each of the table's columns, in order, including a
+repeated instance of the data in the `sortby` column.
+
+= Changing the appearance of your table - the easy way =
+
+By default, PrettyTable produces ASCII tables that look like the ones used in
+SQL database shells. But it can print them in a variety of other formats as
+well. If the format you want to use is common, PrettyTable makes this very
+easy for you to do using the `set_style` method. If you want to produce an
+uncommon table, you'll have to do things the slightly harder way (see later).
+
+== Setting a table style ==
+
+You can set the style for your table using the `set_style` method before any
+calls to `print` or `get_string`. Here's how to print a table in a format
+which works nicely with Microsoft Word's "Convert to table" feature:
+
+from prettytable import MSWORD_FRIENDLY
+x.set_style(MSWORD_FRIENDLY)
+print x
+
+In addition to `MSWORD_FRIENDLY` there are currently two other in-built styles
+you can use for your tables:
+
+ * `DEFAULT` - The default look, used to undo any style changes you may have
+made
+ * `PLAIN_COLUMNS` - A borderless style that works well with command line
+programs for columnar data
+
+Other styles are likely to appear in future releases.
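+Under the hood, the `sortby`/`sort_key` machinery described earlier is a plain
+decorate-sort-undecorate pass (see `_get_rows` in prettytable.py). A
+standard-library-only sketch, using a hypothetical subset of the city data:

```python
# Hypothetical rows; a real table's _rows work the same way.
rows = [["Adelaide", 1158259], ["Darwin", 120900], ["Hobart", 205556]]
sortindex = 1  # index of the column named by sortby ("Population")

# Decorate: prepend the sortby column's value to each row
rows = [[row[sortindex]] + row for row in rows]
# Sort: the sort_key callable receives each decorated row
rows.sort(reverse=False, key=lambda row: row[0])
# Undecorate: strip the prepended value again
rows = [row[1:] for row in rows]

print(rows)  # Darwin, then Hobart, then Adelaide (ascending population)
```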
+
+= Changing the appearance of your table - the hard way =
+
+If you want to display your table in a style other than one of the in-built
+styles listed above, you'll have to set things up the hard way.
+
+Don't worry, it's not really that hard!
+
+== Style options ==
+
+PrettyTable has a number of style options which control various aspects of how
+tables are displayed. You have the freedom to set each of these options
+individually to whatever you prefer. The `set_style` method just does this
+automatically for you.
+
+The options are these:
+
+ * `border` - A boolean option (must be `True` or `False`). Controls whether
+   or not a border is drawn around the table.
+ * `header` - A boolean option (must be `True` or `False`). Controls whether
+   or not the first row of the table is a header showing the names of all the
+   fields.
+ * `hrules` - Controls printing of horizontal rules after rows. Allowed
+   values: FRAME, HEADER, ALL, NONE - note that these are variables defined
+   inside the `prettytable` module so make sure you import them or use
+   `prettytable.FRAME` etc.
+ * `vrules` - Controls printing of vertical rules between columns. Allowed
+   values: FRAME, ALL, NONE.
+ * `int_format` - A string which controls the way integer data is printed.
+   This works like: print "%d" % data
+ * `float_format` - A string which controls the way floating point data is
+   printed. This works like: print "%f" % data
+ * `padding_width` - Number of spaces on either side of column data (only used
+   if left and right paddings are None).
+ * `left_padding_width` - Number of spaces on left hand side of column data.
+ * `right_padding_width` - Number of spaces on right hand side of column data.
+ * `vertical_char` - Single character string used to draw vertical lines.
+   Default is `|`.
+ * `horizontal_char` - Single character string used to draw horizontal lines.
+   Default is `-`.
+ * `junction_char` - Single character string used to draw line junctions.
+   Default is `+`.
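+The `int_format` and `float_format` options listed above are spliced into a
+printf-style format string before being applied to each cell (see
+`_format_value` in prettytable.py). A standard-library-only sketch of that
+splicing; the option values "04" and "6.1" are assumed examples:

```python
# Assumed example option values, as in e.g.
# x.int_format["Area"] = "04" and x.float_format = "6.1"
int_format = "04"
float_format = "6.1"

# Mirrors _format_value: "%" + option string + type code,
# then %-applied to the cell value.
area = ("%%%sd" % int_format) % 112       # "%04d" % 112
rain = ("%%%sf" % float_format) % 1714.7  # "%6.1f" % 1714.7

print(area)  # 0112
print(rain)  # 1714.7
```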
+
+You can set the style options to your own settings in two ways:
+
+== Setting style options for the long term ==
+
+If you want to print your table with a different style several times, you can
+set your option for the "long term" just by changing the appropriate
+attributes. If you never want your tables to have borders you can do this:
+
+x.border = False
+print x
+print x
+print x
+
+None of the three tables printed by this will have borders, even if you do
+things like add extra rows in between them. The lack of borders will last
+until you do:
+
+x.border = True
+
+to turn them on again. This sort of long term setting is exactly how
+`set_style` works. `set_style` just sets a bunch of attributes to pre-set
+values for you.
+
+Note that if you know what style options you want at the moment you are
+creating your table, you can specify them using keyword arguments to the
+constructor. For example, the following two code blocks are equivalent:
+
+x = PrettyTable()
+x.border = False
+x.header = False
+x.padding_width = 5
+
+x = PrettyTable(border=False, header=False, padding_width=5)
+
+== Changing style options just once ==
+
+If you don't want to make long term style changes by changing an attribute like
+in the previous section, you can make changes that last for just one
+``get_string`` by giving those methods keyword arguments. To print two
+"normal" tables with one borderless table between them, you could do this:
+
+print x
+print x.get_string(border=False)
+print x
+
+= Displaying your table in HTML form =
+
+PrettyTable will also print your tables in HTML form, as `<table>`s. Just like
+in ASCII form, you can actually print your table - just use `print_html()` - or
+get a string representation - just use `get_html_string()`. HTML printing
+supports the `fields`, `start`, `end`, `sortby` and `reversesort` arguments in
+exactly the same way as ASCII printing.
+
+== Styling HTML tables ==
+
+By default, PrettyTable outputs HTML for "vanilla" tables. The HTML code is
+quite simple. It looks like this:
+
+<table>
+    <tr>
+        <th>City name</th>
+        <th>Area</th>
+        <th>Population</th>
+        <th>Annual Rainfall</th>
+    </tr>
+    <tr>
+        <td>Adelaide</td>
+        <td>1295</td>
+        <td>1158259</td>
+        <td>600.5</td>
+    </tr>
+    <tr>
+        <td>Brisbane</td>
+        <td>5905</td>
+        <td>1857594</td>
+        <td>1146.4</td>
+    </tr>
+    ...
+    ...
+    ...
+</table>
+
+If you like, you can ask PrettyTable to do its best to mimic the style options
+that your table has set using inline CSS. This is done by giving a
+`format=True` keyword argument to either the `print_html` or `get_html_string`
+methods. Note that if you _always_ want to print formatted HTML you can do:
+
+x.format = True
+
+and the setting will persist until you turn it off.
+
+Just like with ASCII tables, if you want to change the table's style for just
+one `print_html` or one `get_html_string` you can pass those methods keyword
+arguments - exactly like `print` and `get_string`.
+
+== Setting HTML attributes ==
+
+You can provide a dictionary of HTML attribute name/value pairs to the
+`print_html` and `get_html_string` methods using the `attributes` keyword
+argument. This lets you specify common HTML attributes like `name`, `id` and
+`class` that can be used for linking to your tables or customising their
+appearance using CSS. For example:
+
+x.print_html(attributes={"name":"my_table", "class":"red_table"})
+
+will print:
+
+<table name="my_table" class="red_table">
+    <tr>
+        <th>City name</th>
+        <th>Area</th>
+        <th>Population</th>
+        <th>Annual Rainfall</th>
+    </tr>
+    ...
+    ...
+    ...
+</table>
+
+= Miscellaneous things =
+
+== Copying a table ==
+
+You can call the `copy` method on a PrettyTable object without arguments to
+return an identical independent copy of the table.
+
+If you want a copy of a PrettyTable object with just a subset of the rows,
+you can use list slicing notation:
+
+new_table = old_table[0:5]
diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/LICENSE remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/LICENSE
--- remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/LICENSE	1970-01-01 00:00:00.000000000 +0000
+++ remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/LICENSE	2016-11-04 21:28:21.000000000 +0000
@@ -0,0 +1,18 @@
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/pyparsing.py remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/pyparsing.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/pyparsing.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/pyparsing.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,3764 @@ +# module pyparsing.py +# +# Copyright (c) 2003-2013 Paul T. McGuire +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + +__doc__ = \ +""" +pyparsing module - Classes and methods to define and execute parsing grammars + +The pyparsing module is an alternative approach to creating and executing simple grammars, +vs. the traditional lex/yacc approach, or the use of regular expressions. 
With pyparsing, you
+don't need to learn a new syntax for defining grammars or matching expressions - the parsing module
+provides a library of classes that you use to construct the grammar directly in Python.
+
+Here is a program to parse "Hello, World!" (or any greeting of the form C{"<salutation>, <addressee>!"})::
+
+    from pyparsing import Word, alphas
+
+    # define grammar of a greeting
+    greet = Word( alphas ) + "," + Word( alphas ) + "!"
+
+    hello = "Hello, World!"
+    print (hello, "->", greet.parseString( hello ))
+
+The program outputs the following::
+
+    Hello, World! -> ['Hello', ',', 'World', '!']
+
+The Python representation of the grammar is quite readable, owing to the self-explanatory
+class names, and the use of '+', '|' and '^' operators.
+
+The parsed results returned from C{parseString()} can be accessed as a nested list, a dictionary, or an
+object with named attributes.
+
+The pyparsing module handles some of the problems that are typically vexing when writing text parsers:
+ - extra or missing whitespace (the above program will also handle "Hello,World!", "Hello , World !", etc.)
+ - quoted strings + - embedded comments +""" + +__version__ = "2.0.3" +__versionTime__ = "16 Aug 2014 00:12" +__author__ = "Paul McGuire " + +import string +from weakref import ref as wkref +import copy +import sys +import warnings +import re +import sre_constants +import collections +import pprint +#~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) ) + +__all__ = [ +'And', 'CaselessKeyword', 'CaselessLiteral', 'CharsNotIn', 'Combine', 'Dict', 'Each', 'Empty', +'FollowedBy', 'Forward', 'GoToColumn', 'Group', 'Keyword', 'LineEnd', 'LineStart', 'Literal', +'MatchFirst', 'NoMatch', 'NotAny', 'OneOrMore', 'OnlyOnce', 'Optional', 'Or', +'ParseBaseException', 'ParseElementEnhance', 'ParseException', 'ParseExpression', 'ParseFatalException', +'ParseResults', 'ParseSyntaxException', 'ParserElement', 'QuotedString', 'RecursiveGrammarException', +'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter', 'Upcase', +'White', 'Word', 'WordEnd', 'WordStart', 'ZeroOrMore', +'alphanums', 'alphas', 'alphas8bit', 'anyCloseTag', 'anyOpenTag', 'cStyleComment', 'col', +'commaSeparatedList', 'commonHTMLEntity', 'countedArray', 'cppStyleComment', 'dblQuotedString', +'dblSlashComment', 'delimitedList', 'dictOf', 'downcaseTokens', 'empty', 'hexnums', +'htmlComment', 'javaStyleComment', 'keepOriginalText', 'line', 'lineEnd', 'lineStart', 'lineno', +'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral', +'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables', +'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity', +'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd', +'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute', +'indentedBlock', 'originalTextFor', 'ungroup', 'infixNotation','locatedExpr', +] + +PY_3 = sys.version.startswith('3') +if PY_3: + 
_MAX_INT = sys.maxsize + basestring = str + unichr = chr + _ustr = str + + # build list of single arg builtins, that can be used as parse actions + singleArgBuiltins = [sum, len, sorted, reversed, list, tuple, set, any, all, min, max] + +else: + _MAX_INT = sys.maxint + range = xrange + + def _ustr(obj): + """Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries + str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It + then < returns the unicode object | encodes it with the default encoding | ... >. + """ + if isinstance(obj,unicode): + return obj + + try: + # If this works, then _ustr(obj) has the same behaviour as str(obj), so + # it won't break any existing code. + return str(obj) + + except UnicodeEncodeError: + # The Python docs (http://docs.python.org/ref/customization.html#l2h-182) + # state that "The return value must be a string object". However, does a + # unicode object (being a subclass of basestring) count as a "string + # object"? + # If so, then return a unicode object: + return unicode(obj) + # Else encode it... but how? There are many choices... :) + # Replace unprintables with escape codes? + #return unicode(obj).encode(sys.getdefaultencoding(), 'backslashreplace_errors') + # Replace unprintables with question marks? + #return unicode(obj).encode(sys.getdefaultencoding(), 'replace') + # ... + + # build list of single arg builtins, tolerant of Python version, that can be used as parse actions + singleArgBuiltins = [] + import __builtin__ + for fname in "sum len sorted reversed list tuple set any all min max".split(): + try: + singleArgBuiltins.append(getattr(__builtin__,fname)) + except AttributeError: + continue + +_generatorType = type((y for y in range(1))) + +def _xml_escape(data): + """Escape &, <, >, ", ', etc. 
in a string of data.""" + + # ampersand must be replaced first + from_symbols = '&><"\'' + to_symbols = ('&'+s+';' for s in "amp gt lt quot apos".split()) + for from_,to_ in zip(from_symbols, to_symbols): + data = data.replace(from_, to_) + return data + +class _Constants(object): + pass + +alphas = string.ascii_lowercase + string.ascii_uppercase +nums = "0123456789" +hexnums = nums + "ABCDEFabcdef" +alphanums = alphas + nums +_bslash = chr(92) +printables = "".join(c for c in string.printable if c not in string.whitespace) + +class ParseBaseException(Exception): + """base exception class for all parsing runtime exceptions""" + # Performance tuning: we construct a *lot* of these, so keep this + # constructor as small and fast as possible + def __init__( self, pstr, loc=0, msg=None, elem=None ): + self.loc = loc + if msg is None: + self.msg = pstr + self.pstr = "" + else: + self.msg = msg + self.pstr = pstr + self.parserElement = elem + + def __getattr__( self, aname ): + """supported attributes by name are: + - lineno - returns the line number of the exception text + - col - returns the column number of the exception text + - line - returns the line containing the exception text + """ + if( aname == "lineno" ): + return lineno( self.loc, self.pstr ) + elif( aname in ("col", "column") ): + return col( self.loc, self.pstr ) + elif( aname == "line" ): + return line( self.loc, self.pstr ) + else: + raise AttributeError(aname) + + def __str__( self ): + return "%s (at char %d), (line:%d, col:%d)" % \ + ( self.msg, self.loc, self.lineno, self.column ) + def __repr__( self ): + return _ustr(self) + def markInputline( self, markerString = ">!<" ): + """Extracts the exception line from the input string, and marks + the location of the exception with a special symbol. 
+ """ + line_str = self.line + line_column = self.column - 1 + if markerString: + line_str = "".join((line_str[:line_column], + markerString, line_str[line_column:])) + return line_str.strip() + def __dir__(self): + return "loc msg pstr parserElement lineno col line " \ + "markInputline __str__ __repr__".split() + +class ParseException(ParseBaseException): + """exception thrown when parse expressions don't match class; + supported attributes by name are: + - lineno - returns the line number of the exception text + - col - returns the column number of the exception text + - line - returns the line containing the exception text + """ + pass + +class ParseFatalException(ParseBaseException): + """user-throwable exception thrown when inconsistent parse content + is found; stops all parsing immediately""" + pass + +class ParseSyntaxException(ParseFatalException): + """just like C{L{ParseFatalException}}, but thrown internally when an + C{L{ErrorStop}} ('-' operator) indicates that parsing is to stop immediately because + an unbacktrackable syntax error has been found""" + def __init__(self, pe): + super(ParseSyntaxException, self).__init__( + pe.pstr, pe.loc, pe.msg, pe.parserElement) + +#~ class ReparseException(ParseBaseException): + #~ """Experimental class - parse actions can raise this exception to cause + #~ pyparsing to reparse the input string: + #~ - with a modified input string, and/or + #~ - with a modified start location + #~ Set the values of the ReparseException in the constructor, and raise the + #~ exception in a parse action to cause pyparsing to use the new string/location. + #~ Setting the values as None causes no change to be made. 
+ #~ """ + #~ def __init_( self, newstring, restartLoc ): + #~ self.newParseText = newstring + #~ self.reparseLoc = restartLoc + +class RecursiveGrammarException(Exception): + """exception thrown by C{validate()} if the grammar could be improperly recursive""" + def __init__( self, parseElementList ): + self.parseElementTrace = parseElementList + + def __str__( self ): + return "RecursiveGrammarException: %s" % self.parseElementTrace + +class _ParseResultsWithOffset(object): + def __init__(self,p1,p2): + self.tup = (p1,p2) + def __getitem__(self,i): + return self.tup[i] + def __repr__(self): + return repr(self.tup) + def setOffset(self,i): + self.tup = (self.tup[0],i) + +class ParseResults(object): + """Structured parse results, to provide multiple means of access to the parsed data: + - as a list (C{len(results)}) + - by list index (C{results[0], results[1]}, etc.) + - by attribute (C{results.}) + """ + def __new__(cls, toklist, name=None, asList=True, modal=True ): + if isinstance(toklist, cls): + return toklist + retobj = object.__new__(cls) + retobj.__doinit = True + return retobj + + # Performance tuning: we construct a *lot* of these, so keep this + # constructor as small and fast as possible + def __init__( self, toklist, name=None, asList=True, modal=True, isinstance=isinstance ): + if self.__doinit: + self.__doinit = False + self.__name = None + self.__parent = None + self.__accumNames = {} + if isinstance(toklist, list): + self.__toklist = toklist[:] + elif isinstance(toklist, _generatorType): + self.__toklist = list(toklist) + else: + self.__toklist = [toklist] + self.__tokdict = dict() + + if name is not None and name: + if not modal: + self.__accumNames[name] = 0 + if isinstance(name,int): + name = _ustr(name) # will always return a str, but use _ustr for consistency + self.__name = name + if not (isinstance(toklist, (type(None), basestring, list)) and toklist in (None,'',[])): + if isinstance(toklist,basestring): + toklist = [ toklist ] + if asList: + 
if isinstance(toklist,ParseResults): + self[name] = _ParseResultsWithOffset(toklist.copy(),0) + else: + self[name] = _ParseResultsWithOffset(ParseResults(toklist[0]),0) + self[name].__name = name + else: + try: + self[name] = toklist[0] + except (KeyError,TypeError,IndexError): + self[name] = toklist + + def __getitem__( self, i ): + if isinstance( i, (int,slice) ): + return self.__toklist[i] + else: + if i not in self.__accumNames: + return self.__tokdict[i][-1][0] + else: + return ParseResults([ v[0] for v in self.__tokdict[i] ]) + + def __setitem__( self, k, v, isinstance=isinstance ): + if isinstance(v,_ParseResultsWithOffset): + self.__tokdict[k] = self.__tokdict.get(k,list()) + [v] + sub = v[0] + elif isinstance(k,int): + self.__toklist[k] = v + sub = v + else: + self.__tokdict[k] = self.__tokdict.get(k,list()) + [_ParseResultsWithOffset(v,0)] + sub = v + if isinstance(sub,ParseResults): + sub.__parent = wkref(self) + + def __delitem__( self, i ): + if isinstance(i,(int,slice)): + mylen = len( self.__toklist ) + del self.__toklist[i] + + # convert int to slice + if isinstance(i, int): + if i < 0: + i += mylen + i = slice(i, i+1) + # get removed indices + removed = list(range(*i.indices(mylen))) + removed.reverse() + # fixup indices in token dictionary + for name in self.__tokdict: + occurrences = self.__tokdict[name] + for j in removed: + for k, (value, position) in enumerate(occurrences): + occurrences[k] = _ParseResultsWithOffset(value, position - (position > j)) + else: + del self.__tokdict[i] + + def __contains__( self, k ): + return k in self.__tokdict + + def __len__( self ): return len( self.__toklist ) + def __bool__(self): return len( self.__toklist ) > 0 + __nonzero__ = __bool__ + def __iter__( self ): return iter( self.__toklist ) + def __reversed__( self ): return iter( self.__toklist[::-1] ) + def iterkeys( self ): + """Returns all named result keys.""" + if hasattr(self.__tokdict, "iterkeys"): + return self.__tokdict.iterkeys() + else: + return 
iter(self.__tokdict) + + def itervalues( self ): + """Returns all named result values.""" + return (self[k] for k in self.iterkeys()) + + def iteritems( self ): + return ((k, self[k]) for k in self.iterkeys()) + + if PY_3: + keys = iterkeys + values = itervalues + items = iteritems + else: + def keys( self ): + """Returns all named result keys.""" + return list(self.iterkeys()) + + def values( self ): + """Returns all named result values.""" + return list(self.itervalues()) + + def items( self ): + """Returns all named result keys and values as a list of tuples.""" + return list(self.iteritems()) + + def haskeys( self ): + """Since keys() returns an iterator, this method is helpful in bypassing + code that looks for the existence of any defined results names.""" + return bool(self.__tokdict) + + def pop( self, *args, **kwargs): + """Removes and returns item at specified index (default=last). + Supports both list and dict semantics for pop(). If passed no + argument or an integer argument, it will use list semantics + and pop tokens from the list of parsed tokens. If passed a + non-integer argument (most likely a string), it will use dict + semantics and pop the corresponding value from any defined + results names. 
A second default return value argument is + supported, just as in dict.pop().""" + if not args: + args = [-1] + for k,v in kwargs.items(): + if k == 'default': + args = (args[0], v) + else: + raise TypeError("pop() got an unexpected keyword argument '%s'" % k) + if (isinstance(args[0], int) or + len(args) == 1 or + args[0] in self): + index = args[0] + ret = self[index] + del self[index] + return ret + else: + defaultvalue = args[1] + return defaultvalue + + def get(self, key, defaultValue=None): + """Returns named result matching the given key, or if there is no + such name, then returns the given C{defaultValue} or C{None} if no + C{defaultValue} is specified.""" + if key in self: + return self[key] + else: + return defaultValue + + def insert( self, index, insStr ): + """Inserts new element at location index in the list of parsed tokens.""" + self.__toklist.insert(index, insStr) + # fixup indices in token dictionary + for name in self.__tokdict: + occurrences = self.__tokdict[name] + for k, (value, position) in enumerate(occurrences): + occurrences[k] = _ParseResultsWithOffset(value, position + (position > index)) + + def append( self, item ): + """Add single element to end of ParseResults list of elements.""" + self.__toklist.append(item) + + def extend( self, itemseq ): + """Add sequence of elements to end of ParseResults list of elements.""" + if isinstance(itemseq, ParseResults): + self += itemseq + else: + self.__toklist.extend(itemseq) + + def clear( self ): + """Clear all elements and results names.""" + del self.__toklist[:] + self.__tokdict.clear() + + def __getattr__( self, name ): + try: + return self[name] + except KeyError: + return "" + + if name in self.__tokdict: + if name not in self.__accumNames: + return self.__tokdict[name][-1][0] + else: + return ParseResults([ v[0] for v in self.__tokdict[name] ]) + else: + return "" + + def __add__( self, other ): + ret = self.copy() + ret += other + return ret + + def __iadd__( self, other ): + if 
other.__tokdict: + offset = len(self.__toklist) + addoffset = ( lambda a: (a<0 and offset) or (a+offset) ) + otheritems = other.__tokdict.items() + otherdictitems = [(k, _ParseResultsWithOffset(v[0],addoffset(v[1])) ) + for (k,vlist) in otheritems for v in vlist] + for k,v in otherdictitems: + self[k] = v + if isinstance(v[0],ParseResults): + v[0].__parent = wkref(self) + + self.__toklist += other.__toklist + self.__accumNames.update( other.__accumNames ) + return self + + def __radd__(self, other): + if isinstance(other,int) and other == 0: + return self.copy() + + def __repr__( self ): + return "(%s, %s)" % ( repr( self.__toklist ), repr( self.__tokdict ) ) + + def __str__( self ): + out = [] + for i in self.__toklist: + if isinstance(i, ParseResults): + out.append(_ustr(i)) + else: + out.append(repr(i)) + return '[' + ', '.join(out) + ']' + + def _asStringList( self, sep='' ): + out = [] + for item in self.__toklist: + if out and sep: + out.append(sep) + if isinstance( item, ParseResults ): + out += item._asStringList() + else: + out.append( _ustr(item) ) + return out + + def asList( self ): + """Returns the parse results as a nested list of matching tokens, all converted to strings.""" + out = [] + for res in self.__toklist: + if isinstance(res,ParseResults): + out.append( res.asList() ) + else: + out.append( res ) + return out + + def asDict( self ): + """Returns the named parse results as dictionary.""" + if PY_3: + return dict( self.items() ) + else: + return dict( self.iteritems() ) + + def copy( self ): + """Returns a new copy of a C{ParseResults} object.""" + ret = ParseResults( self.__toklist ) + ret.__tokdict = self.__tokdict.copy() + ret.__parent = self.__parent + ret.__accumNames.update( self.__accumNames ) + ret.__name = self.__name + return ret + + def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ): + """Returns the parse results as XML. 
Tags are created for tokens and lists that have defined results names.""" + nl = "\n" + out = [] + namedItems = dict((v[1],k) for (k,vlist) in self.__tokdict.items() + for v in vlist) + nextLevelIndent = indent + " " + + # collapse out indents if formatting is not desired + if not formatted: + indent = "" + nextLevelIndent = "" + nl = "" + + selfTag = None + if doctag is not None: + selfTag = doctag + else: + if self.__name: + selfTag = self.__name + + if not selfTag: + if namedItemsOnly: + return "" + else: + selfTag = "ITEM" + + out += [ nl, indent, "<", selfTag, ">" ] + + worklist = self.__toklist + for i,res in enumerate(worklist): + if isinstance(res,ParseResults): + if i in namedItems: + out += [ res.asXML(namedItems[i], + namedItemsOnly and doctag is None, + nextLevelIndent, + formatted)] + else: + out += [ res.asXML(None, + namedItemsOnly and doctag is None, + nextLevelIndent, + formatted)] + else: + # individual token, see if there is a name for it + resTag = None + if i in namedItems: + resTag = namedItems[i] + if not resTag: + if namedItemsOnly: + continue + else: + resTag = "ITEM" + xmlBodyText = _xml_escape(_ustr(res)) + out += [ nl, nextLevelIndent, "<", resTag, ">", + xmlBodyText, + "" ] + + out += [ nl, indent, "" ] + return "".join(out) + + def __lookup(self,sub): + for k,vlist in self.__tokdict.items(): + for v,loc in vlist: + if sub is v: + return k + return None + + def getName(self): + """Returns the results name for this token expression.""" + if self.__name: + return self.__name + elif self.__parent: + par = self.__parent() + if par: + return par.__lookup(self) + else: + return None + elif (len(self) == 1 and + len(self.__tokdict) == 1 and + self.__tokdict.values()[0][0][1] in (0,-1)): + return self.__tokdict.keys()[0] + else: + return None + + def dump(self,indent='',depth=0): + """Diagnostic method for listing out the contents of a C{ParseResults}. 
+ Accepts an optional C{indent} argument so that this string can be embedded + in a nested display of other data.""" + out = [] + NL = '\n' + out.append( indent+_ustr(self.asList()) ) + items = sorted(self.items()) + for k,v in items: + if out: + out.append(NL) + out.append( "%s%s- %s: " % (indent,(' '*depth), k) ) + if isinstance(v,ParseResults): + if v: + if v.haskeys(): + out.append( v.dump(indent,depth+1) ) + elif any(isinstance(vv,ParseResults) for vv in v): + for i,vv in enumerate(v): + if isinstance(vv,ParseResults): + out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth+1)),i,indent,(' '*(depth+2)),vv.dump(indent,depth+2) )) + else: + out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth+1)),i,indent,(' '*(depth+2)),_ustr(vv))) + else: + out.append(_ustr(v)) + else: + out.append(_ustr(v)) + else: + out.append(_ustr(v)) + return "".join(out) + + def pprint(self, *args, **kwargs): + """Pretty-printer for parsed results as a list, using the C{pprint} module. + Accepts additional positional or keyword args as defined for the + C{pprint.pprint} method. (U{http://docs.python.org/3/library/pprint.html#pprint.pprint})""" + pprint.pprint(self.asList(), *args, **kwargs) + + # add support for pickle protocol + def __getstate__(self): + return ( self.__toklist, + ( self.__tokdict.copy(), + self.__parent is not None and self.__parent() or None, + self.__accumNames, + self.__name ) ) + + def __setstate__(self,state): + self.__toklist = state[0] + (self.__tokdict, + par, + inAccumNames, + self.__name) = state[1] + self.__accumNames = {} + self.__accumNames.update(inAccumNames) + if par is not None: + self.__parent = wkref(par) + else: + self.__parent = None + + def __dir__(self): + return dir(super(ParseResults,self)) + list(self.keys()) + +collections.MutableMapping.register(ParseResults) + +def col (loc,strg): + """Returns current column within a string, counting newlines as line separators. + The first column is number 1. 
+
+   Note: the default parsing behavior is to expand tabs in the input string
+   before starting the parsing process.  See L{I{ParserElement.parseString}<ParserElement.parseString>} for more information
+   on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
+   consistent view of the parsed string, the parse location, and line and column
+   positions within the parsed string.
+   """
+    return (loc<len(strg) and strg[loc] == '\n') and 1 or loc - strg.rfind("\n", 0, loc)
+
+def lineno(loc,strg):
+    """Returns current line number within a string, counting newlines as line separators.
+   The first line is number 1.
+
+   Note: the default parsing behavior is to expand tabs in the input string
+   before starting the parsing process.  See L{I{ParserElement.parseString}<ParserElement.parseString>} for more information
+   on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
+   consistent view of the parsed string, the parse location, and line and column
+   positions within the parsed string.
+   """
+    return strg.count("\n",0,loc) + 1
+
+def line( loc, strg ):
+    """Returns the line of text containing loc within a string, counting newlines as line separators.
+       """
+    lastCR = strg.rfind("\n", 0, loc)
+    nextCR = strg.find("\n", loc)
+    if nextCR >= 0:
+        return strg[lastCR+1:nextCR]
+    else:
+        return strg[lastCR+1:]
+
+def _defaultStartDebugAction( instring, loc, expr ):
+    print (("Match " + _ustr(expr) + " at loc " + _ustr(loc) + "(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )))
+
+def _defaultSuccessDebugAction( instring, startloc, endloc, expr, toks ):
+    print ("Matched " + _ustr(expr) + " -> " + str(toks.asList()))
+
+def _defaultExceptionDebugAction( instring, loc, expr, exc ):
+    print ("Exception raised:" + _ustr(exc))
+
+def nullDebugAction(*args):
+    """'Do-nothing' debug action, to suppress debugging output during parsing."""
+    pass
+
+# Only works on Python 3.x - nonlocal is toxic to Python 2 installs
+#~ 'decorator to trim function calls to match the arity of the target'
+#~ def _trim_arity(func, maxargs=3):
+    #~ if func in singleArgBuiltins:
+        #~ return lambda s,l,t: func(t)
+    #~ limit = 0
+    #~ foundArity = False
+    #~ def wrapper(*args):
+        #~ nonlocal limit,foundArity
+        #~ while 1:
+            #~ try:
+                #~ ret = func(*args[limit:])
+                #~ foundArity = True
+                #~ return ret
+            #~ except TypeError:
+                #~ if limit == maxargs or foundArity:
+                    #~ raise
+                #~ limit += 1
+                #~ 
continue + #~ return wrapper + +# this version is Python 2.x-3.x cross-compatible +'decorator to trim function calls to match the arity of the target' +def _trim_arity(func, maxargs=2): + if func in singleArgBuiltins: + return lambda s,l,t: func(t) + limit = [0] + foundArity = [False] + def wrapper(*args): + while 1: + try: + ret = func(*args[limit[0]:]) + foundArity[0] = True + return ret + except TypeError: + if limit[0] <= maxargs and not foundArity[0]: + limit[0] += 1 + continue + raise + return wrapper + +class ParserElement(object): + """Abstract base level parser element class.""" + DEFAULT_WHITE_CHARS = " \n\t\r" + verbose_stacktrace = False + + def setDefaultWhitespaceChars( chars ): + """Overrides the default whitespace chars + """ + ParserElement.DEFAULT_WHITE_CHARS = chars + setDefaultWhitespaceChars = staticmethod(setDefaultWhitespaceChars) + + def inlineLiteralsUsing(cls): + """ + Set class to be used for inclusion of string literals into a parser. + """ + ParserElement.literalStringClass = cls + inlineLiteralsUsing = staticmethod(inlineLiteralsUsing) + + def __init__( self, savelist=False ): + self.parseAction = list() + self.failAction = None + #~ self.name = "" # don't define self.name, let subclasses try/except upcall + self.strRepr = None + self.resultsName = None + self.saveAsList = savelist + self.skipWhitespace = True + self.whiteChars = ParserElement.DEFAULT_WHITE_CHARS + self.copyDefaultWhiteChars = True + self.mayReturnEmpty = False # used when checking for left-recursion + self.keepTabs = False + self.ignoreExprs = list() + self.debug = False + self.streamlined = False + self.mayIndexError = True # used to optimize exception handling for subclasses that don't advance parse index + self.errmsg = "" + self.modalResults = True # used to mark results names as modal (report only last) or cumulative (list all) + self.debugActions = ( None, None, None ) #custom debug actions + self.re = None + self.callPreparse = True # used to avoid redundant 
calls to preParse + self.callDuringTry = False + + def copy( self ): + """Make a copy of this C{ParserElement}. Useful for defining different parse actions + for the same parsing pattern, using copies of the original parse element.""" + cpy = copy.copy( self ) + cpy.parseAction = self.parseAction[:] + cpy.ignoreExprs = self.ignoreExprs[:] + if self.copyDefaultWhiteChars: + cpy.whiteChars = ParserElement.DEFAULT_WHITE_CHARS + return cpy + + def setName( self, name ): + """Define name for this expression, for use in debugging.""" + self.name = name + self.errmsg = "Expected " + self.name + if hasattr(self,"exception"): + self.exception.msg = self.errmsg + return self + + def setResultsName( self, name, listAllMatches=False ): + """Define name for referencing matching tokens as a nested attribute + of the returned parse results. + NOTE: this returns a *copy* of the original C{ParserElement} object; + this is so that the client can define a basic element, such as an + integer, and reference it in multiple places with different names. + + You can also set results names using the abbreviated syntax, + C{expr("name")} in place of C{expr.setResultsName("name")} - + see L{I{__call__}<__call__>}. + """ + newself = self.copy() + if name.endswith("*"): + name = name[:-1] + listAllMatches=True + newself.resultsName = name + newself.modalResults = not listAllMatches + return newself + + def setBreak(self,breakFlag = True): + """Method to invoke the Python pdb debugger when this element is + about to be parsed. Set C{breakFlag} to True to enable, False to + disable. 
+ """ + if breakFlag: + _parseMethod = self._parse + def breaker(instring, loc, doActions=True, callPreParse=True): + import pdb + pdb.set_trace() + return _parseMethod( instring, loc, doActions, callPreParse ) + breaker._originalParseMethod = _parseMethod + self._parse = breaker + else: + if hasattr(self._parse,"_originalParseMethod"): + self._parse = self._parse._originalParseMethod + return self + + def setParseAction( self, *fns, **kwargs ): + """Define action to perform when successfully matching parse element definition. + Parse action fn is a callable method with 0-3 arguments, called as C{fn(s,loc,toks)}, + C{fn(loc,toks)}, C{fn(toks)}, or just C{fn()}, where: + - s = the original string being parsed (see note below) + - loc = the location of the matching substring + - toks = a list of the matched tokens, packaged as a C{L{ParseResults}} object + If the functions in fns modify the tokens, they can return them as the return + value from fn, and the modified list of tokens will replace the original. + Otherwise, fn does not need to return any value. + + Note: the default parsing behavior is to expand tabs in the input string + before starting the parsing process. See L{I{parseString}} for more information + on parsing strings containing C{}s, and suggested methods to maintain a + consistent view of the parsed string, the parse location, and line and column + positions within the parsed string. + """ + self.parseAction = list(map(_trim_arity, list(fns))) + self.callDuringTry = ("callDuringTry" in kwargs and kwargs["callDuringTry"]) + return self + + def addParseAction( self, *fns, **kwargs ): + """Add parse action to expression's list of parse actions. See L{I{setParseAction}}.""" + self.parseAction += list(map(_trim_arity, list(fns))) + self.callDuringTry = self.callDuringTry or ("callDuringTry" in kwargs and kwargs["callDuringTry"]) + return self + + def setFailAction( self, fn ): + """Define action to perform if parsing fails at this expression. 
+        Fail action fn is a callable function that takes the arguments
+        C{fn(s,loc,expr,err)} where:
+         - s = string being parsed
+         - loc = location where expression match was attempted and failed
+         - expr = the parse expression that failed
+         - err = the exception thrown
+        The function returns no value.  It may throw C{L{ParseFatalException}}
+        if it is desired to stop parsing immediately."""
+        self.failAction = fn
+        return self
+
+    def _skipIgnorables( self, instring, loc ):
+        exprsFound = True
+        while exprsFound:
+            exprsFound = False
+            for e in self.ignoreExprs:
+                try:
+                    while 1:
+                        loc,dummy = e._parse( instring, loc )
+                        exprsFound = True
+                except ParseException:
+                    pass
+        return loc
+
+    def preParse( self, instring, loc ):
+        if self.ignoreExprs:
+            loc = self._skipIgnorables( instring, loc )
+
+        if self.skipWhitespace:
+            wt = self.whiteChars
+            instrlen = len(instring)
+            while loc < instrlen and instring[loc] in wt:
+                loc += 1
+
+        return loc
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        return loc, []
+
+    def postParse( self, instring, loc, tokenlist ):
+        return tokenlist
+
+    #~ @profile
+    def _parseNoCache( self, instring, loc, doActions=True, callPreParse=True ):
+        debugging = ( self.debug ) #and doActions )
+
+        if debugging or self.failAction:
+            #~ print ("Match",self,"at loc",loc,"(%d,%d)" % ( lineno(loc,instring), col(loc,instring) ))
+            if (self.debugActions[0] ):
+                self.debugActions[0]( instring, loc, self )
+            if callPreParse and self.callPreparse:
+                preloc = self.preParse( instring, loc )
+            else:
+                preloc = loc
+            tokensStart = preloc
+            try:
+                try:
+                    loc,tokens = self.parseImpl( instring, preloc, doActions )
+                except IndexError:
+                    raise ParseException( instring, len(instring), self.errmsg, self )
+            except ParseBaseException as err:
+                #~ print ("Exception raised:", err)
+                if self.debugActions[2]:
+                    self.debugActions[2]( instring, tokensStart, self, err )
+                if self.failAction:
+                    self.failAction( instring, tokensStart, self, err )
+                raise
+        else:
+            if callPreParse and self.callPreparse:
+                preloc = self.preParse( instring, loc )
+            else:
+                preloc = loc
+            tokensStart = preloc
+            if self.mayIndexError or loc >= len(instring):
+                try:
+                    loc,tokens = self.parseImpl( instring, preloc, doActions )
+                except IndexError:
+                    raise ParseException( instring, len(instring), self.errmsg, self )
+            else:
+                loc,tokens = self.parseImpl( instring, preloc, doActions )
+
+        tokens = self.postParse( instring, loc, tokens )
+
+        retTokens = ParseResults( tokens, self.resultsName, asList=self.saveAsList, modal=self.modalResults )
+        if self.parseAction and (doActions or self.callDuringTry):
+            if debugging:
+                try:
+                    for fn in self.parseAction:
+                        tokens = fn( instring, tokensStart, retTokens )
+                        if tokens is not None:
+                            retTokens = ParseResults( tokens,
+                                                      self.resultsName,
+                                                      asList=self.saveAsList and isinstance(tokens,(ParseResults,list)),
+                                                      modal=self.modalResults )
+                except ParseBaseException as err:
+                    #~ print "Exception raised in user parse action:", err
+                    if (self.debugActions[2] ):
+                        self.debugActions[2]( instring, tokensStart, self, err )
+                    raise
+            else:
+                for fn in self.parseAction:
+                    tokens = fn( instring, tokensStart, retTokens )
+                    if tokens is not None:
+                        retTokens = ParseResults( tokens,
+                                                  self.resultsName,
+                                                  asList=self.saveAsList and isinstance(tokens,(ParseResults,list)),
+                                                  modal=self.modalResults )
+
+        if debugging:
+            #~ print ("Matched",self,"->",retTokens.asList())
+            if (self.debugActions[1] ):
+                self.debugActions[1]( instring, tokensStart, loc, self, retTokens )
+
+        return loc, retTokens
+
+    def tryParse( self, instring, loc ):
+        try:
+            return self._parse( instring, loc, doActions=False )[0]
+        except ParseFatalException:
+            raise ParseException( instring, loc, self.errmsg, self)
+
+    # this method gets repeatedly called during backtracking with the same arguments -
+    # we can cache these arguments and save ourselves the trouble of re-parsing the contained expression
+    def _parseCache( self, instring, loc, doActions=True, callPreParse=True ):
+        lookup = (self,instring,loc,callPreParse,doActions)
+        if lookup in ParserElement._exprArgCache:
+            value = ParserElement._exprArgCache[ lookup ]
+            if isinstance(value, Exception):
+                raise value
+            return (value[0],value[1].copy())
+        else:
+            try:
+                value = self._parseNoCache( instring, loc, doActions, callPreParse )
+                ParserElement._exprArgCache[ lookup ] = (value[0],value[1].copy())
+                return value
+            except ParseBaseException as pe:
+                pe.__traceback__ = None
+                ParserElement._exprArgCache[ lookup ] = pe
+                raise
+
+    _parse = _parseNoCache
+
+    # argument cache for optimizing repeated calls when backtracking through recursive expressions
+    _exprArgCache = {}
+    def resetCache():
+        ParserElement._exprArgCache.clear()
+    resetCache = staticmethod(resetCache)
+
+    _packratEnabled = False
+    def enablePackrat():
+        """Enables "packrat" parsing, which adds memoizing to the parsing logic.
+        Repeated parse attempts at the same string location (which happens
+        often in many complex grammars) can immediately return a cached value,
+        instead of re-executing parsing/validating code.  Memoizing is done of
+        both valid results and parsing exceptions.
+
+        This speedup may break existing programs that use parse actions that
+        have side-effects.  For this reason, packrat parsing is disabled when
+        you first import pyparsing.  To activate the packrat feature, your
+        program must call the class method C{ParserElement.enablePackrat()}.  If
+        your program uses C{psyco} to "compile as you go", you must call
+        C{enablePackrat} before calling C{psyco.full()}.  If you do not do this,
+        Python will crash.  For best results, call C{enablePackrat()} immediately
+        after importing pyparsing.
+        """
+        if not ParserElement._packratEnabled:
+            ParserElement._packratEnabled = True
+            ParserElement._parse = ParserElement._parseCache
+    enablePackrat = staticmethod(enablePackrat)
+
+    def parseString( self, instring, parseAll=False ):
+        """Execute the parse expression with the given string.
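The memoization that `_parseCache`/`enablePackrat` add (cache keyed on expression and location, with raised exceptions cached and replayed alongside successes) can be sketched standalone. A stdlib-only illustration under assumed names (`parse_word`, `parse_word_cached` are not pyparsing APIs):

```python
# Sketch of packrat memoization: cache (parser, string, loc) -> result,
# caching raised exceptions as well as successes, as _parseCache does.
calls = []

def parse_word(s, loc):
    calls.append(loc)                      # track real parse work
    if s[loc:loc + 2] != "ab":
        raise ValueError("no match at %d" % loc)
    return loc + 2, ["ab"]

_cache = {}

def parse_word_cached(s, loc):
    key = (id(parse_word), s, loc)
    if key in _cache:
        value = _cache[key]
        if isinstance(value, Exception):
            raise value                    # replay cached failure
        return value
    try:
        value = parse_word(s, loc)
        _cache[key] = value
        return value
    except ValueError as exc:
        _cache[key] = exc
        raise

parse_word_cached("abab", 0)
parse_word_cached("abab", 0)               # second call served from cache
# calls == [0]: the underlying parser ran only once
```

As the docstring warns, this only behaves identically when parse actions are side-effect free, since cached results skip re-running them.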
+        This is the main interface to the client code, once the complete
+        expression has been built.
+
+        If you want the grammar to require that the entire input string be
+        successfully parsed, then set C{parseAll} to True (equivalent to ending
+        the grammar with C{L{StringEnd()}}).
+
+        Note: C{parseString} implicitly calls C{expandtabs()} on the input string,
+        in order to report proper column numbers in parse actions.
+        If the input string contains tabs and
+        the grammar uses parse actions that use the C{loc} argument to index into the
+        string being parsed, you can ensure you have a consistent view of the input
+        string by:
+         - calling C{parseWithTabs} on your grammar before calling C{parseString}
+           (see L{I{parseWithTabs}<parseWithTabs>})
+         - define your parse action using the full C{(s,loc,toks)} signature, and
+           reference the input string using the parse action's C{s} argument
+         - explicitly expand the tabs in your input string before calling
+           C{parseString}
+        """
+        ParserElement.resetCache()
+        if not self.streamlined:
+            self.streamline()
+            #~ self.saveAsList = True
+        for e in self.ignoreExprs:
+            e.streamline()
+        if not self.keepTabs:
+            instring = instring.expandtabs()
+        try:
+            loc, tokens = self._parse( instring, 0 )
+            if parseAll:
+                loc = self.preParse( instring, loc )
+                se = Empty() + StringEnd()
+                se._parse( instring, loc )
+        except ParseBaseException as exc:
+            if ParserElement.verbose_stacktrace:
+                raise
+            else:
+                # catch and re-raise exception from here, clears out pyparsing internal stack trace
+                raise exc
+        else:
+            return tokens
+
+    def scanString( self, instring, maxMatches=_MAX_INT, overlap=False ):
+        """Scan the input string for expression matches.  Each match will return the
+        matching tokens, start location, and end location.  May be called with optional
+        C{maxMatches} argument, to clip scanning after 'n' matches are found.  If
+        C{overlap} is specified, then overlapping matches will be reported.
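The tab-expansion note above is the whole story behind the `keepTabs`/`parseWithTabs` machinery: locations index into the *expanded* string. The stdlib behavior it relies on:

```python
# parseString expands tabs by default, so reported locations index into
# the expanded string; a parse action using loc must see the same view.
raw = "a\tb"
expanded = raw.expandtabs()        # tab padded to the next multiple of 8

assert len(raw) == 3
assert len(expanded) == 9          # "a" + 7 spaces + "b"
assert expanded[8] == "b"          # loc 8 is only meaningful in this view
```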
+
+        Note that the start and end locations are reported relative to the string
+        being parsed.  See L{I{parseString}<parseString>} for more information on parsing
+        strings with embedded tabs."""
+        if not self.streamlined:
+            self.streamline()
+        for e in self.ignoreExprs:
+            e.streamline()
+
+        if not self.keepTabs:
+            instring = _ustr(instring).expandtabs()
+        instrlen = len(instring)
+        loc = 0
+        preparseFn = self.preParse
+        parseFn = self._parse
+        ParserElement.resetCache()
+        matches = 0
+        try:
+            while loc <= instrlen and matches < maxMatches:
+                try:
+                    preloc = preparseFn( instring, loc )
+                    nextLoc,tokens = parseFn( instring, preloc, callPreParse=False )
+                except ParseException:
+                    loc = preloc+1
+                else:
+                    if nextLoc > loc:
+                        matches += 1
+                        yield tokens, preloc, nextLoc
+                        if overlap:
+                            nextloc = preparseFn( instring, loc )
+                            if nextloc > loc:
+                                loc = nextLoc
+                            else:
+                                loc += 1
+                        else:
+                            loc = nextLoc
+                    else:
+                        loc = preloc+1
+        except ParseBaseException as exc:
+            if ParserElement.verbose_stacktrace:
+                raise
+            else:
+                # catch and re-raise exception from here, clears out pyparsing internal stack trace
+                raise exc
+
+    def transformString( self, instring ):
+        """Extension to C{L{scanString}}, to modify matching text with modified tokens that may
+        be returned from a parse action.  To use C{transformString}, define a grammar and
+        attach a parse action to it that modifies the returned token list.
+        Invoking C{transformString()} on a target string will then scan for matches,
+        and replace the matched text patterns according to the logic in the parse
+        action.
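The scanString loop above (advance through the string, yield each match with its start and end, optionally re-scanning inside matches when `overlap` is set) can be sketched with a stdlib regex standing in for the parse expression; `scan_string` is an illustrative name, not pyparsing's API:

```python
import re

def scan_string(pattern, instring, overlap=False):
    """Yield (match_text, start, end) for each match, like scanString."""
    regex = re.compile(pattern)
    loc = 0
    while loc <= len(instring):
        m = regex.search(instring, loc)
        if not m:
            break
        yield m.group(), m.start(), m.end()
        if overlap:
            loc = m.start() + 1    # allow matches inside the previous one
        else:
            # guard against zero-width matches looping forever
            loc = m.end() if m.end() > m.start() else m.start() + 1

hits = list(scan_string(r"\d+", "ab12cd345"))
# hits == [("12", 2, 4), ("345", 6, 9)]
```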
+        C{transformString()} returns the resulting transformed string."""
+        out = []
+        lastE = 0
+        # force preservation of <TAB>s, to minimize unwanted transformation of string, and to
+        # keep string locs straight between transformString and scanString
+        self.keepTabs = True
+        try:
+            for t,s,e in self.scanString( instring ):
+                out.append( instring[lastE:s] )
+                if t:
+                    if isinstance(t,ParseResults):
+                        out += t.asList()
+                    elif isinstance(t,list):
+                        out += t
+                    else:
+                        out.append(t)
+                lastE = e
+            out.append(instring[lastE:])
+            out = [o for o in out if o]
+            return "".join(map(_ustr,_flatten(out)))
+        except ParseBaseException as exc:
+            if ParserElement.verbose_stacktrace:
+                raise
+            else:
+                # catch and re-raise exception from here, clears out pyparsing internal stack trace
+                raise exc
+
+    def searchString( self, instring, maxMatches=_MAX_INT ):
+        """Another extension to C{L{scanString}}, simplifying the access to the tokens found
+        to match the given parse expression.  May be called with optional
+        C{maxMatches} argument, to clip searching after 'n' matches are found.
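transformString above stitches the output together from the text between matches plus the (possibly rewritten) tokens. The same splice-and-rebuild pattern, stdlib-only, with a regex and a callback standing in for the grammar and parse action (`transform_string` is an illustrative name):

```python
import re

def transform_string(pattern, action, instring):
    """Replace each match with action(match_text), keeping the text
    between matches, in the spirit of transformString."""
    out = []
    last_e = 0
    for m in re.finditer(pattern, instring):
        out.append(instring[last_e:m.start()])   # text before the match
        out.append(action(m.group()))            # transformed match
        last_e = m.end()
    out.append(instring[last_e:])                # trailing text
    return "".join(out)

result = transform_string(r"\d+", lambda t: "<%s>" % t, "ab12cd345")
# result == "ab<12>cd<345>"
```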
+        """
+        try:
+            return ParseResults([ t for t,s,e in self.scanString( instring, maxMatches ) ])
+        except ParseBaseException as exc:
+            if ParserElement.verbose_stacktrace:
+                raise
+            else:
+                # catch and re-raise exception from here, clears out pyparsing internal stack trace
+                raise exc
+
+    def __add__(self, other ):
+        """Implementation of + operator - returns C{L{And}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return And( [ self, other ] )
+
+    def __radd__(self, other ):
+        """Implementation of + operator when left operand is not a C{L{ParserElement}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return other + self
+
+    def __sub__(self, other):
+        """Implementation of - operator, returns C{L{And}} with error stop"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return And( [ self, And._ErrorStop(), other ] )
+
+    def __rsub__(self, other ):
+        """Implementation of - operator when left operand is not a C{L{ParserElement}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return other - self
+
+    def __mul__(self,other):
+        """Implementation of * operator, allows use of C{expr * 3} in place of
+        C{expr + expr + expr}.  Expressions may also be multiplied by a 2-integer
+        tuple, similar to C{{min,max}} multipliers in regular expressions.  Tuples
+        may also include C{None} as in:
+         - C{expr*(n,None)} or C{expr*(n,)} is equivalent
+           to C{expr*n + L{ZeroOrMore}(expr)}
+           (read as "at least n instances of C{expr}")
+         - C{expr*(None,n)} is equivalent to C{expr*(0,n)}
+           (read as "0 to n instances of C{expr}")
+         - C{expr*(None,None)} is equivalent to C{L{ZeroOrMore}(expr)}
+         - C{expr*(1,None)} is equivalent to C{L{OneOrMore}(expr)}
+
+        Note that C{expr*(None,n)} does not raise an exception if
+        more than n exprs exist in the input stream; that is,
+        C{expr*(None,n)} does not enforce a maximum number of expr
+        occurrences.  If this behavior is desired, then write
+        C{expr*(None,n) + ~expr}
+        """
+        if isinstance(other,int):
+            minElements, optElements = other,0
+        elif isinstance(other,tuple):
+            other = (other + (None, None))[:2]
+            if other[0] is None:
+                other = (0, other[1])
+            if isinstance(other[0],int) and other[1] is None:
+                if other[0] == 0:
+                    return ZeroOrMore(self)
+                if other[0] == 1:
+                    return OneOrMore(self)
+                else:
+                    return self*other[0] + ZeroOrMore(self)
+            elif isinstance(other[0],int) and isinstance(other[1],int):
+                minElements, optElements = other
+                optElements -= minElements
+            else:
+                raise TypeError("cannot multiply 'ParserElement' and ('%s','%s') objects", type(other[0]),type(other[1]))
+        else:
+            raise TypeError("cannot multiply 'ParserElement' and '%s' objects", type(other))
+
+        if minElements < 0:
+            raise ValueError("cannot multiply ParserElement by negative value")
+        if optElements < 0:
+            raise ValueError("second tuple value must be greater or equal to first tuple value")
+        if minElements == optElements == 0:
+            raise ValueError("cannot multiply ParserElement by 0 or (0,0)")
+
+        if (optElements):
+            def makeOptionalList(n):
+                if n>1:
+                    return Optional(self + makeOptionalList(n-1))
+                else:
+                    return Optional(self)
+            if minElements:
+                if minElements == 1:
+                    ret = self + makeOptionalList(optElements)
+                else:
+                    ret = And([self]*minElements) + makeOptionalList(optElements)
+            else:
+                ret = makeOptionalList(optElements)
+        else:
+            if minElements == 1:
+                ret = self
+            else:
+                ret = And([self]*minElements)
+        return ret
+
+    def __rmul__(self, other):
+        return self.__mul__(other)
+
+    def __or__(self, other ):
+        """Implementation of | operator - returns C{L{MatchFirst}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return MatchFirst( [ self, other ] )
+
+    def __ror__(self, other ):
+        """Implementation of | operator when left operand is not a C{L{ParserElement}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return other | self
+
+    def __xor__(self, other ):
+        """Implementation of ^ operator - returns C{L{Or}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return Or( [ self, other ] )
+
+    def __rxor__(self, other ):
+        """Implementation of ^ operator when left operand is not a C{L{ParserElement}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return other ^ self
+
+    def __and__(self, other ):
+        """Implementation of & operator - returns C{L{Each}}"""
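The `__mul__` docstring's tuple equivalences (`(n, None)` means at least n, `(None, n)` means 0 to n, `(None, None)` means zero or more) map directly onto regex quantifiers, which gives a compact way to check the semantics. A hedged stdlib sketch; `times` is an illustrative helper, and unlike `expr*(None,n)` it *does* enforce the maximum because `fullmatch` anchors the pattern:

```python
import re

def times(pattern, spec):
    """Translate expr*n / expr*(min,max) multiplier semantics into a
    regex quantifier: (n, None) -> {n,}, (None, n) -> {0,n}."""
    if isinstance(spec, int):
        lo, hi = spec, spec
    else:
        lo, hi = (spec + (None, None))[:2]   # pad short tuples, like __mul__
    lo = 0 if lo is None else lo
    quant = "{%d,}" % lo if hi is None else "{%d,%d}" % (lo, hi)
    return "(?:%s)%s" % (pattern, quant)

assert re.fullmatch(times(r"ab", 2), "abab")                # exactly 2
assert re.fullmatch(times(r"ab", (1, None)), "ababab")      # one or more
assert re.fullmatch(times(r"ab", (None, 2)), "ababab") is None  # max 2
```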
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return Each( [ self, other ] )
+
+    def __rand__(self, other ):
+        """Implementation of & operator when left operand is not a C{L{ParserElement}}"""
+        if isinstance( other, basestring ):
+            other = ParserElement.literalStringClass( other )
+        if not isinstance( other, ParserElement ):
+            warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
+                    SyntaxWarning, stacklevel=2)
+            return None
+        return other & self
+
+    def __invert__( self ):
+        """Implementation of ~ operator - returns C{L{NotAny}}"""
+        return NotAny( self )
+
+    def __call__(self, name=None):
+        """Shortcut for C{L{setResultsName}}, with C{listAllMatches=default}::
+             userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno")
+        could be written as::
+             userdata = Word(alphas)("name") + Word(nums+"-")("socsecno")
+
+        If C{name} is given with a trailing C{'*'} character, then C{listAllMatches} will be
+        passed as C{True}.
+
+        If C{name} is omitted, same as calling C{L{copy}}.
+        """
+        if name is not None:
+            return self.setResultsName(name)
+        else:
+            return self.copy()
+
+    def suppress( self ):
+        """Suppresses the output of this C{ParserElement}; useful to keep punctuation from
+        cluttering up returned output.
+        """
+        return Suppress( self )
+
+    def leaveWhitespace( self ):
+        """Disables the skipping of whitespace before matching the characters in the
+        C{ParserElement}'s defined pattern.  This is normally only used internally by
+        the pyparsing module, but may be needed in some whitespace-sensitive grammars.
+        """
+        self.skipWhitespace = False
+        return self
+
+    def setWhitespaceChars( self, chars ):
+        """Overrides the default whitespace chars
+        """
+        self.skipWhitespace = True
+        self.whiteChars = chars
+        self.copyDefaultWhiteChars = False
+        return self
+
+    def parseWithTabs( self ):
+        """Overrides default behavior to expand C{<TAB>}s to spaces before parsing the input string.
+        Must be called before C{parseString} when the input grammar contains elements that
+        match C{<TAB>} characters."""
+        self.keepTabs = True
+        return self
+
+    def ignore( self, other ):
+        """Define expression to be ignored (e.g., comments) while doing pattern
+        matching; may be called repeatedly, to define multiple comment or other
+        ignorable patterns.
+        """
+        if isinstance( other, Suppress ):
+            if other not in self.ignoreExprs:
+                self.ignoreExprs.append( other.copy() )
+        else:
+            self.ignoreExprs.append( Suppress( other.copy() ) )
+        return self
+
+    def setDebugActions( self, startAction, successAction, exceptionAction ):
+        """Enable display of debugging messages while doing pattern matching."""
+        self.debugActions = (startAction or _defaultStartDebugAction,
+                             successAction or _defaultSuccessDebugAction,
+                             exceptionAction or _defaultExceptionDebugAction)
+        self.debug = True
+        return self
+
+    def setDebug( self, flag=True ):
+        """Enable display of debugging messages while doing pattern matching.
+        Set C{flag} to True to enable, False to disable."""
+        if flag:
+            self.setDebugActions( _defaultStartDebugAction, _defaultSuccessDebugAction, _defaultExceptionDebugAction )
+        else:
+            self.debug = False
+        return self
+
+    def __str__( self ):
+        return self.name
+
+    def __repr__( self ):
+        return _ustr(self)
+
+    def streamline( self ):
+        self.streamlined = True
+        self.strRepr = None
+        return self
+
+    def checkRecursion( self, parseElementList ):
+        pass
+
+    def validate( self, validateTrace=[] ):
+        """Check defined expressions for valid structure, check for infinite recursive definitions."""
+        self.checkRecursion( [] )
+
+    def parseFile( self, file_or_filename, parseAll=False ):
+        """Execute the parse expression on the given file or filename.
+        If a filename is specified (instead of a file object),
+        the entire file is opened, read, and closed before parsing.
+        """
+        try:
+            file_contents = file_or_filename.read()
+        except AttributeError:
+            f = open(file_or_filename, "r")
+            file_contents = f.read()
+            f.close()
+        try:
+            return self.parseString(file_contents, parseAll)
+        except ParseBaseException as exc:
+            if ParserElement.verbose_stacktrace:
+                raise
+            else:
+                # catch and re-raise exception from here, clears out pyparsing internal stack trace
+                raise exc
+
+    def __eq__(self,other):
+        if isinstance(other, ParserElement):
+            return self is other or self.__dict__ == other.__dict__
+        elif isinstance(other, basestring):
+            try:
+                self.parseString(_ustr(other), parseAll=True)
+                return True
+            except ParseBaseException:
+                return False
+        else:
+            return super(ParserElement,self)==other
+
+    def __ne__(self,other):
+        return not (self == other)
+
+    def __hash__(self):
+        return hash(id(self))
+
+    def __req__(self,other):
+        return self == other
+
+    def __rne__(self,other):
+        return not (self == other)
+
+
+class Token(ParserElement):
+    """Abstract C{ParserElement} subclass, for defining atomic matching patterns."""
+    def __init__( self ):
+        super(Token,self).__init__( savelist=False )
+
+    def setName(self, name):
+        s = super(Token,self).setName(name)
+        self.errmsg = "Expected " + self.name
+        return s
+
+
+class Empty(Token):
+    """An empty token, will always match."""
+    def __init__( self ):
+        super(Empty,self).__init__()
+        self.name = "Empty"
+        self.mayReturnEmpty = True
+        self.mayIndexError = False
+
+
+class NoMatch(Token):
+    """A token that will never match."""
+    def __init__( self ):
+        super(NoMatch,self).__init__()
+        self.name = "NoMatch"
+        self.mayReturnEmpty = True
+        self.mayIndexError = False
+        self.errmsg = "Unmatchable token"
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        raise ParseException(instring, loc, self.errmsg, self)
+
+
+class Literal(Token):
+    """Token to exactly match a specified string."""
+    def __init__( self, matchString ):
+        super(Literal,self).__init__()
+        self.match = matchString
+        self.matchLen = len(matchString)
+        try:
+            self.firstMatchChar = matchString[0]
+        except IndexError:
+            warnings.warn("null string passed to Literal; use Empty() instead",
+                            SyntaxWarning, stacklevel=2)
+            self.__class__ = Empty
+        self.name = '"%s"' % _ustr(self.match)
+        self.errmsg = "Expected " + self.name
+        self.mayReturnEmpty = False
+        self.mayIndexError = False
+
+    # Performance tuning: this routine gets called a *lot*
+    # if this is a single character match string and the first character matches,
+    # short-circuit as quickly as possible, and avoid calling startswith
+    #~ @profile
+    def parseImpl( self, instring, loc, doActions=True ):
+        if (instring[loc] == self.firstMatchChar and
+            (self.matchLen==1 or instring.startswith(self.match,loc)) ):
+            return loc+self.matchLen, self.match
+        raise ParseException(instring, loc, self.errmsg, self)
+_L = Literal
+ParserElement.literalStringClass = Literal
+
+class Keyword(Token):
+    """Token to exactly match a specified string as a keyword, that is, it must be
+    immediately followed by a non-keyword character.
+    Compare with C{L{Literal}}::
+      Literal("if") will match the leading C{'if'} in C{'ifAndOnlyIf'}.
+      Keyword("if") will not; it will only match the leading C{'if'} in C{'if x=1'}, or C{'if(y==2)'}
+    Accepts two optional constructor arguments in addition to the keyword string:
+    C{identChars} is a string of characters that would be valid identifier characters,
+    defaulting to all alphanumerics + "_" and "$"; C{caseless} allows case-insensitive
+    matching, default is C{False}.
+    """
+    DEFAULT_KEYWORD_CHARS = alphanums+"_$"
+
+    def __init__( self, matchString, identChars=DEFAULT_KEYWORD_CHARS, caseless=False ):
+        super(Keyword,self).__init__()
+        self.match = matchString
+        self.matchLen = len(matchString)
+        try:
+            self.firstMatchChar = matchString[0]
+        except IndexError:
+            warnings.warn("null string passed to Keyword; use Empty() instead",
+                            SyntaxWarning, stacklevel=2)
+        self.name = '"%s"' % self.match
+        self.errmsg = "Expected " + self.name
+        self.mayReturnEmpty = False
+        self.mayIndexError = False
+        self.caseless = caseless
+        if caseless:
+            self.caselessmatch = matchString.upper()
+            identChars = identChars.upper()
+        self.identChars = set(identChars)
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        if self.caseless:
+            if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and
+                 (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) and
+                 (loc == 0 or instring[loc-1].upper() not in self.identChars) ):
+                return loc+self.matchLen, self.match
+        else:
+            if (instring[loc] == self.firstMatchChar and
+                (self.matchLen==1 or instring.startswith(self.match,loc)) and
+                (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen] not in self.identChars) and
+                (loc == 0 or instring[loc-1] not in self.identChars) ):
+                return loc+self.matchLen, self.match
+        raise ParseException(instring, loc, self.errmsg, self)
+
+    def copy(self):
+        c = super(Keyword,self).copy()
+        c.identChars = Keyword.DEFAULT_KEYWORD_CHARS
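The boundary test in Keyword.parseImpl (match only when neither neighbor is a valid identifier character) can be exercised in isolation. A stdlib-only sketch mirroring that logic; `matches_keyword` and `KEYWORD_CHARS` are illustrative names:

```python
# Mirror of Keyword.parseImpl's boundary checks: the characters before
# and after the candidate match must not be identifier characters.
KEYWORD_CHARS = set(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$")

def matches_keyword(word, instring, loc=0):
    end = loc + len(word)
    if instring[loc:end] != word:
        return False
    if end < len(instring) and instring[end] in KEYWORD_CHARS:
        return False                   # "if" embedded in "ifAndOnlyIf"
    if loc > 0 and instring[loc - 1] in KEYWORD_CHARS:
        return False                   # preceded by an identifier char
    return True

assert matches_keyword("if", "if x=1")
assert matches_keyword("if", "if(y==2)")
assert not matches_keyword("if", "ifAndOnlyIf")
```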
+        return c
+
+    def setDefaultKeywordChars( chars ):
+        """Overrides the default Keyword chars
+        """
+        Keyword.DEFAULT_KEYWORD_CHARS = chars
+    setDefaultKeywordChars = staticmethod(setDefaultKeywordChars)
+
+class CaselessLiteral(Literal):
+    """Token to match a specified string, ignoring case of letters.
+    Note: the matched results will always be in the case of the given
+    match string, NOT the case of the input text.
+    """
+    def __init__( self, matchString ):
+        super(CaselessLiteral,self).__init__( matchString.upper() )
+        # Preserve the defining literal.
+        self.returnString = matchString
+        self.name = "'%s'" % self.returnString
+        self.errmsg = "Expected " + self.name
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        if instring[ loc:loc+self.matchLen ].upper() == self.match:
+            return loc+self.matchLen, self.returnString
+        raise ParseException(instring, loc, self.errmsg, self)
+
+class CaselessKeyword(Keyword):
+    def __init__( self, matchString, identChars=Keyword.DEFAULT_KEYWORD_CHARS ):
+        super(CaselessKeyword,self).__init__( matchString, identChars, caseless=True )
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and
+             (loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) ):
+            return loc+self.matchLen, self.match
+        raise ParseException(instring, loc, self.errmsg, self)
+
+class Word(Token):
+    """Token for matching words composed of allowed character sets.
+    Defined with string containing all allowed initial characters,
+    an optional string containing allowed body characters (if omitted,
+    defaults to the initial character set), and an optional minimum,
+    maximum, and/or exact length.  The default value for C{min} is 1 (a
+    minimum value < 1 is not valid); the default values for C{max} and C{exact}
+    are 0, meaning no maximum or exact length restriction.
+    An optional
+    C{excludeChars} parameter can list characters that might be found in
+    the input C{bodyChars} string; useful to define a word of all printables
+    except for one or two characters, for instance.
+    """
+    def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False, excludeChars=None ):
+        super(Word,self).__init__()
+        if excludeChars:
+            initChars = ''.join(c for c in initChars if c not in excludeChars)
+            if bodyChars:
+                bodyChars = ''.join(c for c in bodyChars if c not in excludeChars)
+        self.initCharsOrig = initChars
+        self.initChars = set(initChars)
+        if bodyChars :
+            self.bodyCharsOrig = bodyChars
+            self.bodyChars = set(bodyChars)
+        else:
+            self.bodyCharsOrig = initChars
+            self.bodyChars = set(initChars)
+
+        self.maxSpecified = max > 0
+
+        if min < 1:
+            raise ValueError("cannot specify a minimum length < 1; use Optional(Word()) if zero-length word is permitted")
+
+        self.minLen = min
+
+        if max > 0:
+            self.maxLen = max
+        else:
+            self.maxLen = _MAX_INT
+
+        if exact > 0:
+            self.maxLen = exact
+            self.minLen = exact
+
+        self.name = _ustr(self)
+        self.errmsg = "Expected " + self.name
+        self.mayIndexError = False
+        self.asKeyword = asKeyword
+
+        if ' ' not in self.initCharsOrig+self.bodyCharsOrig and (min==1 and max==0 and exact==0):
+            if self.bodyCharsOrig == self.initCharsOrig:
+                self.reString = "[%s]+" % _escapeRegexRangeChars(self.initCharsOrig)
+            elif len(self.bodyCharsOrig) == 1:
+                self.reString = "%s[%s]*" % \
+                                      (re.escape(self.initCharsOrig),
+                                      _escapeRegexRangeChars(self.bodyCharsOrig),)
+            else:
+                self.reString = "[%s][%s]*" % \
+                                      (_escapeRegexRangeChars(self.initCharsOrig),
+                                      _escapeRegexRangeChars(self.bodyCharsOrig),)
+            if self.asKeyword:
+                self.reString = r"\b"+self.reString+r"\b"
+            try:
+                self.re = re.compile( self.reString )
+            except:
+                self.re = None
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        if self.re:
+            result = self.re.match(instring,loc)
+            if not result:
+                raise ParseException(instring, loc,
+                                     self.errmsg, self)
+
+            loc = result.end()
+            return loc, result.group()
+
+        if not(instring[ loc ] in self.initChars):
+            raise ParseException(instring, loc, self.errmsg, self)
+
+        start = loc
+        loc += 1
+        instrlen = len(instring)
+        bodychars = self.bodyChars
+        maxloc = start + self.maxLen
+        maxloc = min( maxloc, instrlen )
+        while loc < maxloc and instring[loc] in bodychars:
+            loc += 1
+
+        throwException = False
+        if loc - start < self.minLen:
+            throwException = True
+        if self.maxSpecified and loc < instrlen and instring[loc] in bodychars:
+            throwException = True
+        if self.asKeyword:
+            if (start>0 and instring[start-1] in bodychars) or (loc<instrlen and instring[loc] in bodychars):
+                throwException = True
+
+        if throwException:
+            raise ParseException(instring, loc, self.errmsg, self)
+
+        return loc, instring[start:loc]
+
+    def __str__( self ):
+        try:
+            return super(Word,self).__str__()
+        except:
+            pass
+
+        if self.strRepr is None:
+
+            def charsAsStr(s):
+                if len(s)>4:
+                    return s[:4]+"..."
+                else:
+                    return s
+
+            if ( self.initCharsOrig != self.bodyCharsOrig ):
+                self.strRepr = "W:(%s,%s)" % ( charsAsStr(self.initCharsOrig), charsAsStr(self.bodyCharsOrig) )
+            else:
+                self.strRepr = "W:(%s)" % charsAsStr(self.initCharsOrig)
+
+        return self.strRepr
+
+
+class Regex(Token):
+    """Token for matching strings that match a given regular expression.
+    Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module.
+    """
+    compiledREtype = type(re.compile("[A-Z]"))
+    def __init__( self, pattern, flags=0):
+        """The parameters C{pattern} and C{flags} are passed to the C{re.compile()} function as-is.
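Word's fast path above precompiles an `[initChars][bodyChars]*` regular expression when no length limits are given. The construction can be sketched with the stdlib alone; `word_pattern` is an illustrative name standing in for the `reString` assembly:

```python
import re

def word_pattern(init_chars, body_chars=None):
    """Build the "[init][body]*" regular expression that Word precompiles
    when min/max/exact are left at their defaults."""
    body_chars = body_chars or init_chars
    if body_chars == init_chars:
        return "[%s]+" % re.escape(init_chars)
    return "[%s][%s]*" % (re.escape(init_chars), re.escape(body_chars))

ident = re.compile(word_pattern("abcdefghijklmnopqrstuvwxyz_",
                                "abcdefghijklmnopqrstuvwxyz0123456789_"))
m = ident.match("my_var2 = 3")
# m.group() == "my_var2"
```

pyparsing uses its own `_escapeRegexRangeChars` rather than `re.escape`, since it escapes specifically for character-class context; `re.escape` is a close stdlib stand-in for this sketch.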
+        See the Python C{re} module for an explanation of the acceptable patterns and flags."""
+        super(Regex,self).__init__()
+
+        if isinstance(pattern, basestring):
+            if len(pattern) == 0:
+                warnings.warn("null string passed to Regex; use Empty() instead",
+                        SyntaxWarning, stacklevel=2)
+
+            self.pattern = pattern
+            self.flags = flags
+
+            try:
+                self.re = re.compile(self.pattern, self.flags)
+                self.reString = self.pattern
+            except sre_constants.error:
+                warnings.warn("invalid pattern (%s) passed to Regex" % pattern,
+                    SyntaxWarning, stacklevel=2)
+                raise
+
+        elif isinstance(pattern, Regex.compiledREtype):
+            self.re = pattern
+            self.pattern = \
+            self.reString = str(pattern)
+            self.flags = flags
+
+        else:
+            raise ValueError("Regex may only be constructed with a string or a compiled RE object")
+
+        self.name = _ustr(self)
+        self.errmsg = "Expected " + self.name
+        self.mayIndexError = False
+        self.mayReturnEmpty = True
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        result = self.re.match(instring,loc)
+        if not result:
+            raise ParseException(instring, loc, self.errmsg, self)
+
+        loc = result.end()
+        d = result.groupdict()
+        ret = ParseResults(result.group())
+        if d:
+            for k in d:
+                ret[k] = d[k]
+        return loc,ret
+
+    def __str__( self ):
+        try:
+            return super(Regex,self).__str__()
+        except:
+            pass
+
+        if self.strRepr is None:
+            self.strRepr = "Re:(%s)" % repr(self.pattern)
+
+        return self.strRepr
+
+
+class QuotedString(Token):
+    """Token for matching strings that are delimited by quoting characters.
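Regex.parseImpl above copies each entry of `groupdict()` into the results, which is what makes named groups addressable on the returned ParseResults. The underlying stdlib behavior it builds on:

```python
import re

# Named groups in the pattern surface through groupdict(); this is the
# mapping Regex.parseImpl copies into its ParseResults.
date = re.compile(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})")
m = date.match("2016-11-04 21:28")

assert m.groupdict() == {"year": "2016", "month": "11", "day": "04"}
assert m.end() == 10   # parseImpl advances loc to result.end()
```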
+    """
+    def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unquoteResults=True, endQuoteChar=None):
+        """
+        Defined with the following parameters:
+         - quoteChar - string of one or more characters defining the quote delimiting string
+         - escChar - character to escape quotes, typically backslash (default=None)
+         - escQuote - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=None)
+         - multiline - boolean indicating whether quotes can span multiple lines (default=C{False})
+         - unquoteResults - boolean indicating whether the matched text should be unquoted (default=C{True})
+         - endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=C{None} => same as quoteChar)
+        """
+        super(QuotedString,self).__init__()
+
+        # remove white space from quote chars - won't work anyway
+        quoteChar = quoteChar.strip()
+        if len(quoteChar) == 0:
+            warnings.warn("quoteChar cannot be the empty string",SyntaxWarning,stacklevel=2)
+            raise SyntaxError()
+
+        if endQuoteChar is None:
+            endQuoteChar = quoteChar
+        else:
+            endQuoteChar = endQuoteChar.strip()
+            if len(endQuoteChar) == 0:
+                warnings.warn("endQuoteChar cannot be the empty string",SyntaxWarning,stacklevel=2)
+                raise SyntaxError()
+
+        self.quoteChar = quoteChar
+        self.quoteCharLen = len(quoteChar)
+        self.firstQuoteChar = quoteChar[0]
+        self.endQuoteChar = endQuoteChar
+        self.endQuoteCharLen = len(endQuoteChar)
+        self.escChar = escChar
+        self.escQuote = escQuote
+        self.unquoteResults = unquoteResults
+
+        if multiline:
+            self.flags = re.MULTILINE | re.DOTALL
+            self.pattern = r'%s(?:[^%s%s]' % \
+                ( re.escape(self.quoteChar),
+                  _escapeRegexRangeChars(self.endQuoteChar[0]),
+                  (escChar is not None and _escapeRegexRangeChars(escChar) or '') )
+        else:
+            self.flags = 0
+            self.pattern = r'%s(?:[^%s\n\r%s]' % \
+                ( re.escape(self.quoteChar),
+                  _escapeRegexRangeChars(self.endQuoteChar[0]),
+                  (escChar is not None and
_escapeRegexRangeChars(escChar) or '') ) + if len(self.endQuoteChar) > 1: + self.pattern += ( + '|(?:' + ')|(?:'.join("%s[^%s]" % (re.escape(self.endQuoteChar[:i]), + _escapeRegexRangeChars(self.endQuoteChar[i])) + for i in range(len(self.endQuoteChar)-1,0,-1)) + ')' + ) + if escQuote: + self.pattern += (r'|(?:%s)' % re.escape(escQuote)) + if escChar: + self.pattern += (r'|(?:%s.)' % re.escape(escChar)) + self.escCharReplacePattern = re.escape(self.escChar)+"(.)" + self.pattern += (r')*%s' % re.escape(self.endQuoteChar)) + + try: + self.re = re.compile(self.pattern, self.flags) + self.reString = self.pattern + except sre_constants.error: + warnings.warn("invalid pattern (%s) passed to Regex" % self.pattern, + SyntaxWarning, stacklevel=2) + raise + + self.name = _ustr(self) + self.errmsg = "Expected " + self.name + self.mayIndexError = False + self.mayReturnEmpty = True + + def parseImpl( self, instring, loc, doActions=True ): + result = instring[loc] == self.firstQuoteChar and self.re.match(instring,loc) or None + if not result: + raise ParseException(instring, loc, self.errmsg, self) + + loc = result.end() + ret = result.group() + + if self.unquoteResults: + + # strip off quotes + ret = ret[self.quoteCharLen:-self.endQuoteCharLen] + + if isinstance(ret,basestring): + # replace escaped characters + if self.escChar: + ret = re.sub(self.escCharReplacePattern,"\g<1>",ret) + + # replace escaped quotes + if self.escQuote: + ret = ret.replace(self.escQuote, self.endQuoteChar) + + return loc, ret + + def __str__( self ): + try: + return super(QuotedString,self).__str__() + except: + pass + + if self.strRepr is None: + self.strRepr = "quoted string, starting with %s ending with %s" % (self.quoteChar, self.endQuoteChar) + + return self.strRepr + + +class CharsNotIn(Token): + """Token for matching words composed of characters *not* in a given set. + Defined with string containing all disallowed characters, and an optional + minimum, maximum, and/or exact length. 
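The escape-aware pattern that C{QuotedString} assembles above can be exercised with the stdlib C{re} module alone. This is a minimal sketch, not part of the patch, and C{unquote} is a hypothetical helper name mirroring the C{unquoteResults} path of C{parseImpl}:

```python
import re

# Same pattern shape as QuotedString's non-multiline case, for a
# single-quote delimiter with backslash escapes:
# quote, then (plain char | escaped char)*, then the closing quote.
pattern = re.compile(r"'(?:[^'\n\r\\]|(?:\\.))*'")

def unquote(matched):
    # Strip the delimiters, then collapse escape sequences, as
    # QuotedString.parseImpl does when unquoteResults is True.
    return re.sub(r"\\(.)", r"\1", matched[1:-1])

m = pattern.match(r"'it\'s quoted' trailing text")
print(unquote(m.group()))  # it's quoted
```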
The default value for C{min} is 1 (a + minimum value < 1 is not valid); the default values for C{max} and C{exact} + are 0, meaning no maximum or exact length restriction. + """ + def __init__( self, notChars, min=1, max=0, exact=0 ): + super(CharsNotIn,self).__init__() + self.skipWhitespace = False + self.notChars = notChars + + if min < 1: + raise ValueError("cannot specify a minimum length < 1; use Optional(CharsNotIn()) if zero-length char group is permitted") + + self.minLen = min + + if max > 0: + self.maxLen = max + else: + self.maxLen = _MAX_INT + + if exact > 0: + self.maxLen = exact + self.minLen = exact + + self.name = _ustr(self) + self.errmsg = "Expected " + self.name + self.mayReturnEmpty = ( self.minLen == 0 ) + self.mayIndexError = False + + def parseImpl( self, instring, loc, doActions=True ): + if instring[loc] in self.notChars: + raise ParseException(instring, loc, self.errmsg, self) + + start = loc + loc += 1 + notchars = self.notChars + maxlen = min( start+self.maxLen, len(instring) ) + while loc < maxlen and \ + (instring[loc] not in notchars): + loc += 1 + + if loc - start < self.minLen: + raise ParseException(instring, loc, self.errmsg, self) + + return loc, instring[start:loc] + + def __str__( self ): + try: + return super(CharsNotIn, self).__str__() + except: + pass + + if self.strRepr is None: + if len(self.notChars) > 4: + self.strRepr = "!W:(%s...)" % self.notChars[:4] + else: + self.strRepr = "!W:(%s)" % self.notChars + + return self.strRepr + +class White(Token): + """Special matching class for matching whitespace. Normally, whitespace is ignored + by pyparsing grammars. This class is included when some whitespace structures + are significant. Define with a string containing the whitespace characters to be + matched; default is C{" \\t\\r\\n"}. 
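The character-scanning loop at the core of C{CharsNotIn.parseImpl} above is easy to see in isolation. A minimal stdlib sketch, not part of the patch; C{scan_not_in} is a hypothetical name and C{ValueError} stands in for C{ParseException}:

```python
# Scan forward from loc, consuming characters not in notchars, honoring
# the same min/max length rules as CharsNotIn.parseImpl.
def scan_not_in(instring, loc, notchars, minlen=1, maxlen=2**31):
    if loc >= len(instring) or instring[loc] in notchars:
        raise ValueError("expected char not in %r at %d" % (notchars, loc))
    start = loc
    loc += 1
    end = min(start + maxlen, len(instring))
    while loc < end and instring[loc] not in notchars:
        loc += 1
    if loc - start < minlen:
        raise ValueError("match too short")
    return loc, instring[start:loc]

print(scan_not_in("key=value", 0, "="))  # (3, 'key')
```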
Also takes optional C{min}, C{max}, and C{exact} arguments, + as defined for the C{L{Word}} class.""" + whiteStrs = { + " " : "<SPC>", + "\t": "<TAB>", + "\n": "<LF>", + "\r": "<CR>", + "\f": "<FF>", + } + def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0): + super(White,self).__init__() + self.matchWhite = ws + self.setWhitespaceChars( "".join(c for c in self.whiteChars if c not in self.matchWhite) ) + #~ self.leaveWhitespace() + self.name = ("".join(White.whiteStrs[c] for c in self.matchWhite)) + self.mayReturnEmpty = True + self.errmsg = "Expected " + self.name + + self.minLen = min + + if max > 0: + self.maxLen = max + else: + self.maxLen = _MAX_INT + + if exact > 0: + self.maxLen = exact + self.minLen = exact + + def parseImpl( self, instring, loc, doActions=True ): + if not(instring[ loc ] in self.matchWhite): + raise ParseException(instring, loc, self.errmsg, self) + start = loc + loc += 1 + maxloc = start + self.maxLen + maxloc = min( maxloc, len(instring) ) + while loc < maxloc and instring[loc] in self.matchWhite: + loc += 1 + + if loc - start < self.minLen: + raise ParseException(instring, loc, self.errmsg, self) + + return loc, instring[start:loc] + + +class _PositionToken(Token): + def __init__( self ): + super(_PositionToken,self).__init__() + self.name=self.__class__.__name__ + self.mayReturnEmpty = True + self.mayIndexError = False + +class GoToColumn(_PositionToken): + """Token to advance to a specific column of input text; useful for tabular report scraping.""" + def __init__( self, colno ): + super(GoToColumn,self).__init__() + self.col = colno + + def preParse( self, instring, loc ): + if col(loc,instring) != self.col: + instrlen = len(instring) + if self.ignoreExprs: + loc = self._skipIgnorables( instring, loc ) + while loc < instrlen and instring[loc].isspace() and col( loc, instring ) != self.col : + loc += 1 + return loc + + def parseImpl( self, instring, loc, doActions=True ): + thiscol = col( loc, instring ) + if thiscol > self.col: + raise ParseException(
instring, loc, "Text not in expected column", self ) + newloc = loc + self.col - thiscol + ret = instring[ loc: newloc ] + return newloc, ret + +class LineStart(_PositionToken): + """Matches if current position is at the beginning of a line within the parse string""" + def __init__( self ): + super(LineStart,self).__init__() + self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) + self.errmsg = "Expected start of line" + + def preParse( self, instring, loc ): + preloc = super(LineStart,self).preParse(instring,loc) + if instring[preloc] == "\n": + loc += 1 + return loc + + def parseImpl( self, instring, loc, doActions=True ): + if not( loc==0 or + (loc == self.preParse( instring, 0 )) or + (instring[loc-1] == "\n") ): #col(loc, instring) != 1: + raise ParseException(instring, loc, self.errmsg, self) + return loc, [] + +class LineEnd(_PositionToken): + """Matches if current position is at the end of a line within the parse string""" + def __init__( self ): + super(LineEnd,self).__init__() + self.setWhitespaceChars( ParserElement.DEFAULT_WHITE_CHARS.replace("\n","") ) + self.errmsg = "Expected end of line" + + def parseImpl( self, instring, loc, doActions=True ): + if loc<len(instring): + if instring[loc] == "\n": + return loc+1, "\n" + else: + raise ParseException(instring, loc, self.errmsg, self) + elif loc == len(instring): + return loc+1, [] + else: + raise ParseException(instring, loc, self.errmsg, self) + +class WordStart(_PositionToken): + """Matches if the current position is at the beginning of a Word, and + is not preceded by any character in a given set of C{wordChars} + (default=C{printables}). To emulate the C{\b} behavior of regular expressions, + use C{WordStart(alphanums)}. C{WordStart} will also match at the beginning of + the string being parsed, or at the beginning of a line.
+ """ + def __init__(self, wordChars = printables): + super(WordStart,self).__init__() + self.wordChars = set(wordChars) + self.errmsg = "Not at the start of a word" + + def parseImpl(self, instring, loc, doActions=True ): + if loc != 0: + if (instring[loc-1] in self.wordChars or + instring[loc] not in self.wordChars): + raise ParseException(instring, loc, self.errmsg, self) + return loc, [] + +class WordEnd(_PositionToken): + """Matches if the current position is at the end of a Word, and + is not followed by any character in a given set of C{wordChars} + (default=C{printables}). To emulate the C{\b} behavior of regular expressions, + use C{WordEnd(alphanums)}. C{WordEnd} will also match at the end of + the string being parsed, or at the end of a line. + """ + def __init__(self, wordChars = printables): + super(WordEnd,self).__init__() + self.wordChars = set(wordChars) + self.skipWhitespace = False + self.errmsg = "Not at the end of a word" + + def parseImpl(self, instring, loc, doActions=True ): + instrlen = len(instring) + if instrlen>0 and loc<instrlen: + if (instring[loc] in self.wordChars or + instring[loc-1] not in self.wordChars): + raise ParseException(instring, loc, self.errmsg, self) + return loc, [] + +class Or(ParseExpression): + """Requires that at least one C{ParseExpression} is found. + If two expressions match, the expression that matches the longest string will be used. + May be constructed using the C{'^'} operator. + """ + def __init__( self, exprs, savelist = False ): + super(Or,self).__init__(exprs, savelist) + if self.exprs: + self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs) + else: + self.mayReturnEmpty = True + + def parseImpl( self, instring, loc, doActions=True ): + maxExcLoc = -1 + maxMatchLoc = -1 + maxException = None + for e in self.exprs: + try: + loc2 = e.tryParse( instring, loc ) + except ParseException as err: + if err.loc > maxExcLoc: + maxException = err + maxExcLoc = err.loc + except IndexError: + if len(instring) > maxExcLoc: + maxException = ParseException(instring,len(instring),e.errmsg,self) + maxExcLoc = len(instring) + else: + if loc2 > maxMatchLoc: + maxMatchLoc = loc2 + maxMatchExp = e + + if maxMatchLoc < 0: + if maxException is not None: + raise maxException + else: + raise ParseException(instring, loc, "no defined alternatives to match", self) + + return maxMatchExp._parse( instring, loc, doActions ) + + def __ixor__(self, other ): + if isinstance( other, basestring ): + other = ParserElement.literalStringClass( other ) + return self.append( other ) #Or( [ self, other ] ) + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + if self.strRepr is None: + self.strRepr = "{" + " ^ ".join(_ustr(e) for e in self.exprs) + "}" + + return self.strRepr + + def checkRecursion( self, parseElementList ): + subRecCheckList
= parseElementList[:] + [ self ] + for e in self.exprs: + e.checkRecursion( subRecCheckList ) + + +class MatchFirst(ParseExpression): + """Requires that at least one C{ParseExpression} is found. + If two expressions match, the first one listed is the one that will match. + May be constructed using the C{'|'} operator. + """ + def __init__( self, exprs, savelist = False ): + super(MatchFirst,self).__init__(exprs, savelist) + if self.exprs: + self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs) + else: + self.mayReturnEmpty = True + + def parseImpl( self, instring, loc, doActions=True ): + maxExcLoc = -1 + maxException = None + for e in self.exprs: + try: + ret = e._parse( instring, loc, doActions ) + return ret + except ParseException as err: + if err.loc > maxExcLoc: + maxException = err + maxExcLoc = err.loc + except IndexError: + if len(instring) > maxExcLoc: + maxException = ParseException(instring,len(instring),e.errmsg,self) + maxExcLoc = len(instring) + + # only got here if no expression matched, raise exception for match that made it the furthest + else: + if maxException is not None: + raise maxException + else: + raise ParseException(instring, loc, "no defined alternatives to match", self) + + def __ior__(self, other ): + if isinstance( other, basestring ): + other = ParserElement.literalStringClass( other ) + return self.append( other ) #MatchFirst( [ self, other ] ) + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + if self.strRepr is None: + self.strRepr = "{" + " | ".join(_ustr(e) for e in self.exprs) + "}" + + return self.strRepr + + def checkRecursion( self, parseElementList ): + subRecCheckList = parseElementList[:] + [ self ] + for e in self.exprs: + e.checkRecursion( subRecCheckList ) + + +class Each(ParseExpression): + """Requires all given C{ParseExpression}s to be found, but in any order. + Expressions may be separated by whitespace. + May be constructed using the C{'&'} operator. 
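The behavioral difference between C{Or} (longest match, the '^' operator) and C{MatchFirst} (first listed match, the '|' operator) can be sketched over plain string alternatives. This illustration is separate from the patch; C{first_match} and C{longest_match} are hypothetical helper names:

```python
# Two ways to resolve alternatives at a position, mirroring
# MatchFirst.parseImpl (return on first hit) and Or.parseImpl
# (track the hit that consumed the most input).
alternatives = ["in", "int"]

def first_match(s, loc):
    for alt in alternatives:
        if s.startswith(alt, loc):
            return alt
    raise ValueError("no defined alternatives to match")

def longest_match(s, loc):
    hits = [alt for alt in alternatives if s.startswith(alt, loc)]
    if not hits:
        raise ValueError("no defined alternatives to match")
    return max(hits, key=len)

print(first_match("integer", 0))    # in  (MatchFirst / '|' behavior)
print(longest_match("integer", 0))  # int (Or / '^' behavior)
```

This is why pyparsing grammars using '|' must list longer alternatives first, while '^' tolerates any ordering at the cost of trying every alternative.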
+ """ + def __init__( self, exprs, savelist = True ): + super(Each,self).__init__(exprs, savelist) + self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs) + self.skipWhitespace = True + self.initExprGroups = True + + def parseImpl( self, instring, loc, doActions=True ): + if self.initExprGroups: + opt1 = [ e.expr for e in self.exprs if isinstance(e,Optional) ] + opt2 = [ e for e in self.exprs if e.mayReturnEmpty and e not in opt1 ] + self.optionals = opt1 + opt2 + self.multioptionals = [ e.expr for e in self.exprs if isinstance(e,ZeroOrMore) ] + self.multirequired = [ e.expr for e in self.exprs if isinstance(e,OneOrMore) ] + self.required = [ e for e in self.exprs if not isinstance(e,(Optional,ZeroOrMore,OneOrMore)) ] + self.required += self.multirequired + self.initExprGroups = False + tmpLoc = loc + tmpReqd = self.required[:] + tmpOpt = self.optionals[:] + matchOrder = [] + + keepMatching = True + while keepMatching: + tmpExprs = tmpReqd + tmpOpt + self.multioptionals + self.multirequired + failed = [] + for e in tmpExprs: + try: + tmpLoc = e.tryParse( instring, tmpLoc ) + except ParseException: + failed.append(e) + else: + matchOrder.append(e) + if e in tmpReqd: + tmpReqd.remove(e) + elif e in tmpOpt: + tmpOpt.remove(e) + if len(failed) == len(tmpExprs): + keepMatching = False + + if tmpReqd: + missing = ", ".join(_ustr(e) for e in tmpReqd) + raise ParseException(instring,loc,"Missing one or more required elements (%s)" % missing ) + + # add any unmatched Optionals, in case they have default values defined + matchOrder += [e for e in self.exprs if isinstance(e,Optional) and e.expr in tmpOpt] + + resultlist = [] + for e in matchOrder: + loc,results = e._parse(instring,loc,doActions) + resultlist.append(results) + + finalResults = ParseResults([]) + for r in resultlist: + dups = {} + for k in r.keys(): + if k in finalResults: + tmp = ParseResults(finalResults[k]) + tmp += ParseResults(r[k]) + dups[k] = tmp + finalResults += ParseResults(r) + for k,v in 
dups.items(): + finalResults[k] = v + return loc, finalResults + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + if self.strRepr is None: + self.strRepr = "{" + " & ".join(_ustr(e) for e in self.exprs) + "}" + + return self.strRepr + + def checkRecursion( self, parseElementList ): + subRecCheckList = parseElementList[:] + [ self ] + for e in self.exprs: + e.checkRecursion( subRecCheckList ) + + +class ParseElementEnhance(ParserElement): + """Abstract subclass of C{ParserElement}, for combining and post-processing parsed tokens.""" + def __init__( self, expr, savelist=False ): + super(ParseElementEnhance,self).__init__(savelist) + if isinstance( expr, basestring ): + expr = Literal(expr) + self.expr = expr + self.strRepr = None + if expr is not None: + self.mayIndexError = expr.mayIndexError + self.mayReturnEmpty = expr.mayReturnEmpty + self.setWhitespaceChars( expr.whiteChars ) + self.skipWhitespace = expr.skipWhitespace + self.saveAsList = expr.saveAsList + self.callPreparse = expr.callPreparse + self.ignoreExprs.extend(expr.ignoreExprs) + + def parseImpl( self, instring, loc, doActions=True ): + if self.expr is not None: + return self.expr._parse( instring, loc, doActions, callPreParse=False ) + else: + raise ParseException("",loc,self.errmsg,self) + + def leaveWhitespace( self ): + self.skipWhitespace = False + self.expr = self.expr.copy() + if self.expr is not None: + self.expr.leaveWhitespace() + return self + + def ignore( self, other ): + if isinstance( other, Suppress ): + if other not in self.ignoreExprs: + super( ParseElementEnhance, self).ignore( other ) + if self.expr is not None: + self.expr.ignore( self.ignoreExprs[-1] ) + else: + super( ParseElementEnhance, self).ignore( other ) + if self.expr is not None: + self.expr.ignore( self.ignoreExprs[-1] ) + return self + + def streamline( self ): + super(ParseElementEnhance,self).streamline() + if self.expr is not None: + self.expr.streamline() + return self + + def checkRecursion( 
self, parseElementList ): + if self in parseElementList: + raise RecursiveGrammarException( parseElementList+[self] ) + subRecCheckList = parseElementList[:] + [ self ] + if self.expr is not None: + self.expr.checkRecursion( subRecCheckList ) + + def validate( self, validateTrace=[] ): + tmp = validateTrace[:]+[self] + if self.expr is not None: + self.expr.validate(tmp) + self.checkRecursion( [] ) + + def __str__( self ): + try: + return super(ParseElementEnhance,self).__str__() + except: + pass + + if self.strRepr is None and self.expr is not None: + self.strRepr = "%s:(%s)" % ( self.__class__.__name__, _ustr(self.expr) ) + return self.strRepr + + +class FollowedBy(ParseElementEnhance): + """Lookahead matching of the given parse expression. C{FollowedBy} + does *not* advance the parsing position within the input string, it only + verifies that the specified parse expression matches at the current + position. C{FollowedBy} always returns a null token list.""" + def __init__( self, expr ): + super(FollowedBy,self).__init__(expr) + self.mayReturnEmpty = True + + def parseImpl( self, instring, loc, doActions=True ): + self.expr.tryParse( instring, loc ) + return loc, [] + + +class NotAny(ParseElementEnhance): + """Lookahead to disallow matching with the given parse expression. C{NotAny} + does *not* advance the parsing position within the input string, it only + verifies that the specified parse expression does *not* match at the current + position. Also, C{NotAny} does *not* skip over leading whitespace. C{NotAny} + always returns a null token list. 
May be constructed using the '~' operator.""" + def __init__( self, expr ): + super(NotAny,self).__init__(expr) + #~ self.leaveWhitespace() + self.skipWhitespace = False # do NOT use self.leaveWhitespace(), don't want to propagate to exprs + self.mayReturnEmpty = True + self.errmsg = "Found unwanted token, "+_ustr(self.expr) + + def parseImpl( self, instring, loc, doActions=True ): + try: + self.expr.tryParse( instring, loc ) + except (ParseException,IndexError): + pass + else: + raise ParseException(instring, loc, self.errmsg, self) + return loc, [] + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + if self.strRepr is None: + self.strRepr = "~{" + _ustr(self.expr) + "}" + + return self.strRepr + + +class ZeroOrMore(ParseElementEnhance): + """Optional repetition of zero or more of the given expression.""" + def __init__( self, expr ): + super(ZeroOrMore,self).__init__(expr) + self.mayReturnEmpty = True + + def parseImpl( self, instring, loc, doActions=True ): + tokens = [] + try: + loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) + hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) + while 1: + if hasIgnoreExprs: + preloc = self._skipIgnorables( instring, loc ) + else: + preloc = loc + loc, tmptokens = self.expr._parse( instring, preloc, doActions ) + if tmptokens or tmptokens.haskeys(): + tokens += tmptokens + except (ParseException,IndexError): + pass + + return loc, tokens + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + if self.strRepr is None: + self.strRepr = "[" + _ustr(self.expr) + "]..." 
+ + return self.strRepr + + def setResultsName( self, name, listAllMatches=False ): + ret = super(ZeroOrMore,self).setResultsName(name,listAllMatches) + ret.saveAsList = True + return ret + + +class OneOrMore(ParseElementEnhance): + """Repetition of one or more of the given expression.""" + def parseImpl( self, instring, loc, doActions=True ): + # must be at least one + loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) + try: + hasIgnoreExprs = ( len(self.ignoreExprs) > 0 ) + while 1: + if hasIgnoreExprs: + preloc = self._skipIgnorables( instring, loc ) + else: + preloc = loc + loc, tmptokens = self.expr._parse( instring, preloc, doActions ) + if tmptokens or tmptokens.haskeys(): + tokens += tmptokens + except (ParseException,IndexError): + pass + + return loc, tokens + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + if self.strRepr is None: + self.strRepr = "{" + _ustr(self.expr) + "}..." + + return self.strRepr + + def setResultsName( self, name, listAllMatches=False ): + ret = super(OneOrMore,self).setResultsName(name,listAllMatches) + ret.saveAsList = True + return ret + +class _NullToken(object): + def __bool__(self): + return False + __nonzero__ = __bool__ + def __str__(self): + return "" + +_optionalNotMatched = _NullToken() +class Optional(ParseElementEnhance): + """Optional matching of the given expression. + A default return string can also be specified, if the optional expression + is not found. 
+ """ + def __init__( self, expr, default=_optionalNotMatched ): + super(Optional,self).__init__( expr, savelist=False ) + self.defaultValue = default + self.mayReturnEmpty = True + + def parseImpl( self, instring, loc, doActions=True ): + try: + loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False ) + except (ParseException,IndexError): + if self.defaultValue is not _optionalNotMatched: + if self.expr.resultsName: + tokens = ParseResults([ self.defaultValue ]) + tokens[self.expr.resultsName] = self.defaultValue + else: + tokens = [ self.defaultValue ] + else: + tokens = [] + return loc, tokens + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + if self.strRepr is None: + self.strRepr = "[" + _ustr(self.expr) + "]" + + return self.strRepr + + +class SkipTo(ParseElementEnhance): + """Token for skipping over all undefined text until the matched expression is found. + If C{include} is set to true, the matched expression is also parsed (the skipped text + and matched expression are returned as a 2-element list). The C{ignore} + argument is used to define grammars (typically quoted strings and comments) that + might contain false matches. 
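The advance-one-character-until-the-target-matches loop that drives C{SkipTo.parseImpl} can be sketched with a plain substring target. Not part of the patch; C{skip_to} is a hypothetical name, and the C{failOn}/C{ignore} branches are omitted for brevity:

```python
# Advance loc until target matches, returning the skipped text, and
# optionally consuming the target itself (SkipTo's include=True case).
def skip_to(instring, loc, target, include=False):
    start = loc
    while loc <= len(instring) - len(target):
        if instring.startswith(target, loc):
            skipped = instring[start:loc]
            if include:
                return loc + len(target), skipped + target
            return loc, skipped
        loc += 1
    raise ValueError("no match found for %r" % target)

print(skip_to("header: body", 0, ": "))        # (6, 'header')
print(skip_to("header: body", 0, ": ", True))  # (8, 'header: ')
```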
+ """ + def __init__( self, other, include=False, ignore=None, failOn=None ): + super( SkipTo, self ).__init__( other ) + self.ignoreExpr = ignore + self.mayReturnEmpty = True + self.mayIndexError = False + self.includeMatch = include + self.asList = False + if failOn is not None and isinstance(failOn, basestring): + self.failOn = Literal(failOn) + else: + self.failOn = failOn + self.errmsg = "No match found for "+_ustr(self.expr) + + def parseImpl( self, instring, loc, doActions=True ): + startLoc = loc + instrlen = len(instring) + expr = self.expr + failParse = False + while loc <= instrlen: + try: + if self.failOn: + try: + self.failOn.tryParse(instring, loc) + except ParseBaseException: + pass + else: + failParse = True + raise ParseException(instring, loc, "Found expression " + str(self.failOn)) + failParse = False + if self.ignoreExpr is not None: + while 1: + try: + loc = self.ignoreExpr.tryParse(instring,loc) + # print("found ignoreExpr, advance to", loc) + except ParseBaseException: + break + expr._parse( instring, loc, doActions=False, callPreParse=False ) + skipText = instring[startLoc:loc] + if self.includeMatch: + loc,mat = expr._parse(instring,loc,doActions,callPreParse=False) + if mat: + skipRes = ParseResults( skipText ) + skipRes += mat + return loc, [ skipRes ] + else: + return loc, [ skipText ] + else: + return loc, [ skipText ] + except (ParseException,IndexError): + if failParse: + raise + else: + loc += 1 + raise ParseException(instring, loc, self.errmsg, self) + +class Forward(ParseElementEnhance): + """Forward declaration of an expression to be defined later - + used for recursive grammars, such as algebraic infix notation. + When the expression is known, it is assigned to the C{Forward} variable using the '<<' operator. + + Note: take care when assigning to C{Forward} not to overlook precedence of operators. 
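The precedence pitfall that the C{Forward} docstring warns about is ordinary Python operator precedence: '<<' binds tighter than '|'. A small stdlib demonstration with integers, separate from the patch, makes the grouping visible:

```python
# '<<' binds tighter than '|', so the '| 4' applies to the shift result,
# not to the right-hand operand of '<<' -- exactly the trap described for
# fwdExpr << a | b | c in the Forward docstring.
unparenthesized = 1 << 2 | 4    # parsed as (1 << 2) | 4
grouped = 1 << (2 | 4)          # what a grammar author usually intends

print(unparenthesized)  # 4
print(grouped)          # 64
```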
+ Specifically, '|' has a lower precedence than '<<', so that:: + fwdExpr << a | b | c + will actually be evaluated as:: + (fwdExpr << a) | b | c + thereby leaving b and c out as parseable alternatives. It is recommended that you + explicitly group the values inserted into the C{Forward}:: + fwdExpr << (a | b | c) + Converting to use the '<<=' operator instead will avoid this problem. + """ + def __init__( self, other=None ): + super(Forward,self).__init__( other, savelist=False ) + + def __lshift__( self, other ): + if isinstance( other, basestring ): + other = ParserElement.literalStringClass(other) + self.expr = other + self.mayReturnEmpty = other.mayReturnEmpty + self.strRepr = None + self.mayIndexError = self.expr.mayIndexError + self.mayReturnEmpty = self.expr.mayReturnEmpty + self.setWhitespaceChars( self.expr.whiteChars ) + self.skipWhitespace = self.expr.skipWhitespace + self.saveAsList = self.expr.saveAsList + self.ignoreExprs.extend(self.expr.ignoreExprs) + return self + + def __ilshift__(self, other): + return self << other + + def leaveWhitespace( self ): + self.skipWhitespace = False + return self + + def streamline( self ): + if not self.streamlined: + self.streamlined = True + if self.expr is not None: + self.expr.streamline() + return self + + def validate( self, validateTrace=[] ): + if self not in validateTrace: + tmp = validateTrace[:]+[self] + if self.expr is not None: + self.expr.validate(tmp) + self.checkRecursion([]) + + def __str__( self ): + if hasattr(self,"name"): + return self.name + + self._revertClass = self.__class__ + self.__class__ = _ForwardNoRecurse + try: + if self.expr is not None: + retString = _ustr(self.expr) + else: + retString = "None" + finally: + self.__class__ = self._revertClass + return self.__class__.__name__ + ": " + retString + + def copy(self): + if self.expr is not None: + return super(Forward,self).copy() + else: + ret = Forward() + ret <<= self + return ret + +class _ForwardNoRecurse(Forward): + def __str__( 
self ): + return "..." + +class TokenConverter(ParseElementEnhance): + """Abstract subclass of C{ParseExpression}, for converting parsed results.""" + def __init__( self, expr, savelist=False ): + super(TokenConverter,self).__init__( expr )#, savelist ) + self.saveAsList = False + +class Upcase(TokenConverter): + """Converter to upper case all matching tokens.""" + def __init__(self, *args): + super(Upcase,self).__init__(*args) + warnings.warn("Upcase class is deprecated, use upcaseTokens parse action instead", + DeprecationWarning,stacklevel=2) + + def postParse( self, instring, loc, tokenlist ): + return list(map( str.upper, tokenlist )) + + +class Combine(TokenConverter): + """Converter to concatenate all matching tokens to a single string. + By default, the matching patterns must also be contiguous in the input string; + this can be disabled by specifying C{'adjacent=False'} in the constructor. + """ + def __init__( self, expr, joinString="", adjacent=True ): + super(Combine,self).__init__( expr ) + # suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself + if adjacent: + self.leaveWhitespace() + self.adjacent = adjacent + self.skipWhitespace = True + self.joinString = joinString + self.callPreparse = True + + def ignore( self, other ): + if self.adjacent: + ParserElement.ignore(self, other) + else: + super( Combine, self).ignore( other ) + return self + + def postParse( self, instring, loc, tokenlist ): + retToks = tokenlist.copy() + del retToks[:] + retToks += ParseResults([ "".join(tokenlist._asStringList(self.joinString)) ], modal=self.modalResults) + + if self.resultsName and retToks.haskeys(): + return [ retToks ] + else: + return retToks + +class Group(TokenConverter): + """Converter to return the matched tokens as a list - useful for returning tokens of C{L{ZeroOrMore}} and C{L{OneOrMore}} expressions.""" + def __init__( self, expr ): + super(Group,self).__init__( expr ) + self.saveAsList = True + + def 
postParse( self, instring, loc, tokenlist ): + return [ tokenlist ] + +class Dict(TokenConverter): + """Converter to return a repetitive expression as a list, but also as a dictionary. + Each element can also be referenced using the first token in the expression as its key. + Useful for tabular report scraping when the first column can be used as a item key. + """ + def __init__( self, expr ): + super(Dict,self).__init__( expr ) + self.saveAsList = True + + def postParse( self, instring, loc, tokenlist ): + for i,tok in enumerate(tokenlist): + if len(tok) == 0: + continue + ikey = tok[0] + if isinstance(ikey,int): + ikey = _ustr(tok[0]).strip() + if len(tok)==1: + tokenlist[ikey] = _ParseResultsWithOffset("",i) + elif len(tok)==2 and not isinstance(tok[1],ParseResults): + tokenlist[ikey] = _ParseResultsWithOffset(tok[1],i) + else: + dictvalue = tok.copy() #ParseResults(i) + del dictvalue[0] + if len(dictvalue)!= 1 or (isinstance(dictvalue,ParseResults) and dictvalue.haskeys()): + tokenlist[ikey] = _ParseResultsWithOffset(dictvalue,i) + else: + tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0],i) + + if self.resultsName: + return [ tokenlist ] + else: + return tokenlist + + +class Suppress(TokenConverter): + """Converter for ignoring the results of a parsed expression.""" + def postParse( self, instring, loc, tokenlist ): + return [] + + def suppress( self ): + return self + + +class OnlyOnce(object): + """Wrapper for parse actions, to ensure they are only called once.""" + def __init__(self, methodCall): + self.callable = _trim_arity(methodCall) + self.called = False + def __call__(self,s,l,t): + if not self.called: + results = self.callable(s,l,t) + self.called = True + return results + raise ParseException(s,l,"") + def reset(self): + self.called = False + +def traceParseAction(f): + """Decorator for debugging parse actions.""" + f = _trim_arity(f) + def z(*paArgs): + thisFunc = f.func_name + s,l,t = paArgs[-3:] + if len(paArgs)>3: + thisFunc = 
paArgs[0].__class__.__name__ + '.' + thisFunc + sys.stderr.write( ">>entering %s(line: '%s', %d, %s)\n" % (thisFunc,line(l,s),l,t) ) + try: + ret = f(*paArgs) + except Exception as exc: + sys.stderr.write( "<", "|".join( [ _escapeRegexChars(sym) for sym in symbols] )) + try: + if len(symbols)==len("".join(symbols)): + return Regex( "[%s]" % "".join(_escapeRegexRangeChars(sym) for sym in symbols) ) + else: + return Regex( "|".join(re.escape(sym) for sym in symbols) ) + except: + warnings.warn("Exception creating Regex for oneOf, building MatchFirst", + SyntaxWarning, stacklevel=2) + + + # last resort, just use MatchFirst + return MatchFirst( [ parseElementClass(sym) for sym in symbols ] ) + +def dictOf( key, value ): + """Helper to easily and clearly define a dictionary by specifying the respective patterns + for the key and value. Takes care of defining the C{L{Dict}}, C{L{ZeroOrMore}}, and C{L{Group}} tokens + in the proper order. The key pattern can include delimiting markers or punctuation, + as long as they are suppressed, thereby leaving the significant key text. The value + pattern can include named results, so that the C{Dict} results can include named token + fields. + """ + return Dict( ZeroOrMore( Group ( key + value ) ) ) + +def originalTextFor(expr, asString=True): + """Helper to return the original, untokenized text for a given expression. Useful to + restore the parsed fields of an HTML start tag into the raw tag text itself, or to + revert separate tokens with intervening whitespace back to the original matching + input text. Simpler to use than the parse action C{L{keepOriginalText}}, and does not + require the inspect module to chase up the call stack. By default, returns a + string containing the original parsed text. 
+ + If the optional C{asString} argument is passed as C{False}, then the return value is a + C{L{ParseResults}} containing any results names that were originally matched, and a + single token containing the original matched text from the input string. So if + the expression passed to C{L{originalTextFor}} contains expressions with defined + results names, you must set C{asString} to C{False} if you want to preserve those + results name values.""" + locMarker = Empty().setParseAction(lambda s,loc,t: loc) + endlocMarker = locMarker.copy() + endlocMarker.callPreparse = False + matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end") + if asString: + extractText = lambda s,l,t: s[t._original_start:t._original_end] + else: + def extractText(s,l,t): + del t[:] + t.insert(0, s[t._original_start:t._original_end]) + del t["_original_start"] + del t["_original_end"] + matchExpr.setParseAction(extractText) + return matchExpr + +def ungroup(expr): + """Helper to undo pyparsing's default grouping of And expressions, even + if all but one are non-empty.""" + return TokenConverter(expr).setParseAction(lambda t:t[0]) + +def locatedExpr(expr): + """Helper to decorate a returned token with its starting and ending locations in the input string. 
+ This helper adds the following results names: + - locn_start = location where matched expression begins + - locn_end = location where matched expression ends + - value = the actual parsed results + + Be careful if the input text contains C{<TAB>} characters, you may want to call + C{L{ParserElement.parseWithTabs}} + """ + locator = Empty().setParseAction(lambda s,l,t: l) + return Group(locator("locn_start") + expr("value") + locator.copy().leaveWhitespace()("locn_end")) + + +# convenience constants for positional expressions +empty = Empty().setName("empty") +lineStart = LineStart().setName("lineStart") +lineEnd = LineEnd().setName("lineEnd") +stringStart = StringStart().setName("stringStart") +stringEnd = StringEnd().setName("stringEnd") + +_escapedPunc = Word( _bslash, r"\[]-*.$+^?()~ ", exact=2 ).setParseAction(lambda s,l,t:t[0][1]) +_escapedHexChar = Regex(r"\\0?[xX][0-9a-fA-F]+").setParseAction(lambda s,l,t:unichr(int(t[0].lstrip(r'\0x'),16))) +_escapedOctChar = Regex(r"\\0[0-7]+").setParseAction(lambda s,l,t:unichr(int(t[0][1:],8))) +_singleChar = _escapedPunc | _escapedHexChar | _escapedOctChar | Word(printables, excludeChars=r'\]', exact=1) +_charRange = Group(_singleChar + Suppress("-") + _singleChar) +_reBracketExpr = Literal("[") + Optional("^").setResultsName("negate") + Group( OneOrMore( _charRange | _singleChar ) ).setResultsName("body") + "]" + +def srange(s): + r"""Helper to easily define string ranges for use in Word construction. Borrows + syntax from regexp '[]' string range definitions:: + srange("[0-9]") -> "0123456789" + srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz" + srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_" + The input string must be enclosed in []'s, and the returned string is the expanded + character set joined into a single string. + The values enclosed in the []'s may be:: + a single character + an escaped character with a leading backslash (such as \- or \]) + an escaped hex character with a leading '\x' (\x21, which is a '!'
character) + (\0x## is also supported for backwards compatibility) + an escaped octal character with a leading '\0' (\041, which is a '!' character) + a range of any of the above, separated by a dash ('a-z', etc.) + any combination of the above ('aeiouy', 'a-zA-Z0-9_$', etc.) + """ + _expanded = lambda p: p if not isinstance(p,ParseResults) else ''.join(unichr(c) for c in range(ord(p[0]),ord(p[1])+1)) + try: + return "".join(_expanded(part) for part in _reBracketExpr.parseString(s).body) + except: + return "" + +def matchOnlyAtCol(n): + """Helper method for defining parse actions that require matching at a specific + column in the input text. + """ + def verifyCol(strg,locn,toks): + if col(locn,strg) != n: + raise ParseException(strg,locn,"matched token not at column %d" % n) + return verifyCol + +def replaceWith(replStr): + """Helper method for common parse actions that simply return a literal value. Especially + useful when used with C{L{transformString}()}. + """ + def _replFunc(*args): + return [replStr] + return _replFunc + +def removeQuotes(s,l,t): + """Helper parse action for removing quotation marks from parsed quoted strings. + To use, add this parse action to quoted string using:: + quotedString.setParseAction( removeQuotes ) + """ + return t[0][1:-1] + +def upcaseTokens(s,l,t): + """Helper parse action to convert tokens to upper case.""" + return [ tt.upper() for tt in map(_ustr,t) ] + +def downcaseTokens(s,l,t): + """Helper parse action to convert tokens to lower case.""" + return [ tt.lower() for tt in map(_ustr,t) ] + +def keepOriginalText(s,startLoc,t): + """DEPRECATED - use new helper method C{L{originalTextFor}}. 
+ Helper parse action to preserve original parsed text, + overriding any nested parse actions.""" + try: + endloc = getTokensEndLoc() + except ParseException: + raise ParseFatalException("incorrect usage of keepOriginalText - may only be called as a parse action") + del t[:] + t += ParseResults(s[startLoc:endloc]) + return t + +def getTokensEndLoc(): + """Method to be called from within a parse action to determine the end + location of the parsed tokens.""" + import inspect + fstack = inspect.stack() + try: + # search up the stack (through intervening argument normalizers) for correct calling routine + for f in fstack[2:]: + if f[3] == "_parseNoCache": + endloc = f[0].f_locals["loc"] + return endloc + else: + raise ParseFatalException("incorrect usage of getTokensEndLoc - may only be called from within a parse action") + finally: + del fstack + +def _makeTags(tagStr, xml): + """Internal helper to construct opening and closing tag expressions, given a tag name""" + if isinstance(tagStr,basestring): + resname = tagStr + tagStr = Keyword(tagStr, caseless=not xml) + else: + resname = tagStr.name + + tagAttrName = Word(alphas,alphanums+"_-:") + if (xml): + tagAttrValue = dblQuotedString.copy().setParseAction( removeQuotes ) + openTag = Suppress("<") + tagStr("tag") + \ + Dict(ZeroOrMore(Group( tagAttrName + Suppress("=") + tagAttrValue ))) + \ + Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">") + else: + printablesLessRAbrack = "".join(c for c in printables if c not in ">") + tagAttrValue = quotedString.copy().setParseAction( removeQuotes ) | Word(printablesLessRAbrack) + openTag = Suppress("<") + tagStr("tag") + \ + Dict(ZeroOrMore(Group( tagAttrName.setParseAction(downcaseTokens) + \ + Optional( Suppress("=") + tagAttrValue ) ))) + \ + Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">") + closeTag = Combine(_L("</") + tagStr + ">") + + openTag = 
openTag.setResultsName("start"+"".join(resname.replace(":"," ").title().split())).setName("<%s>" % tagStr) + closeTag = closeTag.setResultsName("end"+"".join(resname.replace(":"," ").title().split())).setName("</%s>" % tagStr) + openTag.tag = resname + closeTag.tag = resname + return openTag, closeTag + +def makeHTMLTags(tagStr): + """Helper to construct opening and closing tag expressions for HTML, given a tag name""" + return _makeTags( tagStr, False ) + +def makeXMLTags(tagStr): + """Helper to construct opening and closing tag expressions for XML, given a tag name""" + return _makeTags( tagStr, True ) + +def withAttribute(*args,**attrDict): + """Helper to create a validating parse action to be used with start tags created + with C{L{makeXMLTags}} or C{L{makeHTMLTags}}. Use C{withAttribute} to qualify a starting tag + with a required attribute value, to avoid false matches on common tags such as + C{<TD>} or C{<DIV>
}. + + Call C{withAttribute} with a series of attribute names and values. Specify the list + of filter attributes names and values as: + - keyword arguments, as in C{(align="right")}, or + - as an explicit dict with C{**} operator, when an attribute name is also a Python + reserved word, as in C{**{"class":"Customer", "align":"right"}} + - a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") ) + For attribute names with a namespace prefix, you must use the second form. Attribute + names are matched insensitive to upper/lower case. + + To verify that the attribute exists, but without specifying a value, pass + C{withAttribute.ANY_VALUE} as the value. + """ + if args: + attrs = args[:] + else: + attrs = attrDict.items() + attrs = [(k,v) for k,v in attrs] + def pa(s,l,tokens): + for attrName,attrValue in attrs: + if attrName not in tokens: + raise ParseException(s,l,"no matching attribute " + attrName) + if attrValue != withAttribute.ANY_VALUE and tokens[attrName] != attrValue: + raise ParseException(s,l,"attribute '%s' has value '%s', must be '%s'" % + (attrName, tokens[attrName], attrValue)) + return pa +withAttribute.ANY_VALUE = object() + +opAssoc = _Constants() +opAssoc.LEFT = object() +opAssoc.RIGHT = object() + +def infixNotation( baseExpr, opList, lpar=Suppress('('), rpar=Suppress(')') ): + """Helper method for constructing grammars of expressions made up of + operators working in a precedence hierarchy. Operators may be unary or + binary, left- or right-associative. Parse actions can also be attached + to operator expressions. 
+ + Parameters: + - baseExpr - expression representing the most basic element for the nested + - opList - list of tuples, one for each operator precedence level in the + expression grammar; each tuple is of the form + (opExpr, numTerms, rightLeftAssoc, parseAction), where: + - opExpr is the pyparsing expression for the operator; + may also be a string, which will be converted to a Literal; + if numTerms is 3, opExpr is a tuple of two expressions, for the + two operators separating the 3 terms + - numTerms is the number of terms for this operator (must + be 1, 2, or 3) + - rightLeftAssoc is the indicator whether the operator is + right or left associative, using the pyparsing-defined + constants C{opAssoc.RIGHT} and C{opAssoc.LEFT}. + - parseAction is the parse action to be associated with + expressions matching this operator expression (the + parse action tuple member may be omitted) + - lpar - expression for matching left-parentheses (default=Suppress('(')) + - rpar - expression for matching right-parentheses (default=Suppress(')')) + """ + ret = Forward() + lastExpr = baseExpr | ( lpar + ret + rpar ) + for i,operDef in enumerate(opList): + opExpr,arity,rightLeftAssoc,pa = (operDef + (None,))[:4] + if arity == 3: + if opExpr is None or len(opExpr) != 2: + raise ValueError("if numterms=3, opExpr must be a tuple or list of two expressions") + opExpr1, opExpr2 = opExpr + thisExpr = Forward()#.setName("expr%d" % i) + if rightLeftAssoc == opAssoc.LEFT: + if arity == 1: + matchExpr = FollowedBy(lastExpr + opExpr) + Group( lastExpr + OneOrMore( opExpr ) ) + elif arity == 2: + if opExpr is not None: + matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( lastExpr + OneOrMore( opExpr + lastExpr ) ) + else: + matchExpr = FollowedBy(lastExpr+lastExpr) + Group( lastExpr + OneOrMore(lastExpr) ) + elif arity == 3: + matchExpr = FollowedBy(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) + \ + Group( lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr ) + else: + raise 
ValueError("operator must be unary (1), binary (2), or ternary (3)") + elif rightLeftAssoc == opAssoc.RIGHT: + if arity == 1: + # try to avoid LR with this extra test + if not isinstance(opExpr, Optional): + opExpr = Optional(opExpr) + matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( opExpr + thisExpr ) + elif arity == 2: + if opExpr is not None: + matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( lastExpr + OneOrMore( opExpr + thisExpr ) ) + else: + matchExpr = FollowedBy(lastExpr + thisExpr) + Group( lastExpr + OneOrMore( thisExpr ) ) + elif arity == 3: + matchExpr = FollowedBy(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) + \ + Group( lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr ) + else: + raise ValueError("operator must be unary (1), binary (2), or ternary (3)") + else: + raise ValueError("operator must indicate right or left associativity") + if pa: + matchExpr.setParseAction( pa ) + thisExpr <<= ( matchExpr | lastExpr ) + lastExpr = thisExpr + ret <<= lastExpr + return ret +operatorPrecedence = infixNotation + +dblQuotedString = Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*"').setName("string enclosed in double quotes") +sglQuotedString = Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*'").setName("string enclosed in single quotes") +quotedString = Regex(r'''(?:"(?:[^"\n\r\\]|(?:"")|(?:\\x[0-9a-fA-F]+)|(?:\\.))*")|(?:'(?:[^'\n\r\\]|(?:'')|(?:\\x[0-9a-fA-F]+)|(?:\\.))*')''').setName("quotedString using single or double quotes") +unicodeString = Combine(_L('u') + quotedString.copy()) + +def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.copy()): + """Helper method for defining nested lists enclosed in opening and closing + delimiters ("(" and ")" are the default). 
+ + Parameters: + - opener - opening character for a nested list (default="("); can also be a pyparsing expression + - closer - closing character for a nested list (default=")"); can also be a pyparsing expression + - content - expression for items within the nested lists (default=None) + - ignoreExpr - expression for ignoring opening and closing delimiters (default=quotedString) + + If an expression is not provided for the content argument, the nested + expression will capture all whitespace-delimited content between delimiters + as a list of separate values. + + Use the C{ignoreExpr} argument to define expressions that may contain + opening or closing characters that should not be treated as opening + or closing characters for nesting, such as quotedString or a comment + expression. Specify multiple expressions using an C{L{Or}} or C{L{MatchFirst}}. + The default is L{quotedString}, but if no expressions are to be ignored, + then pass C{None} for this argument. + """ + if opener == closer: + raise ValueError("opening and closing strings cannot be the same") + if content is None: + if isinstance(opener,basestring) and isinstance(closer,basestring): + if len(opener) == 1 and len(closer)==1: + if ignoreExpr is not None: + content = (Combine(OneOrMore(~ignoreExpr + + CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS,exact=1)) + ).setParseAction(lambda t:t[0].strip())) + else: + content = (empty.copy()+CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS + ).setParseAction(lambda t:t[0].strip())) + else: + if ignoreExpr is not None: + content = (Combine(OneOrMore(~ignoreExpr + + ~Literal(opener) + ~Literal(closer) + + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) + ).setParseAction(lambda t:t[0].strip())) + else: + content = (Combine(OneOrMore(~Literal(opener) + ~Literal(closer) + + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1)) + ).setParseAction(lambda t:t[0].strip())) + else: + raise ValueError("opening and closing arguments must be 
strings if no content expression is given") + ret = Forward() + if ignoreExpr is not None: + ret <<= Group( Suppress(opener) + ZeroOrMore( ignoreExpr | ret | content ) + Suppress(closer) ) + else: + ret <<= Group( Suppress(opener) + ZeroOrMore( ret | content ) + Suppress(closer) ) + return ret + +def indentedBlock(blockStatementExpr, indentStack, indent=True): + """Helper method for defining space-delimited indentation blocks, such as + those used to define block statements in Python source code. + + Parameters: + - blockStatementExpr - expression defining syntax of statement that + is repeated within the indented block + - indentStack - list created by caller to manage indentation stack + (multiple statementWithIndentedBlock expressions within a single grammar + should share a common indentStack) + - indent - boolean indicating whether block must be indented beyond the + the current level; set to False for block of left-most statements + (default=True) + + A valid block must contain at least one C{blockStatement}. 
+ """ + def checkPeerIndent(s,l,t): + if l >= len(s): return + curCol = col(l,s) + if curCol != indentStack[-1]: + if curCol > indentStack[-1]: + raise ParseFatalException(s,l,"illegal nesting") + raise ParseException(s,l,"not a peer entry") + + def checkSubIndent(s,l,t): + curCol = col(l,s) + if curCol > indentStack[-1]: + indentStack.append( curCol ) + else: + raise ParseException(s,l,"not a subentry") + + def checkUnindent(s,l,t): + if l >= len(s): return + curCol = col(l,s) + if not(indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]): + raise ParseException(s,l,"not an unindent") + indentStack.pop() + + NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress()) + INDENT = Empty() + Empty().setParseAction(checkSubIndent) + PEER = Empty().setParseAction(checkPeerIndent) + UNDENT = Empty().setParseAction(checkUnindent) + if indent: + smExpr = Group( Optional(NL) + + #~ FollowedBy(blockStatementExpr) + + INDENT + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) + UNDENT) + else: + smExpr = Group( Optional(NL) + + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) ) + blockStatementExpr.ignore(_bslash + LineEnd()) + return smExpr + +alphas8bit = srange(r"[\0xc0-\0xd6\0xd8-\0xf6\0xf8-\0xff]") +punc8bit = srange(r"[\0xa1-\0xbf\0xd7\0xf7]") + +anyOpenTag,anyCloseTag = makeHTMLTags(Word(alphas,alphanums+"_:")) +commonHTMLEntity = Combine(_L("&") + oneOf("gt lt amp nbsp quot").setResultsName("entity") +";").streamline() +_htmlEntityMap = dict(zip("gt lt amp nbsp quot".split(),'><& "')) +replaceHTMLEntity = lambda t : t.entity in _htmlEntityMap and _htmlEntityMap[t.entity] or None + +# it's easy to get these comment structures wrong - they're very common, so may as well make them available +cStyleComment = Regex(r"/\*(?:[^*]*\*+)+?/").setName("C style comment") + +htmlComment = Regex(r"<!--[\s\S]*?-->") +restOfLine = Regex(r".*").leaveWhitespace() +dblSlashComment = Regex(r"\/\/(\\\n|.)*").setName("// comment") +cppStyleComment = 
Regex(r"/(?:\*(?:[^*]*\*+)+?/|/[^\n]*(?:\n[^\n]*)*?(?:(?<!\\)|\Z))").setName("C++ style comment") + +javaStyleComment = cppStyleComment +pythonStyleComment = Regex(r"#.*").setName("Python style comment") +_commasepitem = Combine(OneOrMore(Word(printables, excludeChars=',') + Optional( Word(" \t") + ~Literal(",") + ~LineEnd() ) ) ).streamline().setName("commaItem") +commaSeparatedList = delimitedList( Optional( quotedString.copy() | _commasepitem, default="") ).setName("commaSeparatedList") + + +if __name__ == "__main__": + + def test( teststring ): + try: + tokens = simpleSQL.parseString( teststring ) + tokenlist = tokens.asList() + print (teststring + "->" + str(tokenlist)) + print ("tokens = " + str(tokens)) + print ("tokens.columns = " + str(tokens.columns)) + print ("tokens.tables = " + str(tokens.tables)) + print (tokens.asXML("SQL",True)) + except ParseBaseException as err: + print (teststring + "->") + print (err.line) + print (" "*(err.column-1) + "^") + print (err) + print() + + selectToken = CaselessLiteral( "select" ) + fromToken = CaselessLiteral( "from" ) + + ident = Word( alphas, alphanums + "_$" ) + columnName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens ) + columnNameList = Group( delimitedList( columnName ) )#.setName("columns") + tableName = delimitedList( ident, ".", combine=True ).setParseAction( upcaseTokens ) + tableNameList = Group( delimitedList( tableName ) )#.setName("tables") + simpleSQL = ( selectToken + \ + ( '*' | columnNameList ).setResultsName( "columns" ) + \ + fromToken + \ + tableNameList.setResultsName( "tables" ) ) + + test( "SELECT * from XYZZY, ABC" ) + test( "select * from SYS.XYZZY" ) + test( "Select A from Sys.dual" ) + test( "Select AA,BB,CC from Sys.dual" ) + test( "Select A, B, C from Sys.dual" ) + test( "Select A, B, C from Sys.dual" ) + test( "Xelect A, B, C from Sys.dual" ) + test( "Select A, B, C frox Sys.dual" ) + test( "Select" ) + test( "Select ^^^ frox Sys.dual" ) + test( "Select A, B, C from Sys.dual, Table2 " ) diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/README remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/README --- remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/README 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/pyparsing/README 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,72 @@ +==================================== +PyParsing -- A Python Parsing Module +==================================== + +Introduction +============ + +The pyparsing module is an alternative approach to creating
and executing +simple grammars, vs. the traditional lex/yacc approach, or the use of +regular expressions. The pyparsing module provides a library of classes +that client code uses to construct the grammar directly in Python code. + +Here is a program to parse "Hello, World!" (or any greeting of the form +"<salutation>, <addressee>!"): + + from pyparsing import Word, alphas + greet = Word( alphas ) + "," + Word( alphas ) + "!" + hello = "Hello, World!" + print hello, "->", greet.parseString( hello ) + +The program outputs the following: + + Hello, World! -> ['Hello', ',', 'World', '!'] + +The Python representation of the grammar is quite readable, owing to the +self-explanatory class names, and the use of '+', '|' and '^' operator +definitions. + +The parsed results returned from parseString() can be accessed as a +nested list, a dictionary, or an object with named attributes. + +The pyparsing module handles some of the problems that are typically +vexing when writing text parsers: +- extra or missing whitespace (the above program will also handle + "Hello,World!", "Hello , World !", etc.) +- quoted strings +- embedded comments + +The .zip file includes examples of a simple SQL parser, simple CORBA IDL +parser, a config file parser, a chemical formula parser, and a four- +function algebraic notation parser. It also includes a simple how-to +document, and a UML class diagram of the library's classes. + + + +Installation +============ + +Do the usual: + + python setup.py install + +(pyparsing requires Python 2.3.2 or later.) + + +Documentation +============= + +See: + + HowToUsePyparsing.html + + +License +======= + + MIT License. See header of pyparsing.py + +History +======= + + See CHANGES file.
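[Editor's illustration, not part of the packaged diff] The behavior described in the README above — tokenizing a greeting while silently skipping whitespace between tokens — can be sketched without pyparsing installed. The following stdlib-only snippet uses the `re` module as a stand-in for the actual `Word(alphas) + "," + Word(alphas) + "!"` grammar; the names `GREETING` and `parse_greeting` are illustrative, not part of pyparsing:

```python
import re

# Stand-in for the README's grammar: Word(alphas) + "," + Word(alphas) + "!".
# pyparsing skips whitespace between tokens implicitly, approximated here
# with optional \s* runs around each token.
GREETING = re.compile(r"\s*([A-Za-z]+)\s*(,)\s*([A-Za-z]+)\s*(!)")

def parse_greeting(text):
    """Return the four tokens of a '<salutation>, <addressee>!' greeting."""
    m = GREETING.match(text)
    if m is None:
        raise ValueError("no match: %r" % text)
    return list(m.groups())

print(parse_greeting("Hello, World!"))   # ['Hello', ',', 'World', '!']
# Whitespace variants yield the same tokens, as the README notes:
print(parse_greeting("Hello,World!"))
print(parse_greeting("Hello , World !"))
```

This mirrors only the README example's token list; real pyparsing grammars additionally provide named results, error locations, and composability that a single regex does not.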
diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/tablestream/tablestream.py remnux-oletools-0.51a/remnux-oletools/thirdparty/tablestream/tablestream.py --- remnux-oletools-0.51a/remnux-oletools/thirdparty/tablestream/tablestream.py 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/tablestream/tablestream.py 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,396 @@ +#!/usr/bin/env python +""" +tablestream + +tablestream can format table data for pretty printing as text, +to be displayed on the console or written to any file-like object. +The table data can be provided as rows, each row is an iterable of +cells. The text in each cell is wrapped to fit into a maximum width +set for each column. +Contrary to many table pretty printing libraries, TableStream writes +each row to the output as soon as it is provided, and the whole table +does not need to be built in memory before printing. +It is therefore suitable for large tables, or tables that take time to +be processed row by row. + +Author: Philippe Lagadec - http://www.decalage.info +License: BSD, see source code or documentation +""" + +#=== LICENSE ================================================================== + +# tablestream is copyright (c) 2015-2016 Philippe Lagadec (http://www.decalage.info) +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import print_function + +#------------------------------------------------------------------------------ +# CHANGELOG: +# 2015-11-01 v0.01 PL: - first version +# 2016-01-01 v0.02 PL: - added styles, color support +# 2016-04-19 v0.03 PL: - enable colorclass on Windows, fixed issue #39 +# 2016-05-25 v0.04 PL: - updated for colorclass 2.2.0 (now a package) +# 2016-07-29 v0.05 PL: - fixed oletools issue #57, bug when importing colorclass +# 2016-07-31 v0.06 PL: - handle newline characters properly in each cell +# 2016-08-28 v0.07 PL: - support for both Python 2.6+ and 3.x +# - all cells are converted to unicode + +__version__ = '0.07' + +#------------------------------------------------------------------------------ +# TODO: +# - several styles +# - colorized rows or cells +# - automatic width for the last column, based on max total width +# - automatic width for selected columns, based on N first lines +# - determine the console width + +# === IMPORTS ================================================================= + +import textwrap +import sys, os + +# add the thirdparty subfolder to sys.path (absolute+normalized path): +_thismodule_dir = 
os.path.normpath(os.path.abspath(os.path.dirname(__file__))) +# print('_thismodule_dir = %r' % _thismodule_dir) +# assumption: this module is in a subfolder of thirdparty: +_thirdparty_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) +# print('_thirdparty_dir = %r' % _thirdparty_dir) +if not _thirdparty_dir in sys.path: + sys.path.insert(0, _thirdparty_dir) + +import colorclass + +# On Windows, colorclass needs to be enabled: +if os.name == 'nt': + colorclass.Windows.enable(auto_colors=True) + + +# === PYTHON 2+3 SUPPORT ====================================================== + +if sys.version_info[0] >= 3: + # Python 3 specific adaptations + # py3 range = py2 xrange + xrange = range + ustr = str + # byte strings for to_ustr (with py3, bytearray supports encoding): + byte_strings = (bytes, bytearray) +else: + # Python 2 specific adaptations + ustr = unicode + # byte strings for to_ustr (with py2, bytearray does not support encoding): + byte_strings = bytes + + +# === FUNCTIONS ============================================================== + +def to_ustr(obj, encoding='utf8', errors='replace'): + """ + convert an object to unicode, using the appropriate method + :param obj: any object, str, bytes or unicode + :return: unicode string (ustr) + """ + # if the object is already unicode, return it unchanged: + if isinstance(obj, ustr): + return obj + # if it is a bytes string, decode it using the provided encoding + elif isinstance(obj, byte_strings): + return ustr(obj, encoding=encoding, errors=errors) + # else just convert it to unicode: + # (an exception is raised if we specify encoding in this case) + else: + return ustr(obj) + + + +# === CLASSES ================================================================= + + +class TableStyle(object): + """ + Style for a TableStream. + This base class can be derived to create new styles. 
+ Default style: + +------+---+ + |Header| + + +------+---+ + | | | + +------+---+ + """ + # Header rows: + header_top = True + header_top_left = u'+' + header_top_horiz = u'-' + header_top_middle = u'+' + header_top_right = u'+' + + header_vertical_left = u'|' + header_vertical_middle = u'|' + header_vertical_right = u'|' + + # Separator line between header and normal rows: + header_sep = True + header_sep_left = u'+' + header_sep_horiz = u'-' + header_sep_middle = u'+' + header_sep_right = u'+' + + # Top row if there is no header: + noheader_top = True + noheader_top_left = u'+' + noheader_top_horiz = u'-' + noheader_top_middle = u'+' + noheader_top_right = u'+' + + # Normal rows + vertical_left = u'|' + vertical_middle = u'|' + vertical_right = u'|' + + # Separator line between rows: + sep = False + sep_left = u'+' + sep_horiz = u'-' + sep_middle = u'+' + sep_right = u'+' + + # Bottom line + bottom = True + bottom_left = u'+' + bottom_horiz = u'-' + bottom_middle = u'+' + bottom_right = u'+' + + +class TableStyleSlim(object): + """ + Style for a TableStream. 
+ Example: + ------+--- + Header| + ------+--- + | + ------+--- + """ + # Header rows: + header_top = True + header_top_left = u'' + header_top_horiz = u'-' + header_top_middle = u'+' + header_top_right = u'' + + header_vertical_left = u'' + header_vertical_middle = u'|' + header_vertical_right = u'' + + # Separator line between header and normal rows: + header_sep = True + header_sep_left = u'' + header_sep_horiz = u'-' + header_sep_middle = u'+' + header_sep_right = u'' + + # Top row if there is no header: + noheader_top = True + noheader_top_left = u'' + noheader_top_horiz = u'-' + noheader_top_middle = u'+' + noheader_top_right = u'' + + # Normal rows + vertical_left = u'' + vertical_middle = u'|' + vertical_right = u'' + + # Separator line between rows: + sep = False + sep_left = u'' + sep_horiz = u'-' + sep_middle = u'+' + sep_right = u'' + + # Bottom line + bottom = True + bottom_left = u'' + bottom_horiz = u'-' + bottom_middle = u'+' + bottom_right = u'' + + + +class TableStream(object): + """ + a TableStream object can format table data for pretty printing as text, + to be displayed on the console or written to any file-like object. + The table data can be provided as rows, each row is an iterable of + cells. The text in each cell is wrapped to fit into a maximum width + set for each column. + Contrary to many table pretty printing libraries, TableStream writes + each row to the output as soon as it is provided, and the whole table + does not need to be built in memory before printing. + It is therefore suitable for large tables, or tables that take time to + be processed row by row. 
+ """ + + def __init__(self, column_width, header_row=None, style=TableStyle, + outfile=sys.stdout, encoding_in='utf8', encoding_out='utf8'): + ''' + Constructor for class TableStream + :param column_width: tuple or list containing the width of each column + :param header_row: tuple or list containing the header row text + :param style: style for the table, a TableStyle object + :param outfile: output file (sys.stdout by default to print on the console) + :param encoding_in: encoding used when the input text is bytes (UTF-8 by default) + :param encoding_out: encoding used for the output (UTF-8 by default) + ''' + self.column_width = column_width + self.num_columns = len(column_width) + self.header_row = header_row + self.encoding_in = encoding_in + self.encoding_out = encoding_out + assert (header_row is None) or len(header_row) == self.num_columns + self.style = style + self.outfile = outfile + if header_row is not None: + self.write_header() + elif self.style.noheader_top: + self.write_noheader_top() + + + def write(self, s): + """ + shortcut for self.outfile.write() + """ + self.outfile.write(s) + + def write_row(self, row, last=False, colors=None): + assert len(row) == self.num_columns + columns = [] + max_lines = 0 + for i in xrange(self.num_columns): + cell = row[i] + # Convert to string: + cell = to_ustr(cell, encoding=self.encoding_in) + # Wrap cell text according to the column width + # TODO: use a TextWrapper object for each column instead + # split the string if it contains newline characters, otherwise + # textwrap replaces them with spaces: + column = [] + for line in cell.splitlines(): + column.extend(textwrap.wrap(line, width=self.column_width[i])) + # apply colors to each line of the cell if needed: + if colors is not None and self.outfile.isatty(): + color = colors[i] + if color: + for j in xrange(len(column)): + # print '%r: %s' % (column[j], type(column[j])) + column[j] = colorclass.Color(u'{auto%s}%s{/%s}' % (color, column[j], color)) + 
columns.append(column) + # determine which column has the highest number of lines + max_lines = max(len(columns[i]), max_lines) + # transpose: write output line by line + for j in xrange(max_lines): + self.write(self.style.vertical_left) + for i in xrange(self.num_columns): + column = columns[i] + if j file_name is not a glob + --> file?name is a glob + --> file* is a glob + --> file[-._]name is a glob + --> file[?]name is not a glob (matches literal "file?name") + --> file[*]name is not a glob (matches literal "file*name") + --> file[-]name is not a glob (matches literal "file-name") + --> file-name is not a glob + + Also, obviously incorrect globs are treated as non-globs + --> file[name is not a glob (matches literal "file[name") + --> file]-[name is treated as a glob + (it is not a valid glob but detecting errors like this requires + sophisticated regular expression matching) + + Python's glob also works with globs in directory-part of path + --> dir-part of path is analyzed just like filename-part + --> thirdparty/*/xglob.py is a (valid) glob + + TODO: create a correct regexp to test for validity of ranges + """ + + # remove escaped special chars + cleaned = filespec.replace('[*]', '').replace('[?]', '') \ + .replace('[[]', '').replace('[]]', '').replace('[-]', '') + + # check if special chars remain + return '*' in cleaned or '?' in cleaned or \ + ('[' in cleaned and ']' in cleaned) diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/LICENSE.txt remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/LICENSE.txt --- remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/LICENSE.txt 1970-01-01 00:00:00.000000000 +0000 +++ remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/LICENSE.txt 2016-11-04 21:28:21.000000000 +0000 @@ -0,0 +1,3 @@ +xxxswf.py is published by Alexander Hanel on +http://hooked-on-mnemonics.blogspot.nl/2011/12/xxxswfpy.html +without explicit license. 
\ No newline at end of file
diff -Nru remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/xxxswf.py remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/xxxswf.py
--- remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/xxxswf.py 1970-01-01 00:00:00.000000000 +0000
+++ remnux-oletools-0.51a/remnux-oletools/thirdparty/xxxswf/xxxswf.py 2016-11-04 21:28:21.000000000 +0000
@@ -0,0 +1,373 @@
+# xxxswf.py was created by alexander dot hanel at gmail dot com
+# version 0.1
+# Date - 12-07-2011
+# To do list
+# - Tag Parser
+# - ActionScript Decompiler
+
+# 2016-11-01 PL: - A few changes for Python 2+3 compatibility
+
+import fnmatch
+import hashlib
+import imp
+import math
+import os
+import re
+import struct
+import sys
+import time
+from io import BytesIO
+from optparse import OptionParser
+import zlib
+
+def checkMD5(md5):
+# checks if MD5 has been seen in MD5 Dictionary
+# MD5Dict contains the MD5 and the CVE
+# For { 'MD5':'CVE', 'MD5-1':'CVE-1', 'MD5-2':'CVE-2'}
+    MD5Dict = {'c46299a5015c6d31ad5766cb49e4ab4b':'CVE-XXXX-XXXX'}
+    if MD5Dict.get(md5):
+        print('\t[BAD] MD5 Match on', MD5Dict.get(md5))
+    return
+
+def bad(f):
+    for idx, x in enumerate(findSWF(f)):
+        tmp = verifySWF(f, x)
+        if tmp != None:
+            yaraScan(tmp)
+            checkMD5(hashBuff(tmp))
+    return
+
+def yaraScan(d):
+# d = buffer of the read file
+# Scans SWF using Yara
+    # test if yara module is installed
+    # if not Yara can be downloaded from http://code.google.com/p/yara-project/
+    try:
+        imp.find_module('yara')
+        import yara
+    except ImportError:
+        print('\t[ERROR] Yara module not installed - aborting scan')
+        return
+    # test for yara compile errors
+    try:
+        r = yara.compile(r'rules.yar')
+    except:
+        print('\t[ERROR] Yara compile error - aborting scan')
+        return
+    # get matches
+    m = r.match(data=d)
+    # print matches
+    for X in m:
+        print('\t[BAD] Yara Signature Hit: %s' % X)
+    return
+
+def findSWF(d):
+# d = buffer of the read file
+# Search for SWF Header Sigs in files
+    return [tmp.start()
+            for tmp in re.finditer(b'CWS|FWS', d.read())]
+
+def hashBuff(d):
+# d = buffer of the read file
+# This function hashes the buffer
+# source: http://stackoverflow.com/q/5853830
+    if type(d) is str:
+        d = BytesIO(d)
+    md5 = hashlib.md5()
+    while True:
+        data = d.read(128)
+        if not data:
+            break
+        md5.update(data)
+    return md5.hexdigest()
+
+def verifySWF(f,addr):
+    # Start of SWF
+    f.seek(addr)
+    # Read Header
+    header = f.read(3)
+    # Read Version
+    ver = struct.unpack('<b', f.read(1))[0]
+    # Read SWF Size
+    size = struct.unpack('<i', f.read(4))[0]
+    # Error check for version above 20
+    if ver > 20:
+        print(' - [ERROR] Invalid SWF Version')
+        return None
+
+    if b'CWS' in header:
+        try:
+            f.read(3)
+            tmp = b'FWS' + f.read(5) + zlib.decompress(f.read())
+            print(' - CWS Header')
+            return tmp
+
+        except:
+            print('- [ERROR]: Zlib decompression error. Invalid CWS SWF')
+            return None
+
+    elif b'FWS' in header:
+        try:
+            tmp = f.read(size)
+            print(' - FWS Header')
+            return tmp
+
+        except:
+            print(' - [ERROR] Invalid SWF Size')
+            return None
+
+    else:
+        print(' - [Error] Logic Error Blame Programmer')
+        return None
+
+def headerInfo(f):
+# f is the already opened file handle
+# Yes, the format is a rip-off of SWFDump. Can you blame me? Their tool is awesome.
+    # SWFDump FORMAT
+    # [HEADER] File version: 8
+    # [HEADER] File is zlib compressed. Ratio: 52%
+    # [HEADER] File size: 37536
+    # [HEADER] Frame rate: 18.000000
+    # [HEADER] Frame count: 323
+    # [HEADER] Movie width: 217.00
+    # [HEADER] Movie height: 85.00
+    if type(f) is str:
+        f = BytesIO(f)
+    sig = f.read(3)
+    print('\t[HEADER] File header: %s' % sig)
+    if b'C' in sig:
+        print('\t[HEADER] File is zlib compressed.')
+    version = struct.unpack('<b', f.read(1))[0]
+    print('\t[HEADER] File version: %d' % version)
+    size = struct.unpack('<i', f.read(4))[0]
+    print('\t[HEADER] File size: %d' % size)
+    # if compressed, decompress the rest of the file before parsing the rect
+    if b'C' in sig:
+        f = BytesIO(zlib.decompress(f.read()))
+    ta = f.tell()
+    nbit = struct.unpack('<b', f.read(1))[0] >> 3
+    print('\t[HEADER] Rect Nbit: %d' % nbit)
+    # Currently the nbit is static at 15. This could be modified in the
+    # future. If larger than 9 this will break the struct unpack. Will have
+    # to revisit; there must be a more effective way to deal with bits. Tried to keep
+    # the algo but damn this is ugly...
+    f.seek(ta)
+    rect = struct.unpack('>Q', f.read(int(math.ceil((nbit*4)/8.0))))[0]
+    tmp = struct.unpack('<b', f.read(1))[0]
+    tmp = bin(tmp >> 7)[2:].zfill(1)
+    # bin requires Python 2.6 or higher
+    # skips string '0b' and the nbit
+    rect = bin(rect)[7:]
+    xmin = int(rect[0:nbit-1], 2)
+    print('\t[HEADER] Rect Xmin: %d' % xmin)
+    xmax = int(rect[nbit:(nbit*2)-1], 2)
+    print('\t[HEADER] Rect Xmax: %d' % xmax)
+    ymin = int(rect[nbit*2:(nbit*3)-1], 2)
+    print('\t[HEADER] Rect Ymin: %d' % ymin)
+    # one bit needs to be added, my math might be off here
+    ymax = int(rect[nbit*3:(nbit*4)-1] + str(tmp), 2)
+    print('\t[HEADER] Rect Ymax: %d' % ymax)
+    framerate = struct.unpack('
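The SWF header layout that `verifySWF` and `headerInfo` above parse can be illustrated with a hand-built compressed SWF: the signature `b'CWS'`, one version byte, a little-endian 32-bit uncompressed size (including the 8-byte header), then a zlib stream. This is an illustrative sketch; the byte values are made up:

```python
import struct
import zlib

body = b'\x00' * 64   # fake uncompressed SWF payload after the 8-byte header
blob = (b'CWS' + bytes([13])
        + struct.pack('<i', 8 + len(body))   # uncompressed size, little-endian
        + zlib.compress(body))

assert blob[:3] == b'CWS'                    # compressed-SWF signature
version = struct.unpack('<b', blob[3:4])[0]  # same '<b' format as the code above
size = struct.unpack('<i', blob[4:8])[0]
assert zlib.decompress(blob[8:]) == body     # the check verifySWF relies on
print('version %d, size %d' % (version, size))  # -> version 13, size 72
```

An uncompressed file is identical except that the signature is `b'FWS'` and the body follows the header unmodified, which is why `verifySWF` can rebuild an `FWS` buffer from a decompressed `CWS` one.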