diff -Nru tcpflow-1.4.4+repack1/bootstrap.sh tcpflow-1.4.5+repack1/bootstrap.sh --- tcpflow-1.4.4+repack1/bootstrap.sh 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/bootstrap.sh 2015-08-26 03:34:50.000000000 +0000 @@ -1,7 +1,17 @@ -#!/bin/sh +#!/bin/bash # Hopefully you checked out with: # $ git clone --recursive https://github.com/simsong/tcpflow.git +# Make sure we have automake installed +function usage() { + echo tcpflow bootstrap: + echo be sure that these packages are installed: + echo automake autoconf gcc gcc-c++ boost-devel openssl-devel libpcap-devel + exit 1 +} + +automake --help 1>/dev/null 2>&1 || usage + for sub in be13_api dfxml http-parser do if [ ! -r src/$sub/.git ] ; diff -Nru tcpflow-1.4.4+repack1/ChangeLog tcpflow-1.4.5+repack1/ChangeLog --- tcpflow-1.4.4+repack1/ChangeLog 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/ChangeLog 2015-08-26 03:34:50.000000000 +0000 @@ -1,3 +1,7 @@ +2014-02-24 Man Page + + * src/tcpflow.cpp (main): alternating color output option changed from -J to -g + 2013-12-06 Basic * configure.ac: upped version number to 1.4.3 diff -Nru tcpflow-1.4.4+repack1/configure.ac tcpflow-1.4.5+repack1/configure.ac --- tcpflow-1.4.4+repack1/configure.ac 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/configure.ac 2015-08-26 03:35:28.000000000 +0000 @@ -7,7 +7,7 @@ # and http://www.openismus.com/documents/linux/automake/automake.shtml AC_PREREQ(2.57) -AC_INIT(TCPFLOW, 1.4.4, bugs@afflib.org) +AC_INIT(TCPFLOW, 1.4.5, simsong@github.com) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_FILES([Makefile src/Makefile tests/Makefile doc/Makefile]) @@ -28,6 +28,10 @@ AM_PROG_CC_C_O dnl allow per-product flags AC_PROG_INSTALL +m4_include([m4/slg_searchdirs.m4]) +m4_include([m4/slg_gcc_all_warnings.m4]) + + # use C++11 mode if available; HAVE_CXX11 is defined in config.h if so. Don't # use the GNU C++11 extensions for portability's sake (noext). 
AC_LANG_PUSH(C++) @@ -104,7 +108,6 @@ # causes configure to crash on gcc-4.2.1: -Wsign-compare-Winline # causes warnings with unistd.h: -Wnested-externs # Just causes too much annoyance: -Wmissing-format-attribute - # Check GCC WARNINGS_TO_TEST="-MD -D_FORTIFY_SOURCE=2 -Wpointer-arith -Wmissing-declarations -Wmissing-prototypes \ -Wshadow -Wwrite-strings -Wcast-align -Waggregate-return \ @@ -183,19 +186,9 @@ AC_LANG_POP() ################################################################ -## Includes - -m4_include([src/dfxml/src/dfxml_configure.m4]) -m4_include([src/be13_api/be13_configure.m4]) - -################################################################ ## -# sqlite3 is fun -AC_CHECK_HEADERS([sqlite3.h]) -AC_CHECK_LIB([sqlite3],[sqlite3_open]) - # # ZLIB is required for decompressing # Note you cannot put comments in the AC_MSG_ERROR for some reason @@ -218,6 +211,7 @@ ################################################################ ## OpenSSL Support (required for AFFLIB and hash_t ) AC_CHECK_HEADERS([openssl/aes.h openssl/bio.h openssl/evp.h openssl/hmac.h openssl/md5.h openssl/pem.h openssl/rand.h openssl/rsa.h openssl/sha.h openssl/pem.h openssl/x509.h]) +AC_CHECK_LIB([dl],[dlopen]) dnl apparently OpenSSL now needs -ldl on some Linux AC_CHECK_LIB([crypto],[EVP_get_digestbyname]) # if crypto is available, get it AC_CHECK_LIB([md],[MD5]) # if libmd is available, get it AC_CHECK_LIB([ssl],[SSL_library_init],, @@ -225,6 +219,12 @@ AC_CHECK_FUNCS([MD5_Init EVP_get_digestbyname]) ################################################################ +## Includes + +m4_include([src/dfxml/src/dfxml_configure.m4]) +m4_include([src/be13_api/be13_configure.m4]) + +################################################################ # PTHREAD support # With special nods to compiling under mingw @@ -235,6 +235,7 @@ CXXFLAGS="$CXXFLAGS -mthreads " AC_DEFINE(HAVE_PTHREAD,1,[Defined to POSIX threads for mingw]) #AC_MSG_NOTICE([pthreads now disabled under mingw]) + AC_CHECK_LIB([pthread],[pthread_create]) else m4_include([m4/ax_pthread.m4]) AX_PTHREAD([ @@ -285,41 +286,42 @@ ################################################################ # drawing support via cairo # +cairo=test +AC_ARG_ENABLE([cairo],[ --enable-cairo=false to disable libcairo even if present]) + if test "${enable_cairo}" = false ; then + cairo=false + fi -# Cairo requires these to be explicitly included on mingw (and perhaps others): -AC_CHECK_LIB([expat],[XML_ParserCreate]) -AC_CHECK_LIB([pixman-1],[pixman_region_init]) -AC_CHECK_LIB([bz2],[BZ2_bzDecompress]) -AC_CHECK_LIB([freetype],[FT_Init_FreeType]) # requires bz2 -AC_CHECK_LIB([fontconfig],[FcBlanksCreate]) # requires freetype expat - -AC_CHECK_HEADERS([cairo/cairo.h cairo/cairo-pdf.h]) -AC_CHECK_HEADERS([cairo.h cairo-pdf.h]) -AC_CHECK_LIB([cairo],[cairo_create], , [ - AC_MSG_WARN([ -*** cairo libraries not detected. -*** Please install cairo-devel to get 1-page PDF summary generation. 
-]) - Fmissing_library="cairo-devel $missing_library " - Umissing_library="libcairo2-dev $missing_library " - Mmissing_library="cairo-devel " -]) +if test $cairo = test ; then + # Cairo requires these to be explicitly included on mingw (and perhaps others): + AC_CHECK_LIB([expat],[XML_ParserCreate]) + AC_CHECK_LIB([pixman-1],[pixman_region_init]) + AC_CHECK_LIB([bz2],[BZ2_bzDecompress]) + AC_CHECK_LIB([freetype],[FT_Init_FreeType]) # requires bz2 + AC_CHECK_LIB([fontconfig],[FcBlanksCreate]) # requires freetype expat + + AC_CHECK_HEADERS([cairo/cairo.h cairo/cairo-pdf.h]) + AC_CHECK_HEADERS([cairo.h cairo-pdf.h]) + AC_CHECK_LIB([cairo],[cairo_create], , [ + AC_MSG_WARN([ + *** cairo libraries not detected. + *** Please install cairo-devel to get 1-page PDF summary generation. + ]) + Fmissing_library="cairo-devel $missing_library " + Umissing_library="libcairo2-dev $missing_library " + Mmissing_library="cairo-devel " + ]) +fi ################################################################ # pcap support. A bit more involved than normal due to the error message # -pcap=test -AC_ARG_ENABLE([pcap],[ --enable-pcap=false to disable libpcap even if present]) - if test "${enableval}" = false ; then - pcap=false - fi - -if test $pcap = test ; then - AC_CHECK_HEADERS(pcap.h pcap/pcap.h ) - if test x"$mingw" = x"yes" ; then +AC_CHECK_HEADERS(pcap.h pcap/pcap.h ) +if test x"$mingw" = x"yes" ; then AC_MSG_WARN([pcap not supported under mingw]) - else +else AC_CHECK_LIB(pcap, pcap_lookupdev, , [ + enable_pcap=no AC_MSG_WARN([ Can't find the pcap library (libpcap.a). tcpflow will not live capture or compile rules without pcap! @@ -340,9 +342,20 @@ Umissing_library="$Umissing_library libpcap-dev " Mmissing_library="$Mmissing_library libpcap " ]) - fi fi +dnl set with_wifi to 0 if you do not want it +AC_ARG_ENABLE([wifi], + AS_HELP_STRING([--disable-wifi], [Disable WIFI decoding]), + [], + [ + if test x"no" = x"$mingw"; then + AC_DEFINE(USE_WIFI, 1, [Use WIFI decompression]) + wifi="yes" + fi + ]) +AM_CONDITIONAL([WIFI_ENABLED], [test "yes" = "$wifi"]) + ################################################################ # Specify our other headers diff -Nru tcpflow-1.4.4+repack1/debian/changelog tcpflow-1.4.5+repack1/debian/changelog --- tcpflow-1.4.4+repack1/debian/changelog 2014-05-31 19:49:52.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/changelog 2015-08-26 05:00:36.000000000 +0000 @@ -1,3 +1,10 @@ +tcpflow (1.4.5+repack1-1) unstable; urgency=medium + + * New upstream release + * -b 0 now works (Closes: #638804) + + -- Dima Kogan Tue, 25 Aug 2015 20:55:52 -0700 + tcpflow (1.4.4+repack1-3) unstable; urgency=medium * Bug fix: "tcpflow with no -i doesn't work", thanks to diff -Nru tcpflow-1.4.4+repack1/debian/control tcpflow-1.4.5+repack1/debian/control --- tcpflow-1.4.4+repack1/debian/control 2014-05-31 19:49:23.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/control 2015-08-26 04:20:24.000000000 +0000 @@ -2,7 +2,7 @@ Section: net Priority: optional Maintainer: Dima Kogan -Standards-Version: 3.9.5 +Standards-Version: 3.9.6 Build-Depends: debhelper (>= 9), libpcap0.8-dev, zlib1g, diff -Nru tcpflow-1.4.4+repack1/debian/copyright tcpflow-1.4.5+repack1/debian/copyright --- tcpflow-1.4.4+repack1/debian/copyright 2014-05-31 19:15:59.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/copyright 2015-08-26 04:43:28.000000000 +0000 @@ -3,12 +3,7 @@ Upstream-Contact: Simson Garfinkel Upstream-Name: tcpflow -Files: * -Copyright: 1999-2003 Jeremy Elson - 2012-2013 Simson Garfinkel -License: GPL-3 - -Files: 
src/cpack.* +Files: src/wifipcap/cpack.* Copyright: 2003-2004 David Young License: BSD-3clause-Young @@ -44,7 +39,7 @@ 2009 Johannes Berg License: GPL-2 -Files: src/wifipcap.cpp +Files: src/wifipcap/wifipcap.cpp Copyright: 1998 Gerald Combs 2006-2007 Doug Madory 2008 Jeff Pang @@ -55,7 +50,7 @@ (GPL-3). Combination is GPL-3 -Files: src/wifipcap.h +Files: src/wifipcap/wifipcap.h Copyright: 2006-2007 Doug Madory 2008 Jeff Pang 2013 Simson Garfinkel @@ -71,21 +66,15 @@ added right after the 1.4.0 release of tcpflow -Files: src/uni/ethertype.h +Files: src/wifipcap/ethertype.h Copyright: 1993, 1994, 1996 The Regents of the University of California License: BSD-3clause-uni -Files: src/uni/extract.h +Files: src/wifipcap/extract.h Copyright: 1992, 1993, 1994, 1995, 1996 The Regents of the University of California License: BSD-3clause-uni -Files: src/uni/ieee802_11.h -Copyright: 2001 Fortress Technologies - Charlie Lenahan - 2013 Simson Garfinkel -License: BSD-3clause-uni - -Files: src/uni/llc.h +Files: src/wifipcap/llc.h Copyright: 1993, 1994, 1997 The Regents of the University of California License: BSD-3clause-uni @@ -107,18 +96,13 @@ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. -Files: src/uni/ieee802_11_radio.h +Files: src/wifipcap/ieee802_11_radio.h Copyright: 2003, 2004 David Young License: BSD-3clause-Young -Files: src/uni/oui.h src/wifipcap/oui.h -Copyright: 2006-2007 Doug Madory - 2008 Jeff Pang -License: BSD-2clause-MadoryPang - -Files: src/uni/packet-ieee80211.h -Copyright: 2000 Axis Communications AB -License: GPL-2+ +Files: src/wifipcap/oui.h +Copyright: 2006 Hannes Gredler +License: BSD-2clause Files: src/netviz/* Copyright: 2012-2013 Michael Shick @@ -238,6 +222,18 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +License: BSD-2clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that: (1) source code + distributions retain the above copyright notice and this paragraph + in its entirety, and (2) distributions including binary code include + the above copyright notice and this paragraph in its entirety in + the documentation or other materials provided with the distribution. + THIS SOFTWARE IS PROVIDED ``AS IS'' AND + WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT + LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE. + Files: src/be13_api/utf8/* src/be13_api/utf8.h Copyright: 2006 Nemanja Trifunovic @@ -273,41 +269,17 @@ License: public-domain Confirmed by email to the author -Files: src/http-parser/* -Copyright: Igor Sysoev - Joyent, Inc. and other Node contributors -License: Expat - http_parser.c is based on src/http/ngx_http_parse.c from NGINX copyright - Igor Sysoev. - . - Additional changes are licensed under the same terms as NGINX and - copyright Joyent, Inc. and other Node contributors. All rights reserved. - . - Permission is hereby granted, free of charge, to any person obtaining a - copy of this software and associated documentation files (the "Software"), - to deal in the Software without restriction, including without limitation - the rights to use, copy, modify, merge, publish, distribute, sublicense, - and/or sell copies of the Software, and to permit persons to whom the - Software is furnished to do so, subject to the following conditions: - . 
- The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. - . - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - Files: debian/* Copyright: 2012-2013 Dima Kogan 2000 Robert McQueen 2006-2010 Romain Francoise License: GPL-3 +Files: * +Copyright: 1999-2003 Jeremy Elson + 2012-2013 Simson Garfinkel +License: GPL-3 + License: GPL-3 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License (version 3) as published by the diff -Nru tcpflow-1.4.4+repack1/debian/patches/0001-tcpflow-b-0-now-works.patch tcpflow-1.4.5+repack1/debian/patches/0001-tcpflow-b-0-now-works.patch --- tcpflow-1.4.4+repack1/debian/patches/0001-tcpflow-b-0-now-works.patch 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/0001-tcpflow-b-0-now-works.patch 2015-08-26 04:07:11.000000000 +0000 @@ -0,0 +1,87 @@ +From 7c32381b6548e407fc3d0f3a63ccf1b6b12dadfd Mon Sep 17 00:00:00 2001 +From: Dima Kogan +Date: Thu, 5 Mar 2015 15:01:07 -0800 +Subject: [PATCH] 'tcpflow -b 0' now works +Forwarded: https://github.com/simsong/tcpflow/issues/95 + +This creates length-0 flow files that act as binary success/failure indicators +--- + src/tcpdemux.h | 4 ++-- + src/tcpip.cpp | 22 +++++++++++++--------- + 2 files changed, 15 insertions(+), 11 deletions(-) + +diff --git a/src/tcpdemux.h b/src/tcpdemux.h +index 858c50b..09c5970 100644 +--- a/src/tcpdemux.h ++++ b/src/tcpdemux.h +@@ -91,7 +91,7 @@ public: + options():console_output(false),console_output_nonewline(false), + store_output(true),opt_md5(false), + post_processing(false),gzip_decompress(true), +- max_bytes_per_flow(), ++ max_bytes_per_flow(-1), + max_flows(0),suppress_header(0), + output_strip_nonprint(true),output_hex(false),use_color(0), + output_packet_index(false),max_seek(MAX_SEEK) { +@@ -102,7 +102,7 @@ public: + bool opt_md5; // do we calculate MD5 on DFXML output? 
+ bool post_processing; // decode headers after tcp connection closes + bool gzip_decompress; +- uint64_t max_bytes_per_flow; ++ int64_t max_bytes_per_flow; + uint32_t max_flows; + bool suppress_header; + bool output_strip_nonprint; +diff --git a/src/tcpip.cpp b/src/tcpip.cpp +index 70d9ef5..754230b 100644 +--- a/src/tcpip.cpp ++++ b/src/tcpip.cpp +@@ -236,10 +236,12 @@ void tcpip::print_packet(const u_char *data, uint32_t length) + /* green, blue, read */ + const char *color[3] = { "\033[0;32m", "\033[0;34m", "\033[0;31m" }; + +- if(demux.opt.max_bytes_per_flow>0){ +- if(last_byte > demux.opt.max_bytes_per_flow) return; /* too much has been printed */ +- if(length > demux.opt.max_bytes_per_flow - last_byte){ +- length = demux.opt.max_bytes_per_flow - last_byte; /* can only output this much */ ++ if(demux.opt.max_bytes_per_flow>=0){ ++ uint64_t max_bytes_per_flow = (uint64_t)demux.opt.max_bytes_per_flow; ++ ++ if(last_byte > max_bytes_per_flow) return; /* too much has been printed */ ++ if(length > max_bytes_per_flow - last_byte){ ++ length = max_bytes_per_flow - last_byte; /* can only output this much */ + if(length==0) return; + } + } +@@ -419,13 +421,15 @@ void tcpip::store_packet(const u_char *data, uint32_t length, int32_t delta,stru + * but remember to seek out to the actual position after the truncated write... + */ + uint32_t wlength = length; // length to write +- if (demux.opt.max_bytes_per_flow){ +- if(offset >= demux.opt.max_bytes_per_flow){ ++ if (demux.opt.max_bytes_per_flow >= 0){ ++ uint64_t max_bytes_per_flow = (uint64_t)demux.opt.max_bytes_per_flow; ++ ++ if(offset >= max_bytes_per_flow){ + wlength = 0; + } +- if(offset < demux.opt.max_bytes_per_flow && offset+length > demux.opt.max_bytes_per_flow){ ++ if(offset < max_bytes_per_flow && offset+length > max_bytes_per_flow){ + DEBUG(2) ("packet truncated by max_bytes_per_flow on %s", flow_pathname.c_str()); +- wlength = demux.opt.max_bytes_per_flow - offset; ++ wlength = max_bytes_per_flow - offset; + } + } + +@@ -434,7 +438,7 @@ void tcpip::store_packet(const u_char *data, uint32_t length, int32_t delta,stru + * save the return value because open_tcpfile() puts the file pointer + * into the structure for us. 
+ */ +- if (fd < 0 && wlength>0) { ++ if (fd < 0) { + if (open_file()) { + DEBUG(1)("unable to open TCP file %s fd=%d wlength=%d", + flow_pathname.c_str(),fd,(int)wlength); +-- +2.1.4 + diff -Nru tcpflow-1.4.4+repack1/debian/patches/0001-Update-README-for-1.4.5.patch tcpflow-1.4.5+repack1/debian/patches/0001-Update-README-for-1.4.5.patch --- tcpflow-1.4.4+repack1/debian/patches/0001-Update-README-for-1.4.5.patch 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/0001-Update-README-for-1.4.5.patch 2015-08-26 04:13:32.000000000 +0000 @@ -0,0 +1,25 @@ +From 83ff311c1d9400f84999170506df8e454c2a49bb Mon Sep 17 00:00:00 2001 +From: Mike Dillon +Date: Mon, 24 Aug 2015 09:50:41 -0700 +Subject: [PATCH] Update README for 1.4.5 +Forwarded: https://github.com/simsong/tcpflow/pull/105 + +--- + README.md | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/README.md b/README.md +index 037935d..c64dafa 100644 +--- a/README.md ++++ b/README.md +@@ -1,5 +1,5 @@ +-TCPFLOW 1.3 +-=========== ++TCPFLOW 1.4.5 ++============= + Downloads directory: http://www.digitalcorpora.org/downloads/tcpflow/ + + +-- +2.1.4 + diff -Nru tcpflow-1.4.4+repack1/debian/patches/0001-using-the-debian-package-of-libhttp-parser-instead-o.patch tcpflow-1.4.5+repack1/debian/patches/0001-using-the-debian-package-of-libhttp-parser-instead-o.patch --- tcpflow-1.4.4+repack1/debian/patches/0001-using-the-debian-package-of-libhttp-parser-instead-o.patch 2014-05-31 19:49:23.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/0001-using-the-debian-package-of-libhttp-parser-instead-o.patch 2015-08-26 03:53:34.000000000 +0000 @@ -8,12 +8,12 @@ src/scan_http.cpp | 2 +- 2 files changed, 3 insertions(+), 9 deletions(-) -diff --git a/src/Makefile.am b/src/Makefile.am -index 67a6fa4..a80fc09 100644 ---- a/src/Makefile.am -+++ b/src/Makefile.am -@@ -75,18 +75,12 @@ tcpflow_SOURCES = \ - scan_wifiviz.cpp \ +Index: tcpflow/src/Makefile.am +=================================================================== +--- tcpflow.orig/src/Makefile.am ++++ tcpflow/src/Makefile.am +@@ -88,18 +88,12 @@ tcpflow_SOURCES = \ + scan_netviz.cpp \ pcap_writer.h \ iptree.h \ - http-parser/http_parser.c \ @@ -33,10 +33,10 @@ wifipcap/README.txt \ wifipcap/TimeVal.cpp \ wifipcap/TimeVal.h \ -diff --git a/src/scan_http.cpp b/src/scan_http.cpp -index 78abb98..2460694 100644 ---- a/src/scan_http.cpp -+++ b/src/scan_http.cpp +Index: tcpflow/src/scan_http.cpp +=================================================================== +--- tcpflow.orig/src/scan_http.cpp ++++ tcpflow/src/scan_http.cpp @@ -11,7 +11,7 @@ #include "tcpip.h" #include "tcpdemux.h" diff -Nru tcpflow-1.4.4+repack1/debian/patches/0002-added-configure-option-to-disable-libcairo.patch tcpflow-1.4.5+repack1/debian/patches/0002-added-configure-option-to-disable-libcairo.patch --- tcpflow-1.4.4+repack1/debian/patches/0002-added-configure-option-to-disable-libcairo.patch 2014-05-31 19:49:23.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/0002-added-configure-option-to-disable-libcairo.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,71 +0,0 @@ -From: Dima Kogan -Date: Sat, 24 May 2014 14:55:27 -0700 -Subject: added configure option to disable libcairo -Forwarded: yes. 
merged upstream at https://github.com/simsong/tcpflow/commit/dcd8a9422f7919de17eb44d0f6f256b919df113b - ---- - configure.ac | 43 +++++++++++++++++++++++++------------------ - 1 file changed, 25 insertions(+), 18 deletions(-) - -diff --git a/configure.ac b/configure.ac -index c6dab00..883cd9a 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -285,32 +285,39 @@ AC_LANG_POP() - ################################################################ - # drawing support via cairo - # -+cairo=test -+AC_ARG_ENABLE([cairo],[ --enable-cairo=false to disable libcairo even if present]) -+ if test "${enable_cairo}" = false ; then -+ cairo=false -+ fi - --# Cairo requires these to be explicitly included on mingw (and perhaps others): --AC_CHECK_LIB([expat],[XML_ParserCreate]) --AC_CHECK_LIB([pixman-1],[pixman_region_init]) --AC_CHECK_LIB([bz2],[BZ2_bzDecompress]) --AC_CHECK_LIB([freetype],[FT_Init_FreeType]) # requires bz2 --AC_CHECK_LIB([fontconfig],[FcBlanksCreate]) # requires freetype expat -+if test $cairo = test ; then -+ # Cairo requires these to be explicitly included on mingw (and perhaps others): -+ AC_CHECK_LIB([expat],[XML_ParserCreate]) -+ AC_CHECK_LIB([pixman-1],[pixman_region_init]) -+ AC_CHECK_LIB([bz2],[BZ2_bzDecompress]) -+ AC_CHECK_LIB([freetype],[FT_Init_FreeType]) # requires bz2 -+ AC_CHECK_LIB([fontconfig],[FcBlanksCreate]) # requires freetype expat - --AC_CHECK_HEADERS([cairo/cairo.h cairo/cairo-pdf.h]) --AC_CHECK_HEADERS([cairo.h cairo-pdf.h]) --AC_CHECK_LIB([cairo],[cairo_create], , [ -- AC_MSG_WARN([ --*** cairo libraries not detected. --*** Please install cairo-devel to get 1-page PDF summary generation. --]) -- Fmissing_library="cairo-devel $missing_library " -- Umissing_library="libcairo2-dev $missing_library " -- Mmissing_library="cairo-devel " --]) -+ AC_CHECK_HEADERS([cairo/cairo.h cairo/cairo-pdf.h]) -+ AC_CHECK_HEADERS([cairo.h cairo-pdf.h]) -+ AC_CHECK_LIB([cairo],[cairo_create], , [ -+ AC_MSG_WARN([ -+ *** cairo libraries not detected. -+ *** Please install cairo-devel to get 1-page PDF summary generation. -+ ]) -+ Fmissing_library="cairo-devel $missing_library " -+ Umissing_library="libcairo2-dev $missing_library " -+ Mmissing_library="cairo-devel " -+ ]) -+fi - - ################################################################ - # pcap support. A bit more involved than normal due to the error message - # - pcap=test - AC_ARG_ENABLE([pcap],[ --enable-pcap=false to disable libpcap even if present]) -- if test "${enableval}" = false ; then -+ if test "${enable_pcap}" = false ; then - pcap=false - fi - diff -Nru tcpflow-1.4.4+repack1/debian/patches/0003-fixed-handling-of-fputc.patch tcpflow-1.4.5+repack1/debian/patches/0003-fixed-handling-of-fputc.patch --- tcpflow-1.4.4+repack1/debian/patches/0003-fixed-handling-of-fputc.patch 2014-05-31 19:49:23.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/0003-fixed-handling-of-fputc.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -From: Simson Garfinkel -Date: Sun, 16 Feb 2014 11:24:01 -0500 -Subject: fixed handling of fputc -Forwarded: yes. 
merged upstream at https://github.com/simsong/tcpflow/commit/6cfe1a4905b8801084ecff00ae794388128501af - ---- - src/tcpip.cpp | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/src/tcpip.cpp b/src/tcpip.cpp -index 603c127..ecad602 100644 ---- a/src/tcpip.cpp -+++ b/src/tcpip.cpp -@@ -262,7 +262,13 @@ void tcpip::print_packet(const u_char *data, uint32_t length) - else if(demux.opt.output_strip_nonprint){ - for(const u_char *cc = data;cc -Date: Sun, 16 Feb 2014 13:48:41 -0500 -Subject: fixed handling of fputc -Forwarded: yes. merged upstream at https://github.com/simsong/tcpflow/commit/1aa7d8e5072175859b0a3c41264c84de14b25e55 ---- - src/tcpip.cpp | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/tcpip.cpp b/src/tcpip.cpp -index ecad602..86ad6df 100644 ---- a/src/tcpip.cpp -+++ b/src/tcpip.cpp -@@ -268,7 +268,7 @@ void tcpip::print_packet(const u_char *data, uint32_t length) - exit(1); - - } -- written += ret; -+ written += 1; - } - } - } diff -Nru tcpflow-1.4.4+repack1/debian/patches/0005-fixed-per-https-bugs.debian.org-cgi-bin-bugreport.cg.patch tcpflow-1.4.5+repack1/debian/patches/0005-fixed-per-https-bugs.debian.org-cgi-bin-bugreport.cg.patch --- tcpflow-1.4.4+repack1/debian/patches/0005-fixed-per-https-bugs.debian.org-cgi-bin-bugreport.cg.patch 2014-05-31 19:49:23.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/0005-fixed-per-https-bugs.debian.org-cgi-bin-bugreport.cg.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -From: Simson Garfinkel -Date: Fri, 30 May 2014 07:07:50 -0400 -Subject: fixed per https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=736417 -Forwarded: yes. merged upstream at https://github.com/simsong/tcpflow/commit/7df3db6874fb1a596c1e9673eabaf15be81e59be - ---- - src/tcpip.cpp | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/src/tcpip.cpp b/src/tcpip.cpp -index 86ad6df..8a37413 100644 ---- a/src/tcpip.cpp -+++ b/src/tcpip.cpp -@@ -268,8 +268,9 @@ void tcpip::print_packet(const u_char *data, uint32_t length) - exit(1); - - } -- written += 1; - } -+ written += 1; // treat even unprintable characters as "written". It -+ // really means "processed" - } - } - else { diff -Nru tcpflow-1.4.4+repack1/debian/patches/0006-fixed-so-that-it-properly-gets-default-device-if-no-.patch tcpflow-1.4.5+repack1/debian/patches/0006-fixed-so-that-it-properly-gets-default-device-if-no-.patch --- tcpflow-1.4.4+repack1/debian/patches/0006-fixed-so-that-it-properly-gets-default-device-if-no-.patch 2014-05-31 19:49:23.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/0006-fixed-so-that-it-properly-gets-default-device-if-no-.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -From: Simson Garfinkel -Date: Sun, 2 Feb 2014 16:18:45 -0500 -Subject: fixed so that it properly gets default device if no -i is given -Forwarded: yes. 
merged upstream at https://github.com/simsong/tcpflow/commit/9abeb43451fff5543efdac769d1efea52d4dd084 - ---- - src/tcpflow.cpp | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/src/tcpflow.cpp b/src/tcpflow.cpp -index 96bd38e..f5bd746 100644 ---- a/src/tcpflow.cpp -+++ b/src/tcpflow.cpp -@@ -25,9 +25,6 @@ - #include - #include - -- -- -- - /* bring in inet_ntop if it is not present */ - #define ETH_ALEN 6 - #ifndef HAVE_INET_NTOP -@@ -388,7 +385,7 @@ int main(int argc, char *argv[]) - #endif - - bool force_binary_output = false; -- const char *device = ""; -+ const char *device = 0; // default device - const char *lockname = 0; - int need_usage = 0; - std::string reportfilename; -@@ -649,7 +646,10 @@ int main(int argc, char *argv[]) - be13::plugin::get_scanner_feature_file_names(feature_file_names); - feature_recorder_set fs(0); - -- fs.init(feature_file_names,input_fname.size()>0 ? input_fname : device,demux.outdir); -+ const char *name = device; -+ if(input_fname.size()>0) name=input_fname.c_str(); -+ if(name==0) name=""; -+ fs.init(feature_file_names,name,demux.outdir); - the_fs = &fs; - demux.fs = &fs; - diff -Nru tcpflow-1.4.4+repack1/debian/patches/packing-struct-tcphdr-to-improve-portability.patch tcpflow-1.4.5+repack1/debian/patches/packing-struct-tcphdr-to-improve-portability.patch --- tcpflow-1.4.4+repack1/debian/patches/packing-struct-tcphdr-to-improve-portability.patch 2014-05-31 19:29:40.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/packing-struct-tcphdr-to-improve-portability.patch 2015-08-26 03:53:34.000000000 +0000 @@ -10,9 +10,9 @@ Index: tcpflow/src/be13_api/bulk_extractor_i.h =================================================================== ---- tcpflow.orig/src/be13_api/bulk_extractor_i.h 2014-01-12 23:57:16.202181700 -0800 -+++ tcpflow/src/be13_api/bulk_extractor_i.h 2014-01-12 23:57:45.030324635 -0800 -@@ -279,7 +279,7 @@ +--- tcpflow.orig/src/be13_api/bulk_extractor_i.h ++++ tcpflow/src/be13_api/bulk_extractor_i.h +@@ -278,7 +278,7 @@ namespace be13 { uint16_t th_win; /* window */ uint16_t th_sum; /* checksum */ uint16_t th_urp; /* urgent pointer */ diff -Nru tcpflow-1.4.4+repack1/debian/patches/series tcpflow-1.4.5+repack1/debian/patches/series --- tcpflow-1.4.4+repack1/debian/patches/series 2014-05-31 19:49:23.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/patches/series 2015-08-26 04:36:44.000000000 +0000 @@ -1,7 +1,4 @@ +0001-Update-README-for-1.4.5.patch +0001-tcpflow-b-0-now-works.patch 0001-using-the-debian-package-of-libhttp-parser-instead-o.patch -0002-added-configure-option-to-disable-libcairo.patch -0003-fixed-handling-of-fputc.patch -0004-fixed-handling-of-fputc.patch -0005-fixed-per-https-bugs.debian.org-cgi-bin-bugreport.cg.patch -0006-fixed-so-that-it-properly-gets-default-device-if-no-.patch packing-struct-tcphdr-to-improve-portability.patch diff -Nru tcpflow-1.4.4+repack1/debian/README.source tcpflow-1.4.5+repack1/debian/README.source --- tcpflow-1.4.4+repack1/debian/README.source 2014-05-31 19:15:59.000000000 +0000 +++ tcpflow-1.4.5+repack1/debian/README.source 2015-08-26 04:15:16.000000000 +0000 @@ -3,14 +3,6 @@ include these. One of the submodules (http-parser) exists in Debian as a separate package, so I'm using that, and repacking the 2 other submodules. -Additionally, the dfxml submodule in the tcpflow 1.4.4 source had some erroneous -out-of-tree symlinks. 
This was fixed by a later patch: - - https://github.com/simsong/dfxml/commit/4d54ca2db2eb06a4a3074111be5b784568d86bcb - -This patch converts symlinks into actual files, which is something dpkg-source -doesn't support. Thus this patch is included in the repacked source also - So in summary, this tarball contains - The full source of tcpflow @@ -19,12 +11,12 @@ The source tarball was made with something like this: - $ git clone https://github.com/simsong/tcpflow.git - $ cd tcpflow - $ git reset --hard tcpflow-1.4.4 - $ git submodule init - $ git submodule update - $ cd .. - $ tar cz --exclude '*/.git' --exclude '*/src/http-parser/*' tcpflow > tcpflow_1.4.4+repack1.orig.tar.gz + git clone https://github.com/simsong/tcpflow.git + cd tcpflow + git reset --hard tcpflow-1.4.4 + git submodule init + git submodule update + cd .. + tar cz --exclude '*/.git' --exclude '*/src/http-parser/*' tcpflow > tcpflow_1.4.4+repack1.orig.tar.gz - -- Dima Kogan , Thu, 9 Jan 2014 21:32:38 -0800 + -- Dima Kogan , Tue, 25 Aug 2015 21:15:16 -0700 Binary files /tmp/uVvMFr2Tmv/tcpflow-1.4.4+repack1/doc/tcpflow-logo.pdf and /tmp/ioQinB_t6t/tcpflow-1.4.5+repack1/doc/tcpflow-logo.pdf differ diff -Nru tcpflow-1.4.4+repack1/m4/slg_gcc_all_warnings.m4 tcpflow-1.4.5+repack1/m4/slg_gcc_all_warnings.m4 --- tcpflow-1.4.4+repack1/m4/slg_gcc_all_warnings.m4 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/m4/slg_gcc_all_warnings.m4 2015-08-26 03:34:50.000000000 +0000 @@ -0,0 +1,106 @@ +################################################################ +# +# Enable all the compiler debugging we can find +# Simson L. Garfinkel +# +# This is originally from PhotoRec, but modified substantially by Simson +# Figure out which flags we can use with the compiler. +# +# These I don't like: +# -Wdeclaration-after-statement -Wconversion +# doesn't work: -Wunreachable-code +# causes configure to crash on gcc-4.2.1: -Wsign-compare-Winline +# causes warnings with unistd.h: -Wnested-externs +# Just causes too much annoyance: -Wmissing-format-attribute + +# First, see if we are using CLANG +using_clang=no +if (g++ --version 2>&1 | grep clang > /dev/null) ; +then + AC_MSG_NOTICE([g++ is really clang++]) + using_clang=yes +fi +if test x$CXX == "xclang++" ; then + using_clang=yes +fi + + + +# Check GCC +C_WARNINGS_TO_TEST="-MD -Wpointer-arith -Wmissing-declarations -Wmissing-prototypes \ + -Wshadow -Wwrite-strings -Wcast-align -Waggregate-return \ + -Wbad-function-cast -Wcast-qual -Wundef -Wredundant-decls -Wdisabled-optimization \ + -Wfloat-equal -Wmultichar -Wc++-compat -Wmissing-noreturn " + +if test x"${mingw}" != "xyes" ; then + # add the warnings we do not want to do on mingw + C_WARNINGS_TO_TEST="$C_WARNINGS_TO_TEST -Wall -Wstrict-prototypes" +fi + +if test $using_clang == "no" ; then + # -Wstrict-null-sentinel is not supported under clang + CXX_WARNINGS_TO_TEST="$CXX_WARNINGS_TO_TEST -Wstrict-null-sentinel" +fi + + + +echo "C Warnings to test: $C_WARNINGS_TO_TEST" + +for option in $C_WARNINGS_TO_TEST +do + SAVE_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $option" + AC_MSG_CHECKING([whether gcc understands $option]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[]])], + [has_option=yes], + [has_option=no; CFLAGS="$SAVE_CFLAGS"]) + AC_MSG_RESULT($has_option) + unset has_option + unset SAVE_CFLAGS + if test $option = "-Wmissing-format-attribute" ; then + AC_DEFINE(HAVE_MISSING_FORMAT_ATTRIBUTE_WARNING,1, + [Indicates that we have the -Wmissing-format-attribute G++ warning]) + fi +done +unset option + + +# Check G++ +# We don't use these 
warnings: +# -Waggregate-return -- aggregate returns are GOOD; they simplify code design +# We can use these warnings after ZLIB gets upgraded: +# -Wundef --- causes problems with zlib +# -Wcast-qual +# -Wmissing-format-attribute --- Just too annoying +AC_LANG_PUSH(C++) +AC_CHECK_HEADERS([string]) +CXX_WARNINGS_TO_TEST="-Wall -MD -D_FORTIFY_SOURCE=2 -Wpointer-arith \ + -Wshadow -Wwrite-strings -Wcast-align \ + -Wredundant-decls -Wdisabled-optimization \ + -Wfloat-equal -Wmultichar -Wmissing-noreturn \ + -Woverloaded-virtual -Wsign-promo \ + -funit-at-a-time" + +if test x"${mingw}" != "xyes" ; then + # add the warnings we don't want to do on mingw + CXX_WARNINGS_TO_TEST="$CXX_WARNINGS_TO_TEST -Weffc++" +fi + +echo "C++ Warnings to test: $CXX_WARNINGS_TO_TEST" + +for option in $CXX_WARNINGS_TO_TEST +do + SAVE_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS $option" + AC_MSG_CHECKING([whether g++ understands $option]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[]])], + [has_option=yes], + [has_option=no; CXXFLAGS="$SAVE_CXXFLAGS"]) + AC_MSG_RESULT($has_option) + unset has_option + unset SAVE_CXXFLAGS +done +unset option +AC_LANG_POP() + + diff -Nru tcpflow-1.4.4+repack1/m4/slg_searchdirs.m4 tcpflow-1.4.5+repack1/m4/slg_searchdirs.m4 --- tcpflow-1.4.4+repack1/m4/slg_searchdirs.m4 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/m4/slg_searchdirs.m4 2015-08-26 03:34:50.000000000 +0000 @@ -0,0 +1,26 @@ +if test x"${mingw}" != "xyes" ; then + + case $host in + *mingw*) + AC_MSG_NOTICE([Compiling under mingw; will not search other directories.]) + ;; + *) + AC_MSG_NOTICE(Compiling under $host.) + # Bring additional directories where things might be found into our + # search path. I don't know why autoconf doesn't do this by default + for spfx in /usr/local /opt/local /sw /usr/local/ssl; do + AC_MSG_NOTICE([checking ${spfx}/include]) + if test -d ${spfx}/include; then + CPPFLAGS="-I${spfx}/include $CPPFLAGS" + LDFLAGS="-L${spfx}/lib $LDFLAGS" + AC_MSG_NOTICE([ *** ADDING ${spfx}/include to CPPFLAGS *** ]) + AC_MSG_NOTICE([ *** ADDING ${spfx}/lib to LDFLAGS *** ]) + fi + done + AC_MSG_NOTICE([ CPPFLAGS = ${CPPFLAGS} ]) + AC_MSG_NOTICE([ LDFLAGS = ${LDFLAGS} ]) + ;; + esac +fi + + diff -Nru tcpflow-1.4.4+repack1/Makefile.am tcpflow-1.4.5+repack1/Makefile.am --- tcpflow-1.4.4+repack1/Makefile.am 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/Makefile.am 2015-08-26 03:34:50.000000000 +0000 @@ -54,20 +54,25 @@ mingw32-configure make clean make + mv -f src/tcpflow.exe tcpflow32.exe tcpflow64.exe: mingw64-configure make clean make + mv -f src/tcpflow.exe tcpflow64.exe -FN=$(PACKAGE_TARNAME)-$(PACKAGE_VERSION) -winrelease: - /bin/rm -f tcpflow*.exe $(FN).zip +FN=$(PACKAGE_TARNAME)-$(PACKAGE_VERSION).zip +$(FN): tcpflow32.exe tcpflow64.exe + /bin/rm -f tcpflow*.exe $(FN) make tcpflow32.exe - mv -f src/tcpflow.exe tcpflow32.exe make tcpflow64.exe - mv -f src/tcpflow.exe tcpflow64.exe - zip $(FN).zip tcpflow*.exe + zip $(FN) tcpflow32.exe tcpflow64.exe + +winrelease: $(FN) + +pub: $(FN) + scp $(FN) dcorpora@digitalcorpora.org:downloads/tcpflow/ .PHONY: pull diff -Nru tcpflow-1.4.4+repack1/README tcpflow-1.4.5+repack1/README --- tcpflow-1.4.4+repack1/README 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/README 2015-08-26 03:35:28.000000000 +0000 @@ -7,9 +7,13 @@ --------- To compile for Linux -Be sure you have the necessary precursors: +Be sure you have the necessary precursors. 
For RedHat based distributions use following command to install them: # yum -y install git gcc-c++ automake autoconf boost-devel cairo-devel libpcap-devel zlib-devel + +If you are working on a Debian based distribution use this: + + # sudo apt-get install git gcc g++ automake autoconf libpcap-dev libboost-dev libssl-dev zlib1g-dev libcairo2-dev Download the release from http://digitalcorpora.org/downloads/tcpflow/. Compile and install with: @@ -21,7 +25,7 @@ git clone --recursive https://github.com/simsong/tcpflow.git cd tcpflow - sh bootstrap.sh + bash bootstrap.sh ./configure make sudo make install @@ -30,7 +34,7 @@ To download and compile for Amazon AMI: ssh ec2-user@ - sudo bash yum -y install git make gcc-c++ automake autoconf boost-devel cairo-devel libpcap-devel zlib-devel + sudo bash yum -y install git make gcc-c++ automake autoconf boost-devel cairo-devel libpcap-devel openssl-devel zlib-devel git clone --recursive https://github.com/simsong/tcpflow.git sh bootstrap.sh @@ -95,7 +99,7 @@ You can change the template that is used to create filenames with the -F and -T options. If a directory appears in the template the directory will be automatically created. -If you use the -a option, tcpflow will automatically interpert HTTP responses. +If you use the -a option, tcpflow will automatically interpret HTTP responses. If the output file is 208.111.153.175.00080-192.168.001.064.37314, diff -Nru tcpflow-1.4.4+repack1/README.md tcpflow-1.4.5+repack1/README.md --- tcpflow-1.4.4+repack1/README.md 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/README.md 2015-08-26 03:35:28.000000000 +0000 @@ -7,9 +7,13 @@ --------- To compile for Linux -Be sure you have the necessary precursors: +Be sure you have the necessary precursors. For RedHat based distributions use following command to install them: # yum -y install git gcc-c++ automake autoconf boost-devel cairo-devel libpcap-devel zlib-devel + +If you are working on a Debian based distribution use this: + + # sudo apt-get install git gcc g++ automake autoconf libpcap-dev libboost-dev libssl-dev zlib1g-dev libcairo2-dev Download the release from http://digitalcorpora.org/downloads/tcpflow/. Compile and install with: @@ -21,7 +25,7 @@ git clone --recursive https://github.com/simsong/tcpflow.git cd tcpflow - sh bootstrap.sh + bash bootstrap.sh ./configure make sudo make install @@ -30,7 +34,7 @@ To download and compile for Amazon AMI: ssh ec2-user@ - sudo bash yum -y install git make gcc-c++ automake autoconf boost-devel cairo-devel libpcap-devel zlib-devel + sudo bash yum -y install git make gcc-c++ automake autoconf boost-devel cairo-devel libpcap-devel openssl-devel zlib-devel git clone --recursive https://github.com/simsong/tcpflow.git sh bootstrap.sh @@ -95,7 +99,7 @@ You can change the template that is used to create filenames with the -F and -T options. If a directory appears in the template the directory will be automatically created. -If you use the -a option, tcpflow will automatically interpert HTTP responses. +If you use the -a option, tcpflow will automatically interpret HTTP responses. 
If the output file is 208.111.153.175.00080-192.168.001.064.37314, diff -Nru tcpflow-1.4.4+repack1/src/be13_api/aftimer.h tcpflow-1.4.5+repack1/src/be13_api/aftimer.h --- tcpflow-1.4.4+repack1/src/be13_api/aftimer.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/aftimer.h 2015-08-26 03:35:59.000000000 +0000 @@ -39,7 +39,7 @@ * http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 */ -#ifdef WIN32 +#if defined(WIN32) || defined(__MINGW32__) # include # include # ifndef DELTA_EPOCH_IN_MICROSECS diff -Nru tcpflow-1.4.4+repack1/src/be13_api/atomic_set_map.h tcpflow-1.4.5+repack1/src/be13_api/atomic_set_map.h --- tcpflow-1.4.4+repack1/src/be13_api/atomic_set_map.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/atomic_set_map.h 2015-08-26 03:35:59.000000000 +0000 @@ -47,7 +47,9 @@ public: atomic_histogram():amap(),M(){}; - typedef void (*dump_callback_t)(void *user,const TYPE &val,const CTYPE &count); + // The callback is used to report the histogram. + // The callback returns '0' if no error is encountered, '-1' if the dumping should stop + typedef int (*dump_callback_t)(void *user,const TYPE &val,const CTYPE &count); // add and return the count // http://www.cplusplus.com/reference/unordered_map/unordered_map/insert/ CTYPE add(const TYPE &val,const CTYPE &count){ @@ -64,7 +66,8 @@ void dump(void *user,dump_callback_t dump_cb) const{ cppmutex::lock lock(M); for(typename hmap_t::const_iterator it = amap.begin();it!=amap.end();it++){ - (*dump_cb)(user,(*it).first,(*it).second); + int ret = (*dump_cb)(user,(*it).first,(*it).second); + if(ret<0) return; } } struct ReportElement { @@ -81,7 +84,7 @@ }; typedef std::vector< const ReportElement *> element_vector_t; - void dump_sorted(void *user,dump_callback_t dump_cb) const{ + void dump_sorted(void *user,dump_callback_t dump_cb) const { /* Create a list of new elements, sort it, then report the sorted list */ element_vector_t evect; { @@ -92,8 +95,9 @@ } std::sort(evect.begin(),evect.end(),ReportElement::compare); for(typename element_vector_t::const_iterator it = evect.begin();it!=evect.end();it++){ - (*dump_cb)(user,(*it)->value,(*it)->tally); + int ret = (*dump_cb)(user,(*it)->value,(*it)->tally); delete *it; + if(ret<0) break; } } diff -Nru tcpflow-1.4.4+repack1/src/be13_api/be13_configure.m4 tcpflow-1.4.5+repack1/src/be13_api/be13_configure.m4 --- tcpflow-1.4.4+repack1/src/be13_api/be13_configure.m4 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/be13_configure.m4 2015-08-26 03:35:59.000000000 +0000 @@ -3,8 +3,11 @@ # AC_MSG_NOTICE([Including be13_configure.m4 from be13_api]) -AC_CHECK_HEADERS([err.h pwd.h sys/cdefs.h sys/mman.h sys/resource.h sys/utsname.h unistd.h ]) -AC_CHECK_FUNCS([ishexnumber isxdigit unistd.h mmap err errx warn warnx pread64 pread strptime _lseeki64 utimes ]) +AC_CHECK_HEADERS([err.h pwd.h sys/cdefs.h sys/mman.h sys/resource.h sys/utsname.h unistd.h sqlite3.h ]) +AC_CHECK_FUNCS([gmtime_r ishexnumber isxdigit localtime_r unistd.h mmap err errx warn warnx pread64 pread strptime _lseeki64 utimes ]) + +AC_CHECK_LIB([sqlite3],[sqlite3_libversion]) +AC_CHECK_FUNCS([sqlite3_create_function_v2]) AC_TRY_COMPILE([#pragma GCC diagnostic ignored "-Wredundant-decls"],[int a=3;], [AC_DEFINE(HAVE_DIAGNOSTIC_REDUNDANT_DECLS,1,[define 1 if GCC supports -Wredundant-decls])] @@ -21,7 +24,10 @@ # Figure out which version of unordered_map we are going to use # AC_LANG_PUSH(C++) + AC_MSG_NOTICE([checking for 
unordered_map]) + AC_MSG_NOTICE([ CXXFLAGS: $CXXFLAGS]) AC_CHECK_HEADERS([unordered_map unordered_set],[],[ AC_CHECK_HEADERS([tr1/unordered_map tr1/unordered_set])]) + AC_MSG_NOTICE([done]) AC_LANG_POP() diff -Nru tcpflow-1.4.4+repack1/src/be13_api/beregex.cpp tcpflow-1.4.5+repack1/src/be13_api/beregex.cpp --- tcpflow-1.4.4+repack1/src/be13_api/beregex.cpp 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/beregex.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -94,9 +94,9 @@ if(r!=0) return 0; /* some kind of failure */ /* Make copies of the first group */ if(pmatch[1].rm_so != pmatch[1].rm_eo){ - if(found) *found = line.substr(pmatch[1].rm_so,pmatch[1].rm_eo-pmatch[1].rm_so); + if(found) *found = line.substr(pmatch[1].rm_so,pmatch[1].rm_eo-pmatch[1].rm_so); if(offset) *offset = pmatch[1].rm_so; - if(len) *len = pmatch[1].rm_eo-pmatch[1].rm_so; + if(len) *len = pmatch[1].rm_eo-pmatch[1].rm_so; } return 1; /* success */ } @@ -110,7 +110,7 @@ for(int i=0;i #include #include diff -Nru tcpflow-1.4.4+repack1/src/be13_api/bulk_extractor_i.h tcpflow-1.4.5+repack1/src/be13_api/bulk_extractor_i.h --- tcpflow-1.4.4+repack1/src/be13_api/bulk_extractor_i.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/bulk_extractor_i.h 2015-08-26 03:35:59.000000000 +0000 @@ -28,7 +28,13 @@ #include -#ifdef WIN32 +#if defined(MINGW) || defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#ifndef WIN32 +#define WIN32 +#endif +#endif + +#if defined(WIN32) || defined(__MINGW32__) # include # include # include @@ -76,19 +82,12 @@ #include "sbuf.h" #include "utf8.h" +#include "utils.h" // for gmtime_r #include #include #include -namespace be13 { - struct hash_def { - hash_def():name(),func(){}; - std::string name; // v3: (input) function to perform hashing with - std::string (*func)(const uint8_t *buf,size_t bufsize); // v3: (input) function to perform hashing with - }; -}; - #include "feature_recorder.h" #include "feature_recorder_set.h" @@ -625,7 +624,7 @@ static const int SCANNER_RECURSE_EXPAND = 0x020; // v3: recurses AND result is >= original size static const int SCANNER_WANTS_NGRAMS = 0x040; // v3: Scanner gets buffers that are constant n-grams static const int SCANNER_FAST_FIND = 0x080; // v3: This scanner is a very fast FIND scanner - static const int SCANNER_DEPTH_0 = 0x100; // v3: scanner only runs at detph 0 by default + static const int SCANNER_DEPTH_0 = 0x100; // v3: scanner only runs at depth 0 by default static const int CURRENT_SI_VERSION = 4; static const std::string flag_to_string(const int flag){ @@ -645,11 +644,10 @@ * Scanner histograms are added to 'histograms' by machinery. */ struct scanner_config { - scanner_config():namevals(),debug(),hasher() /* ,histograms() */{}; + scanner_config():namevals(),debug(){}; virtual ~scanner_config(){} config_t namevals; // v3: (input) name=val map int debug; // v3: (input) current debug level - struct be13::hash_def hasher; // v3: (input) hasher to use }; // never change the order or delete old fields, or else you will @@ -774,15 +772,15 @@ *** INSTANCE VARIABLES *** **************************/ - const int sp_version; /* version number of this structure */ - const phase_t phase; /* v1: 0=startup, 1=normal, 2=shutdown (changed to phase_t in v1.3) */ - const sbuf_t &sbuf; /* v1: what to scan / only valid in SCAN_PHASE */ - class feature_recorder_set &fs; /* v1: where to put the results / only valid in SCAN_PHASE */ - const uint32_t depth; /* v1: how far down are we? 
/ only valid in SCAN_PHASE */ - - PrintOptions &print_options; /* v1: how to print / NOT USED IN SCANNERS */ - scanner_info *info; /* v2: set/get parameters on startup */ - std::stringstream *sxml; /* v3: on scanning and shutdown: CDATA added to XML stream (advanced feature) */ + const int sp_version; /* version number of this structure */ + const phase_t phase; /* v1: 0=startup, 1=normal, 2=shutdown (changed to phase_t in v1.3) */ + const sbuf_t &sbuf; /* v1: what to scan / only valid in SCAN_PHASE */ + class feature_recorder_set &fs; /* v1: where to put the results / only valid in SCAN_PHASE */ + const uint32_t depth; /* v1: how far down are we? / only valid in SCAN_PHASE */ + + PrintOptions &print_options; /* v1: how to print / NOT USED IN SCANNERS */ + scanner_info *info; /* v2: set/get parameters on startup, hasher */ + std::stringstream *sxml; /* v3: on scanning and shutdown: CDATA added to XML stream (advanced feature) */ }; @@ -885,6 +883,17 @@ return utf8_line; } +inline std::wstring safe_utf8to16(std::string s){ // needs to be cleaned up + std::wstring utf16_line; + try { + utf8::utf8to16(s.begin(),s.end(),back_inserter(utf16_line)); + } catch(utf8::invalid_utf8){ + /* Exception thrown: bad UTF16 encoding */ + utf16_line = L""; + } + return utf16_line; +} + // truncate string at the matching char inline void truncate_at(std::string &line, char ch) { size_t pos = line.find(ch); @@ -909,10 +918,6 @@ * gmtime_r() is Linux-specific. You'll find a copy in util.cpp for Windows. */ -#ifndef HAVE_GMTIME_R -void gmtime_r(time_t *t,struct tm *tm); -#endif - inline std::string microsoftDateToISODate(const uint64_t &time) { time_t tmp = (time / ONE_HUNDRED_NANO_SEC_TO_SECONDS) - SECONDS_BETWEEN_WIN32_EPOCH_AND_UNIX_EPOCH; @@ -924,12 +929,24 @@ return std::string(buf); } +/* Convert Unix timestamp to ISO format */ +inline std::string unixTimeToISODate(const uint64_t &t) +{ + struct tm time_tm; + time_t tmp=t; + gmtime_r(&tmp, &time_tm); + char buf[256]; + strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &time_tm); // Zulu time + return std::string(buf); +} + /* Many internal windows and Linux structures require a valid printable name in ASCII */ inline bool validASCIIName(const std::string &name) { for(size_t i = 0; i< name.size(); i++){ if(((u_char)name[i]) & 0x80) return false; // high bit should not be set if(((u_char)name[i]) < ' ') return false; // should not be control character + if(((u_char)name[i]) == 0x7f) return false; // DEL is not printable } return true; } diff -Nru tcpflow-1.4.4+repack1/src/be13_api/feature_recorder.cpp tcpflow-1.4.5+repack1/src/be13_api/feature_recorder.cpp --- tcpflow-1.4.4+repack1/src/be13_api/feature_recorder.cpp 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/feature_recorder.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -3,7 +3,6 @@ #include "config.h" #include "bulk_extractor_i.h" #include "unicode_escape.h" -#include "beregex.h" #include "histogram.h" #include @@ -37,39 +36,6 @@ uint32_t feature_recorder::debug=0; -void feature_recorder::banner_stamp(std::ostream &os,const std::string &header) -{ - int banner_lines = 0; - if(banner_file!=""){ - std::ifstream i(banner_file.c_str()); - if(i.is_open()){ - std::string line; - while(getline(i,line)){ - if(line.size()>0 && ((*line.end()=='\r') || (*line.end()=='\n'))){ - line.erase(line.end()); /* remove the last character while it is a \n or \r */ - } - os << "# " << line << "\n"; - banner_lines++; - } - i.close(); - } - } - if(banner_lines==0){ - os << "# BANNER FILE NOT PROVIDED (-b 
option)\n"; - } - - os << bulk_extractor_version_header; - os << "# Feature-Recorder: " << name << "\n"; - if(input_fname.size()) os << "# Filename: " << input_fname << "\n"; - if(debug!=0){ - os << "# DEBUG: " << debug << " ("; - if(debug & DEBUG_PEDANTIC) os << " DEBUG_PEDANTIC "; - os << ")\n"; - } - os << header; -} - - /** * Create a feature recorder object. Each recorder records a certain * kind of feature. Features are stored in a file. The filename is @@ -78,23 +44,23 @@ * and thus a different feature recorder, to avoid locking * problems. * - * @param outdir_ - where the feature file is written - * @param input_fname_ - the file (disk image) that these features were extracted from. - * - We should probably have a callback function to annotate the feature file. + * @param feature_recorder_set &fs - common information for all of the feature recorders * @param name - the name of the feature being recorded. */ feature_recorder::feature_recorder(class feature_recorder_set &fs_, - const std::string &outdir_,const std::string &input_fname_,const std::string &name_): + const std::string &name_): flags(0), - outdir(outdir_),input_fname(input_fname_),name(name_),ignore_encoding(),ios(), + name(name_),ignore_encoding(),ios(),bs(), histogram_defs(), fs(fs_), count_(0),context_window_before(context_window_default),context_window_after(context_window_default), - Mf(),Mr(),mhistogram(), + Mf(),Mr(),mhistograms(),mhistogram_limit(), stop_list_recorder(0), file_number_(0),carve_mode(CARVE_ENCODED) { + //std::cerr << "feature_recorder(" << name << ") created\n"; + open(); // open if we are created } /* Don't have to delete the stop_list_recorder because it is in the @@ -107,47 +73,100 @@ } } +void feature_recorder::banner_stamp(std::ostream &os,const std::string &header) const +{ + int banner_lines = 0; + if(banner_file.size()>0){ + std::ifstream i(banner_file.c_str()); + if(i.is_open()){ + std::string line; + while(getline(i,line)){ + if(line.size()>0 && ((*line.end()=='\r') || (*line.end()=='\n'))){ + line.erase(line.end()); /* remove the last character while it is a \n or \r */ + } + os << "# " << line << "\n"; + banner_lines++; + } + i.close(); + } + } + if(banner_lines==0){ + os << "# BANNER FILE NOT PROVIDED (-b option)\n"; + } + + os << bulk_extractor_version_header; + os << "# Feature-Recorder: " << name << "\n"; + if(fs.get_input_fname().size()) os << "# Filename: " << fs.get_input_fname() << "\n"; + if(debug!=0){ + os << "# DEBUG: " << debug << " ("; + if(debug & DEBUG_PEDANTIC) os << " DEBUG_PEDANTIC "; + os << ")\n"; + } + os << header; +} + + /** * Return the filename with a counter */ std::string feature_recorder::fname_counter(std::string suffix) const { - return outdir + "/" + this->name + (suffix.size()>0 ? (std::string("_") + suffix) : "") + ".txt"; + return fs.get_outdir() + "/" + this->name + (suffix.size()>0 ? (std::string("_") + suffix) : "") + ".txt"; } +const std::string &feature_recorder::get_outdir() const +{ + return fs.get_outdir(); +} + /** * open a feature recorder file in the specified output directory. + * Called by create_name(). Not clear why it isn't called when created. 
*/ void feature_recorder::open() { - std::string fname = fname_counter(""); - ios.open(fname.c_str(),std::ios_base::in|std::ios_base::out|std::ios_base::ate); - if(ios.is_open()){ // opened existing stream - ios.seekg(0L,std::ios_base::end); - while(ios.is_open()){ - /* Get current position */ - if(int(ios.tellg())==0){ // at beginning of file; stamp and return - ios.seekp(0L,std::ios_base::beg); // be sure we are at the beginning of the file - return; - } - ios.seekg(-1,std::ios_base::cur); // backup to once less than the end of the file - if (ios.peek()=='\n'){ // we are finally on the \n - ios.seekg(1L,std::ios_base::cur); // move the getting one forward - ios.seekp(ios.tellg(),std::ios_base::beg); // put the putter at the getter location - count_ = 1; // greater than zero - return; + if (fs.flag_set(feature_recorder_set::SET_DISABLED)) return; // feature recorder set is disabled + + /* write to a database? Create tables if necessary and create a prepared statement */ + if (fs.flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS)) { + char buf[1024]; + fs.db_create_table(name); + snprintf(buf,sizeof(buf),db_insert_stmt,name.c_str()); + bs = new besql_stmt(fs.db3,buf); + } + + /* Write to a file? Open the file and seek to the last line if it exist, otherwise just open database */ + if (fs.flag_notset(feature_recorder_set::DISABLE_FILE_RECORDERS)){ + /* Open the file recorder */ + std::string fname = fname_counter(""); + ios.open(fname.c_str(),std::ios_base::in|std::ios_base::out|std::ios_base::ate); + if(ios.is_open()){ // opened existing stream + ios.seekg(0L,std::ios_base::end); + while(ios.is_open()){ + /* Get current position */ + if(int(ios.tellg())==0){ // at beginning of file; stamp and return + ios.seekp(0L,std::ios_base::beg); // be sure we are at the beginning of the file + return; + } + ios.seekg(-1,std::ios_base::cur); // backup to once less than the end of the file + if (ios.peek()=='\n'){ // we are finally on the \n + ios.seekg(1L,std::ios_base::cur); // move the getting one forward + ios.seekp(ios.tellg(),std::ios_base::beg); // put the putter at the getter location + count_ = 1; // greater than zero + return; + } } } - } - // Just open the stream for output - ios.open(fname.c_str(),std::ios_base::out); - if(!ios.is_open()){ - std::cerr << "*** feature_recorder::open CANNOT OPEN FEATURE FILE FOR WRITING " - << fname << ":" << strerror(errno) << "\n"; - exit(1); + // Just open the stream for output + ios.open(fname.c_str(),std::ios_base::out); + if(!ios.is_open()){ + std::cerr << "*** feature_recorder::open CANNOT OPEN FEATURE FILE FOR WRITING " + << fname << ":" << strerror(errno) << "\n"; + exit(1); + } } } @@ -177,9 +196,24 @@ static inline int hexval(char ch) { - if(ch>='0' && ch<='9') return ch-'0'; - if(ch>='a' && ch<='f') return ch-'a'+10; - if(ch>='A' && ch<='F') return ch-'a'+10; + switch (ch) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'a': case 'A': return 10; + case 'b': case 'B': return 11; + case 'c': case 'C': return 12; + case 'd': case 'D': return 13; + case 'e': case 'E': return 14; + case 'f': case 'F': return 15; + } return 0; } @@ -202,7 +236,7 @@ } /* Look for hex coding */ if(i+3= limit) return -1; + return 0; + } + static int callback(void *ptr,const std::string &str,const uint64_t &tally) { + return ((mhistogram_callback *)(ptr))->do_callback(str,tally); } }; -void 
feature_recorder::dump_histogram(const class histogram_def &def,void *user,feature_recorder::dump_callback_t cb) -{ - if(mhistogram){ - assert(cb!=0); - mhistogram_callback mcbo(user,cb,*this); - mhistogram->dump_sorted(static_cast(&mcbo),mhistogram_callback::callback); - return; - } +/**************************************************************** + *** PHASE HISTOGRAM (formerly phase 3): Create the histograms + ****************************************************************/ - beregex reg(def.pattern,REG_EXTENDED); - std::string ifname = fname_counter(""); // source of features +/** + * We now have three kinds of histograms: + * 1 - Traditional post-processing histograms specified by the histogram library + * 1a - feature-file based traditional ones + * 1b - SQL-based traditional ones. + * 2 - In-memory histograms (used primarily by beapi) + */ + +/** Dump a specific histogram */ +void feature_recorder::dump_histogram_file(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const +{ + /* This is a file based histogram. We will be reading from one file and writing to another */ + std::string ifname = fname_counter(""); // source of features std::ifstream f(ifname.c_str()); if(!f.is_open()){ std::cerr << "Cannot open histogram input file: " << ifname << "\n"; @@ -323,7 +379,7 @@ /** If there is a pattern to use to prune down the feature, use it */ if(def.pattern.size()){ std::string new_feature = feature; - if(!reg.search(feature,&new_feature,0,0)){ + if(!def.reg.search(feature,&new_feature,0,0)){ // no search match; avoid this feature continue; } @@ -342,14 +398,14 @@ << name << "\n"; } - /* Output what we have */ + /* Output what we have to a new file ofname */ std::stringstream real_suffix; real_suffix << def.suffix; if(histogram_counter>0) real_suffix << histogram_counter; std::string ofname = fname_counter(real_suffix.str()); // histogram name std::ofstream o; - o.open(ofname.c_str()); + o.open(ofname.c_str()); // open the file if(!o.is_open()){ std::cerr << "Cannot open histogram output file: " << ofname << "\n"; return; @@ -361,6 +417,9 @@ o << *fr; // sends the entire histogram } + for(size_t i = 0;isize();i++){ + delete fr->at(i); + } delete fr; o.close(); @@ -373,39 +432,52 @@ } -void feature_recorder::add_histogram(const histogram_def &def) +void feature_recorder::dump_histogram(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const { - histogram_defs.insert(def); + /* Inform that we are dumping this histogram */ + if(cb) cb(user,*this,def,"",0); + + /* If this is a memory histogram, dump it and return */ + mhistograms_t::const_iterator it = mhistograms.find(def); + if(it!=mhistograms.end()){ + assert(cb!=0); + mhistogram_callback mcbo(user,cb,def,*this,mhistogram_limit); + it->second->dump_sorted(static_cast(&mcbo),mhistogram_callback::callback); + return; + } + + if (fs.flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS)) { + dump_histogram_db(def,user,cb); + } + + + if (fs.flag_notset(feature_recorder_set::DISABLE_FILE_RECORDERS)) { + dump_histogram_file(def,user,cb); + } } -/* Dump all of our histograms */ + +/* Dump all of this feature recorders histograms */ void feature_recorder::dump_histograms(void *user,feature_recorder::dump_callback_t cb, - feature_recorder_set::xml_notifier_t xml_error_notifier) + feature_recorder_set::xml_notifier_t xml_error_notifier) const { - /* See if we have an in-memory histograms */ - if(flag_set(feature_recorder::FLAG_MEM_HISTOGRAM)){ - std::cerr << "***************** " << name << " has 
a memory histogram\n"; - histogram_def d("","","",0); // empty - dump_histogram(d,user,cb); - } - - /* Loop through all the histograms */ + /* If we are recording features to SQL and we have a histogram defintion + * for this feature recorder, we need to create a base histogram first, + * then we can create the extracted histograms if they are presented. + */ + + + /* Loop through all the histograms and dump each one. + * This now works for both memory histograms and non-memory histograms. + */ for(histogram_defs_t::const_iterator it = histogram_defs.begin();it!=histogram_defs.end();it++){ - std::cout << std::string(" ") << name << " " << (*it).suffix + "...\n"; - std::cout.flush(); try { - if(flag_set(feature_recorder::FLAG_MEM_HISTOGRAM)){ - std::cerr << name << " cannot have both a regular histogram and a memory histogram\n"; - } else { - dump_histogram((*it),user,cb); - } + dump_histogram((*it),user,cb); } catch (const std::exception &e) { - std::cerr << "ERROR: " ; - std::cerr.flush(); - std::cerr << e.what() << " computing histogram " << name << "\n"; + std::cerr << "ERROR: histogram " << name << ": " << e.what() << "\n"; if(xml_error_notifier){ std::string error = std::string("0)) ss << '\t' << context; - this->write(ss.str()); + if ( fs.flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS ) && + this->flag_notset(feature_recorder::FLAG_NO_FEATURES_SQL) ) { + db_write0( pos0, feature, context); + } + if ( fs.flag_notset(feature_recorder_set::DISABLE_FILE_RECORDERS )) { + std::stringstream ss; + ss << pos0.shift( feature_recorder::offset_add).str() << '\t' << feature; + if (flag_notset( FLAG_NO_CONTEXT ) && ( context.size()>0 )) ss << '\t' << context; + this->write( ss.str() ); + } } @@ -484,20 +578,8 @@ * processes the stop list */ -void feature_recorder::write(const pos0_t &pos0,const std::string &feature_,const std::string &context_) +void feature_recorder::quote_if_necessary(std::string &feature,std::string &context) { - if(flags & FLAG_DISABLED) return; // disabled - if(debug & DEBUG_PEDANTIC){ - if(feature_.size() > opt_max_feature_size){ - std::cerr << "feature_recorder::write : feature_.size()=" << feature_.size() << "\n"; - assert(0); - } - if(context_.size() > opt_max_context_size){ - std::cerr << "feature_recorder::write : context_.size()=" << context_.size() << "\n"; - assert(0); - } - } - /* By default quote string that is not UTF-8, and quote backslashes. */ bool escape_bad_utf8 = true; bool escape_backslash = true; @@ -512,15 +594,38 @@ escape_backslash = false; } - std::string feature = validateOrEscapeUTF8(feature_, escape_bad_utf8,escape_backslash); - - std::string context; + feature = validateOrEscapeUTF8(feature, escape_bad_utf8,escape_backslash); + if(feature.size() > opt_max_feature_size) feature.resize(opt_max_feature_size); if(flag_notset(FLAG_NO_CONTEXT)){ - context = validateOrEscapeUTF8(context_,escape_bad_utf8,escape_backslash); + context = validateOrEscapeUTF8(context,escape_bad_utf8,escape_backslash); + if(context.size() > opt_max_context_size) context.resize(opt_max_context_size); } +} + +/** + * write() is the main entry point for writing a feature at a given position with context. + * write() checks the stoplist and escapes non-UTF8 characters, then calls write0(). 
+ */ +void feature_recorder::write(const pos0_t &pos0,const std::string &feature_,const std::string &context_) +{ + if(flags & FLAG_DISABLED) return; // disabled + if(debug & DEBUG_PEDANTIC){ + if(feature_.size() > opt_max_feature_size){ + std::cerr << "feature_recorder::write : feature_.size()=" << feature_.size() << "\n"; + assert(0); + } + if(context_.size() > opt_max_context_size){ + std::cerr << "feature_recorder::write : context_.size()=" << context_.size() << "\n"; + assert(0); + } + } + + std::string feature = feature_; + std::string context = flag_set(FLAG_NO_CONTEXT) ? "" : context_; + std::string *feature_utf8 = HistogramMaker::make_utf8(feature); // a utf8 feature + + quote_if_necessary(feature,context); - if(feature.size() > opt_max_feature_size) feature.resize(opt_max_feature_size); - if(context.size() > opt_max_context_size) context.resize(opt_max_context_size); if(feature.size()==0){ std::cerr << "zero length feature at " << pos0 << "\n"; if(debug & DEBUG_PEDANTIC) assert(0); @@ -546,8 +651,9 @@ */ if(flag_notset(FLAG_NO_STOPLIST) && stop_list_recorder){ if(fs.stop_list - && fs.stop_list->check_feature_context(feature,context)){ + && fs.stop_list->check_feature_context(*feature_utf8,context)){ stop_list_recorder->write(pos0,feature,context); + delete feature_utf8; return; } } @@ -557,10 +663,9 @@ */ if(flag_notset(FLAG_NO_ALERTLIST) && fs.alert_list - && fs.alert_list->check_feature_context(feature,context)){ - std::string alert_fn = outdir + "/ALERTS_found.txt"; - - cppmutex::lock lock(Mr); // notce we are locking the redlist + && fs.alert_list->check_feature_context(*feature_utf8,context)){ + std::string alert_fn = fs.get_outdir() + "/ALERTS_found.txt"; + cppmutex::lock lock(Mr); // notice we are locking the alert list std::ofstream rf(alert_fn.c_str(),std::ios_base::app); if(rf.is_open()){ rf << pos0.shift(feature_recorder::offset_add).str() << '\t' << feature << '\t' << "\n"; @@ -568,14 +673,27 @@ } /* Support in-memory histograms */ - if(mhistogram){ - mhistogram->add(feature,1); + for(mhistograms_t::iterator it = mhistograms.begin(); it!=mhistograms.end();it++){ + const histogram_def &def = it->first; + mhistogram_t *m = it->second; + std::string new_feature = *feature_utf8; + if(def.require.size()==0 || new_feature.find_first_of(def.require)!=std::string::npos){ + /* If there is a pattern to use, use it */ + if(def.pattern.size()){ + if(!def.reg.search(new_feature,&new_feature,0,0)){ + // no search match; avoid this feature + new_feature = ""; + } + } + if(new_feature.size()) m->add(new_feature,1); + } } /* Finally write out the feature and the context */ if(flag_notset(FLAG_NO_FEATURES)){ this->write0(pos0,feature,context); } + delete feature_utf8; } /** @@ -653,6 +771,7 @@ **************************************************************** * * Carving support. + * 2014-04-24 - $ is no longer valid either * 2013-08-29 - replace invalid characters in filenames * 2013-07-30 - automatically bin directories * 2013-06-08 - filenames are the forensic path. @@ -667,7 +786,7 @@ || ch=='"' || ch=='*' || ch=='+' || ch==',' || ch=='/' || ch==':' || ch==';' || ch=='<' || ch=='=' || ch=='>' || ch=='?' 
|| ch=='\\' - || ch=='[' || ch==']' || ch=='|'){ + || ch=='[' || ch==']' || ch=='|' || ch=='$' ){ out.push_back('_'); } else { out.push_back(ch); @@ -677,17 +796,22 @@ } +//const feature_recorder::hash_def &feature_recorder::hasher() +//{ +// return fs.hasher; +//} + + + #include /** * @param sbuf - the buffer to carve * @param pos - offset in the buffer to carve * @param len - how many bytes to carve - * @param hasher - to compute the hash of the carved object. * */ std::string feature_recorder::carve(const sbuf_t &sbuf,size_t pos,size_t len, - const std::string &ext, - const be13::hash_def &hasher) + const std::string &ext) { if(flags & FLAG_DISABLED) return std::string(); // disabled @@ -695,11 +819,7 @@ if(pos >= sbuf.pagesize && pos < sbuf.bufsize){ return std::string(); } - - if(pos >= sbuf.bufsize){ /* Sanity checks */ - std::cerr << "*** carve: WRITE OUTSIDE BUFFER. pos=" << pos << " sbuf=" << sbuf << "\n"; - return std::string(); - } + assert(pos < sbuf.bufsize); /* Carve to a file depending on the carving mode. The purpose * of CARVE_ENCODED is to allow us to carve JPEGs when they are @@ -734,21 +854,22 @@ */ uint64_t this_file_number = file_number_add(1); - std::string dirname1 = outdir + "/" + name; + std::string dirname1 = fs.get_outdir() + "/" + name; std::stringstream ss; ss << dirname1 << "/" << std::setw(3) << std::setfill('0') << (this_file_number / 1000); std::string dirname2 = ss.str(); - std::string fname = dirname2 + std::string("/") + valid_dosname(sbuf.pos0.str() + ext); - std::string carved_hash_hexvalue = (*hasher.func)(sbuf.buf,sbuf.bufsize); + std::string fname = dirname2 + std::string("/") + valid_dosname(sbuf.pos0.str() + ext); + std::string fname_feature = fname.substr(fs.get_outdir().size()+1); + std::string carved_hash_hexvalue = (*fs.hasher.func)(sbuf.buf,sbuf.bufsize); /* Record what was found in the feature file. */ ss.str(std::string()); // clear the stringstream ss << "" << fname << "" << len << "" - << "" << carved_hash_hexvalue << ""; - this->write(sbuf.pos0+len,fname,ss.str()); + << "" << carved_hash_hexvalue << ""; + this->write(sbuf.pos0+len,fname_feature,ss.str()); /* Make the directory if it doesn't exist. */ if (access(dirname2.c_str(),R_OK)!=0){ diff -Nru tcpflow-1.4.4+repack1/src/be13_api/feature_recorder.h tcpflow-1.4.5+repack1/src/be13_api/feature_recorder.h --- tcpflow-1.4.4+repack1/src/be13_api/feature_recorder.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/feature_recorder.h 2015-08-26 03:35:59.000000000 +0000 @@ -37,18 +37,37 @@ #include #include #include +#include #include - #include +#ifdef HAVE_SQLITE3_H +#include +#ifndef BEAPI_SQLITE +# define BEAPI_SQLITE3 sqlite3 +# define BEAPI_SQLITE3_STMT sqlite3_stmt +#endif +#endif + +#ifndef BEAPI_SQLITE3 +#define BEAPI_SQLITE3 void +#define BEAPI_SQLITE3_STMT void +#endif + + #include "cppmutex.h" #include "dfxml/src/dfxml_writer.h" #include "dfxml/src/hash_t.h" #include "atomic_set_map.h" +#include "beregex.h" -/* histogram_def should be within the feature_recorder_set class. Oh well. */ -class histogram_def { - public: +/** + * histogram_def defines the histograms that will be made by a feature recorder. + * If the mhistogram is set, the histogram is generated when features are recorded + * and kept in memory. If mhistogram is not set, the histogram is generated when the feature recorder is closed. 
+ */ + +struct histogram_def { /** * @param feature- the feature file to histogram (no .txt) * @param re - the regular expression to extract @@ -58,19 +77,27 @@ */ histogram_def(std::string feature_,std::string re_,std::string suffix_,uint32_t flags_=0): - feature(feature_),pattern(re_),require(),suffix(suffix_),flags(flags_){} + feature(feature_),pattern(re_),require(),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){} histogram_def(std::string feature_,std::string re_,std::string require_,std::string suffix_,uint32_t flags_=0): - feature(feature_),pattern(re_),require(require_),suffix(suffix_),flags(flags_){ } - std::string feature; /* feature file */ - std::string pattern; /* extract pattern; "" means use entire feature */ - std::string require; /* text required somewhere on the feature line; used for IP histograms */ - std::string suffix; /* suffix to append; "" means "histogram" */ - uint32_t flags; // defined in histogram.h + feature(feature_),pattern(re_),require(require_),suffix(suffix_),flags(flags_),reg(pattern,REG_EXTENDED){ } + const std::string feature; /* feature file name */ + const std::string pattern; /* extract pattern; "" means use entire feature */ + const std::string require; /* text required somewhere on the feature line; used for IP histograms */ + const std::string suffix; /* suffix to append; "" means "histogram" */ + const uint32_t flags; // defined in histogram.h + const beregex reg; // regular expression for pattern }; +/* NOTE: + * 1 - This typedef must remain outside the the feature_recorder due + * to historical reasons and cannot be made a vector + * 2 - Do not make historam_def const! It breaks some compilers. + */ + typedef std::set histogram_defs_t; // a set of histogram definitions -inline bool operator <(class histogram_def h1,class histogram_def h2) { + +inline bool operator <(const histogram_def &h1,const histogram_def &h2) { if (h1.featureh2.feature) return false; if (h1.pattern mhistogram_t; // memory histogram +typedef std::map mhistograms_t; + + class feature_recorder { // default copy construction and assignment are meaningless // and not implemented @@ -98,9 +130,20 @@ /****************************************************************/ public: - typedef atomic_histogram mhistogram_t; // memory histogram - typedef void (dump_callback_t)(void *,const feature_recorder &fr, - const std::string &feature,const uint64_t &count); + class besql_stmt { + besql_stmt(const besql_stmt &); + besql_stmt &operator=(const besql_stmt &); +public: + cppmutex Mstmt; // a mutext to protect it + BEAPI_SQLITE3_STMT *stmt; // the prepared statement + besql_stmt(BEAPI_SQLITE3 *db3,const char *sql); + virtual ~besql_stmt(); + void insert_feature(const pos0_t &pos, // insert it into this table! + const std::string &feature,const std::string &feature8, const std::string &context); + }; + + typedef int (dump_callback_t)(void *user,const feature_recorder &fr,const histogram_def &def, + const std::string &feature,const uint64_t &count); static void set_main_threadid(){ #ifndef WIN32 main_threadid=pthread_self(); @@ -115,28 +158,28 @@ * These flags control scanners. Set them with set_flag(). */ /** Disable this recorder. */ - static const int FLAG_DISABLED=0x01; // Disabled - static const int FLAG_NO_CONTEXT=0x02; // Do not write context. - static const int FLAG_NO_STOPLIST=0x04; // Do not honor the stoplist/alertlist. - static const int FLAG_NO_ALERTLIST=0x04; // Do not honor the stoplist/alertlist. 
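The hunk that follows renumbers these constants so that FLAG_NO_ALERTLIST gets its own bit (0x08) instead of sharing 0x04 with FLAG_NO_STOPLIST, with FLAG_NO_QUOTE, FLAG_XML and the later flags shifted up accordingly. A minimal stand-alone sketch of the set_flag()/flag_set()/flag_notset() bitmask pattern these constants rely on; flag_holder and its constants are hypothetical stand-ins, not the real feature_recorder class:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-ins mirroring the renumbered flag values; not the real class.
    static const uint32_t FLAG_NO_STOPLIST  = 0x04;
    static const uint32_t FLAG_NO_ALERTLIST = 0x08;   // now distinct from FLAG_NO_STOPLIST

    struct flag_holder {
        uint32_t flags;
        flag_holder():flags(0){}
        void set_flag(uint32_t f)          { flags |= f;  }
        void unset_flag(uint32_t f)        { flags &= ~f; }
        bool flag_set(uint32_t f)    const { return (flags & f) != 0; }
        bool flag_notset(uint32_t f) const { return (flags & f) == 0; }
    };

    int main()
    {
        flag_holder fr;
        fr.set_flag(FLAG_NO_STOPLIST);
        assert(fr.flag_set(FLAG_NO_STOPLIST));
        assert(fr.flag_notset(FLAG_NO_ALERTLIST));   // ambiguous back when both flags were 0x04
        fr.unset_flag(FLAG_NO_STOPLIST);
        assert(fr.flag_notset(FLAG_NO_STOPLIST));
        return 0;
    }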
+ static const int FLAG_DISABLED = 0x01; // feature recorder is Disabled + static const int FLAG_NO_CONTEXT = 0x02; // Do not write context. + static const int FLAG_NO_STOPLIST = 0x04; // Do not honor the stoplist/alertlist. + static const int FLAG_NO_ALERTLIST = 0x08; // Do not honor the stoplist/alertlist. /** * Normally feature recorders automatically quote non-UTF8 characters * with \x00 notation and quote "\" as \x5C. Specify FLAG_NO_QUOTE to * disable this behavior. */ - static const int FLAG_NO_QUOTE=0x08; // do not escape UTF8 codes + static const int FLAG_NO_QUOTE = 0x10; // do not escape UTF8 codes /** * Use this flag the feature recorder is sending UTF-8 XML. * non-UTF8 will be quoted but "\" will not be escaped. */ - static const int FLAG_XML = 0x10; // will be sending XML + static const int FLAG_XML = 0x20; // will be sending XML /** * histogram support. */ - static const int FLAG_MEM_HISTOGRAM = 0x20; // enable the in-memory histogram - static const int FLAG_NO_FEATURES = 0x40; // do not record features (just histogram) + static const uint32_t FLAG_NO_FEATURES = 0x40; // do not record features (just memory histogram) + static const uint32_t FLAG_NO_FEATURES_SQL = 0x80; // do not write features to SQL /** @} */ static const int max_histogram_files = 10; // don't make more than 10 files in low-memory conditions @@ -148,39 +191,43 @@ static uint32_t opt_max_context_size; static uint32_t opt_max_feature_size; static int64_t offset_add; // added to every reported offset, for use with hadoop - static std::string banner_file; // banner for top of every file + static std::string banner_file; // banner for top of every file static std::string extract_feature(const std::string &line); feature_recorder(class feature_recorder_set &fs, - const std::string &outdir, - const std::string &input_fname,const std::string &name); - virtual ~feature_recorder(); - virtual void set_flag(uint32_t flags_); - virtual void unset_flag(uint32_t flags_); - bool flag_set(uint32_t f) const {return flags & f;} - bool flag_notset(uint32_t f) const {return !(flags & f);} - uint32_t get_flags() const {return flags;} + const std::string &name); + virtual ~feature_recorder(); + virtual void set_flag(uint32_t flags_); + virtual void unset_flag(uint32_t flags_); + void enable_memory_histograms(); // only called from feature_recorder_set + virtual void set_memhist_limit(int64_t limit_); + bool flag_set(uint32_t f) const {return flags & f;} + bool flag_notset(uint32_t f) const {return !(flags & f);} + uint32_t get_flags() const {return flags;} + virtual const std::string &get_outdir() const; static size_t context_window_default; // global option - const std::string outdir; // where output goes (could be static, I guess - const std::string input_fname; // image we are analyzing const std::string name; // name of this feature recorder private: std::string ignore_encoding; // encoding to ignore for carving std::fstream ios; // where features are written + + class besql_stmt *bs; // prepared beapi sql statement protected:; - histogram_defs_t histogram_defs; // histograms that are to be created for this feature recorder - class feature_recorder_set &fs; // the set in which this feature_recorder resides + histogram_defs_t histogram_defs; // histograms that are to be created for this feature recorder +public: + class feature_recorder_set &fs; // the set in which this feature_recorder resides +protected: int64_t count_; /* number of records written */ size_t context_window_before; // context window size_t 
context_window_after; // context window mutable cppmutex Mf; // protects the file mutable cppmutex Mr; // protects the redlist -protected: - mhistogram_t *mhistogram; // if we are building an in-memory-histogram + mhistograms_t mhistograms; // the memory histograms, if we are using them + uint64_t mhistogram_limit; // how many we want (per feature recorder limit, rather than per histogram) class feature_recorder *stop_list_recorder; // where stopped features get written int64_t file_number_; /* starts at 0; gets incremented by carve(); for binning */ @@ -210,26 +257,32 @@ #endif } - void banner_stamp(std::ostream &os,const std::string &header); // stamp banner, and header + void banner_stamp(std::ostream &os,const std::string &header) const; // stamp banner, and header /* where stopped items (on stop_list or context_stop_list) get recorded: */ std::string fname_counter(std::string suffix) const; static std::string quote_string(const std::string &feature); // turns unprintable characters to octal escape static std::string unquote_string(const std::string &feature); // turns octal escape back to binary characters + //virtual const feature_recorder_set::hash_def &hasher(); // returns hasher in feature_recorder_set + /* feature file management */ virtual void open(); virtual void close(); virtual void flush(); - static void dump_callback_test(void *user,const feature_recorder &fr, + static int dump_callback_test(void *user,const feature_recorder &fr, const std::string &str,const uint64_t &count); // test callback for you to use! + + /* TK: The histogram_def should be provided at the beginning, so it can be used for in-memory histograms. * The callback needs to have the specific atomic set as the callback as well. */ - virtual void add_histogram(const class histogram_def &def); // adds a histogram to process - virtual void dump_histogram(const class histogram_def &def,void *user,feature_recorder::dump_callback_t cb); + virtual void add_histogram(const histogram_def &def); // adds a histogram to process + virtual void dump_histogram_file(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const; + virtual void dump_histogram_db(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const; + virtual void dump_histogram(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const; typedef void (*xml_notifier_t)(const std::string &xmlstring); - virtual void dump_histograms(void *user,feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier); + virtual void dump_histograms(void *user,feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier) const; /* Methods to get info */ uint64_t count() const {return count_;} @@ -241,6 +294,7 @@ * pos0 gives the location and prefix for the beginning of the buffer */ + /**************************************************************** *** External entry points. ****************************************************************/ @@ -256,13 +310,20 @@ * support for writing features */ + void quote_if_necessary(std::string &feature,std::string &context); + // only virtual functions may be called by plug-ins // printf() prints to the feature file. virtual void printf(const char *fmt_,...) __attribute__((format(printf, 2, 3))); // // write a feature and its context; the feature may be in the context, but doesn't need to be. 
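As the comments here note, write() performs the stoplist, quoting and histogram work and then hands the record to write0(), which only formats and emits it. A minimal sketch of the tab-separated layout a file-based recorder produces (position, feature, optional context); format_feature_line() and its arguments are illustrative only and not part of the be13_api interface:

    #include <iostream>
    #include <sstream>
    #include <string>

    // Simplified sketch of the feature-file record layout: position, feature,
    // then an optional context column. The real write0() also applies the global
    // offset shift and honors FLAG_NO_CONTEXT; this shows only the format.
    static std::string format_feature_line(const std::string &pos,
                                           const std::string &feature,
                                           const std::string &context)
    {
        std::stringstream ss;
        ss << pos << '\t' << feature;
        if (context.size() > 0) ss << '\t' << context;
        return ss.str();
    }

    int main()
    {
        std::cout << format_feature_line("4096", "user@example.com",
                                         "From: user@example.com\\x0D\\x0A") << "\n";
        return 0;
    }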
// write() calls write0() after histogram, quoting, and stoplist processing + // write0() calls write0_sql() if sqlwriting is enabled virtual void write0(const pos0_t &pos0,const std::string &feature,const std::string &context); +private: + virtual void db_write0(const pos0_t &pos0,const std::string &feature,const std::string &context); + static const char *db_insert_stmt; +public: // write a feature and its context; the feature may be in the context, but doesn't need to be. // entries processed by write below will be processed by histogram system @@ -288,8 +349,7 @@ // Carve a file; returns filename of carved file or empty string if nothing carved virtual std::string carve(const sbuf_t &sbuf,size_t pos,size_t len, - const std::string &ext, // appended to forensic path - const struct be13::hash_def &hasher); + const std::string &ext); // appended to forensic path // Set the time of the carved file to iso8601 file virtual void set_carve_mtime(const std::string &fname, const std::string &mtime_iso8601); }; diff -Nru tcpflow-1.4.4+repack1/src/be13_api/feature_recorder_set.cpp tcpflow-1.4.5+repack1/src/be13_api/feature_recorder_set.cpp --- tcpflow-1.4.4+repack1/src/be13_api/feature_recorder_set.cpp 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/feature_recorder_set.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -2,26 +2,37 @@ #include "config.h" #include "bulk_extractor_i.h" - #include "histogram.h" - /**************************************************************** - *** feature_recorder_set - *** No mutex is needed for the feature_recorder_set because it is never - *** modified after it is created, only the contained feature_recorders are modified. + *** feature_recorder_set: + *** Manage the set of feature recorders. + *** Handles both file-based feature recorders and the SQLite3 feature recorder. ****************************************************************/ const std::string feature_recorder_set::ALERT_RECORDER_NAME = "alerts"; const std::string feature_recorder_set::DISABLED_RECORDER_NAME = "disabled"; +const std::string feature_recorder_set::NO_INPUT = ""; +const std::string feature_recorder_set::NO_OUTDIR = ""; + +static std::string null_hasher_name("null"); +static std::string null_hasher_func(const uint8_t *buf,size_t bufsize) +{ + return std::string("0000000000000000"); +} -/* Create an empty recorder */ -feature_recorder_set::feature_recorder_set(uint32_t flags_):flags(flags_),seen_set(),input_fname(), - outdir(), - frm(),map_lock(), - histogram_defs(), - alert_list(),stop_list(), - scanner_stats() +feature_recorder_set::hash_def feature_recorder_set::null_hasher(null_hasher_name,null_hasher_func); + +/* Create an empty recorder with no outdir. */ +feature_recorder_set::feature_recorder_set(uint32_t flags_,const feature_recorder_set::hash_def &hasher_, + const std::string &input_fname_,const std::string &outdir_): + flags(flags_),seen_set(),input_fname(input_fname_), + outdir(outdir_), + frm(),Mscanner_stats(), + histogram_defs(), + Min_transaction(),in_transaction(),db3(), + alert_list(),stop_list(), + scanner_stats(),hasher(hasher_) { if(flags & SET_DISABLED){ create_name(DISABLED_RECORDER_NAME,false); @@ -30,17 +41,23 @@ } /** - * Create a properly functioning feature recorder set. + * Initialize a properly functioning feature recorder set. * If disabled, create a disabled feature_recorder that can respond to functions as requested. 
*/ -void feature_recorder_set::init(const feature_file_names_t &feature_files, - const std::string &input_fname_, - const std::string &outdir_) +void feature_recorder_set::init(const feature_file_names_t &feature_files) { - input_fname = input_fname_; - outdir = outdir_; + /* Make sure we can write to the outdir if one is provided */ + if ((outdir != NO_OUTDIR) && (access(outdir.c_str(),W_OK)!=0)) { + throw new std::invalid_argument("output directory not writable"); + } + + if (flag_set(ENABLE_SQLITE3_RECORDERS)) { + db_create(); + } - create_name(feature_recorder_set::ALERT_RECORDER_NAME,false); // make the alert recorder + if (flag_notset(NO_ALERT)) { + create_name(feature_recorder_set::ALERT_RECORDER_NAME,false); // make the alert recorder + } /* Create the requested feature files */ for(std::set::const_iterator it=feature_files.begin();it!=feature_files.end();it++){ @@ -48,6 +65,9 @@ } } +/** Flush all of the feature recorder files. + * Typically done at the end of an sbuf. + */ void feature_recorder_set::flush_all() { for(feature_recorder_map::iterator i = frm.begin();i!=frm.end();i++){ @@ -60,6 +80,9 @@ for(feature_recorder_map::iterator i = frm.begin();i!=frm.end();i++){ i->second->close(); } + if ( flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS )) { + db_transaction_commit(); + } } @@ -71,7 +94,7 @@ /* * Gets a feature_recorder_set. */ -feature_recorder *feature_recorder_set::get_name(const std::string &name) +feature_recorder *feature_recorder_set::get_name(const std::string &name) const { const std::string *thename = &name; if(flags & SET_DISABLED){ // if feature recorder set is disabled, return the disabled recorder. @@ -82,46 +105,45 @@ thename = &feature_recorder_set::ALERT_RECORDER_NAME; } - cppmutex::lock lock(map_lock); + cppmutex::lock lock(Mscanner_stats); feature_recorder_map::const_iterator it = frm.find(*thename); if(it!=frm.end()) return it->second; return(0); // feature recorder does not exist } -feature_recorder *feature_recorder_set::create_name_factory(const std::string &outdir_,const std::string &input_fname_,const std::string &name_){ - return new feature_recorder(*this,outdir_,input_fname_,name_); +feature_recorder *feature_recorder_set::create_name_factory(const std::string &name_) +{ + return new feature_recorder(*this,name_); } -void feature_recorder_set::create_name(const std::string &name,bool create_stop_file) +/* + * Create a named feature recorder, any associated stoplist recorders, and open the files + */ +void feature_recorder_set::create_name(const std::string &name,bool create_stop_recorder) { if(frm.find(name)!=frm.end()){ std::cerr << "create_name: feature recorder '" << name << "' already exists\n"; return; } - feature_recorder *fr = create_name_factory(outdir,input_fname,name); - feature_recorder *fr_stopped = 0; + feature_recorder *fr = create_name_factory(name); frm[name] = fr; - if(create_stop_file){ + if (create_stop_recorder){ std::string name_stopped = name+"_stopped"; - fr_stopped = create_name_factory(outdir,input_fname,name_stopped); + feature_recorder *fr_stopped = create_name_factory(name_stopped); fr->set_stop_list_recorder(fr_stopped); frm[name_stopped] = fr_stopped; } - - if(flags & SET_DISABLED) return; // don't open if we are disabled - - /* Open the output!*/ - fr->open(); - if(fr_stopped) fr_stopped->open(); } -feature_recorder *feature_recorder_set::get_alert_recorder() +feature_recorder *feature_recorder_set::get_alert_recorder() const { + if (flag_set(NO_ALERT)) return 0; + return 
get_name(feature_recorder_set::ALERT_RECORDER_NAME); } @@ -137,22 +159,27 @@ void feature_recorder_set::add_stats(const std::string &bucket,double seconds) { - cppmutex::lock lock(map_lock); + cppmutex::lock lock(Mscanner_stats); struct pstats &p = scanner_stats[bucket]; // get the location of the stats p.seconds += seconds; p.calls ++; } -void feature_recorder_set::get_stats(void *user,stat_callback_t stat_callback) +/* + * Send the stats to a callback; if the callback returns less than 0, abort. + */ +void feature_recorder_set::get_stats(void *user,stat_callback_t stat_callback) const { for(scanner_stats_map::const_iterator it = scanner_stats.begin();it!=scanner_stats.end();it++){ - (*stat_callback)(user,(*it).first,(*it).second.calls,(*it).second.seconds); + if((*stat_callback)(user,(*it).first,(*it).second.calls,(*it).second.seconds)<0){ + break; + } } } -void feature_recorder_set::dump_name_count_stats(dfxml_writer &writer) +void feature_recorder_set::dump_name_count_stats(dfxml_writer &writer) const { - cppmutex::lock lock(map_lock); + cppmutex::lock lock(Mscanner_stats); writer.push("feature_files"); for(feature_recorder_map::const_iterator ij = frm.begin(); ij != frm.end(); ij++){ writer.set_oneline(true); @@ -165,7 +192,44 @@ } -static const int LINE_LEN = 80; // keep track of where we are on the line +void feature_recorder_set::set_flag(uint32_t f) +{ + if(f & MEM_HISTOGRAM){ + if(flags & MEM_HISTOGRAM){ + std::cerr << "MEM_HISTOGRAM flag cannot be set twice\n"; + assert(0); + } + /* Create the in-memory histograms for all of the feature recorders */ + for(feature_recorder_map::const_iterator it = frm.begin(); it!=frm.end(); it++){ + feature_recorder *fr = it->second; + fr->enable_memory_histograms(); + } + } + flags |= f; +} + +void feature_recorder_set::unset_flag(uint32_t f) +{ + if(f & MEM_HISTOGRAM){ + std::cerr << "MEM_HISTOGRAM flag cannot be cleared\n"; + assert(0); + } + flags &= ~f; +} + +/**************************************************************** + *** PHASE HISTOGRAM (formerly phase 3): Create the histograms + ****************************************************************/ + +/** + * We now have three kinds of histograms: + * 1 - Traditional post-processing histograms specified by the histogram library + 1a - feature-file based traditional ones + 1b - SQL-based traditional ones. + * 2 - In-memory histograms (used primarily by beapi) + */ + + void feature_recorder_set::add_histogram(const histogram_def &def) { feature_recorder *fr = get_name(def.feature); @@ -173,7 +237,7 @@ } void feature_recorder_set::dump_histograms(void *user,feature_recorder::dump_callback_t cb, - feature_recorder_set::xml_notifier_t xml_error_notifier) + feature_recorder_set::xml_notifier_t xml_error_notifier) const { /* Ask each feature recorder to dump its histograms */ for(feature_recorder_map::const_iterator it = frm.begin(); it!=frm.end(); it++){ @@ -182,3 +246,9 @@ } } +void feature_recorder_set::get_feature_file_list(std::vector &ret) +{ + for(feature_recorder_map::const_iterator it = frm.begin(); it!=frm.end(); it++){ + ret.push_back(it->first); + } +} diff -Nru tcpflow-1.4.4+repack1/src/be13_api/feature_recorder_set.h tcpflow-1.4.5+repack1/src/be13_api/feature_recorder_set.h --- tcpflow-1.4.4+repack1/src/be13_api/feature_recorder_set.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/feature_recorder_set.h 2015-08-26 03:35:59.000000000 +0000 @@ -17,9 +17,10 @@ /** * \class feature_recorder_set - * A singleton class that holds a set of recorders. 
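The get_stats() change earlier in this file makes the statistics callback return an int so a caller can abort the walk by returning a negative value. A stand-alone sketch of that contract, using a hypothetical stats map in place of the real scanner_stats_map:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>

    // Illustration of the updated callback contract: iteration stops as soon as
    // the callback returns a value below zero. Names here are hypothetical.
    typedef int (*stat_callback_t)(void *user, const std::string &name,
                                   uint64_t calls, double seconds);

    static void get_stats(const std::map<std::string, double> &stats,
                          void *user, stat_callback_t cb)
    {
        for (std::map<std::string, double>::const_iterator it = stats.begin();
             it != stats.end(); ++it) {
            if (cb(user, it->first, 1, it->second) < 0) break;   // abort requested
        }
    }

    static int print_first_two(void *user, const std::string &name,
                               uint64_t calls, double seconds)
    {
        int *remaining = static_cast<int *>(user);
        std::cout << name << " calls=" << calls << " seconds=" << seconds << "\n";
        return --(*remaining) > 0 ? 0 : -1;                      // negative return stops the walk
    }

    int main()
    {
        std::map<std::string, double> stats;
        stats["scan_email"] = 1.5;
        stats["scan_exif"]  = 2.2;
        stats["scan_zip"]   = 0.7;
        int remaining = 2;
        get_stats(stats, &remaining, print_first_two);
        return 0;
    }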
- * This used to be done with a set, but now it's done with a map. - * + * The feature_recorder_set is an object that controls output. It knows where the output goes (outdir), + * the various feature recorders that write to that output, and provides for synchronization. + * It also has the factory method for new feature_recorders. Therefore if you want a different feature_recorder, + * this set should be subclassed as well. */ typedef std::map feature_recorder_map; @@ -30,42 +31,65 @@ feature_recorder_set &operator=(const feature_recorder_set &fs); uint32_t flags; atomic_set seen_set; // hex hash values of pages that have been seen - std::string input_fname; // input file - std::string outdir; // where output goes - feature_recorder_map frm; // map of feature recorders, by name - cppmutex map_lock; // locks frm and scanner_stats_map + const std::string input_fname; // input file + const std::string outdir; // where output goes + feature_recorder_map frm; // map of feature recorders, by name; TK-replace with an atomic_set + mutable cppmutex Mscanner_stats; // locks frm and scanner_stats_map histogram_defs_t histogram_defs; // histograms that are to be created. + mutable cppmutex Min_transaction; + bool in_transaction; public: + BEAPI_SQLITE3 *db3; // opened in SQLITE_OPEN_FULLMUTEX mode + virtual void heartbeat(){}; // called at a regular basis + struct hash_def { + hash_def(std::string name_,std::string (*func_)(const uint8_t *buf,const size_t bufsize)):name(name_),func(func_){}; + std::string name; // name of hash + std::string (*func)(const uint8_t *buf,const size_t bufsize); // hash function + }; struct pstats { double seconds; uint64_t calls; }; + /** create an emptry feature recorder set. If disabled, create a disabled recorder. */ + feature_recorder_set(uint32_t flags_,const hash_def &hasher_, + const std::string &input_fname_,const std::string &outdir_); + typedef std::map scanner_stats_map; const word_and_context_list *alert_list; /* shold be flagged */ const word_and_context_list *stop_list; /* should be ignored */ - scanner_stats_map scanner_stats; + scanner_stats_map scanner_stats; + + const hash_def &hasher; // function for hashing; specified at creation + static hash_def null_hasher; // a default hasher available for all to use (it doesn't hash) + static const std::string ALERT_RECORDER_NAME; // the name of the alert recorder static const std::string DISABLED_RECORDER_NAME; // the fake disabled feature recorder + static const std::string NO_INPUT; // 'filename' indicator that the FRS has no input file + static const std::string NO_OUTDIR; // 'dirname' indicator that the FRS produces no file output + /* flags */ - static const uint32_t ONLY_ALERT=0x01; // always return the alert recorder - static const uint32_t SET_DISABLED=0x02; // the set is effectively disabled; for path-printer - static const uint32_t CREATE_STOP_LIST_RECORDERS=0x04; // + static const uint32_t ONLY_ALERT = 0x01; // always return the alert recorder + static const uint32_t SET_DISABLED = 0x02; // the set is effectively disabled; for path-printer + static const uint32_t CREATE_STOP_LIST_RECORDERS= 0x04; // + static const uint32_t MEM_HISTOGRAM = 0x20; // enable the in-memory histogram + static const uint32_t ENABLE_SQLITE3_RECORDERS = 0x40; // save features to an SQLITE3 databse + static const uint32_t DISABLE_FILE_RECORDERS = 0x80; // do not save features to file-based recorders + static const uint32_t NO_ALERT = 0x100; // no alert recorder virtual ~feature_recorder_set() { for(feature_recorder_map::iterator 
i = frm.begin();i!=frm.end();i++){ delete i->second; } + db_close(); } - std::string get_input_fname() const {return input_fname;} - std::string get_outdir() const {return outdir;} + std::string get_input_fname() const {return input_fname;} + virtual const std::string &get_outdir() const { return outdir;} void set_stop_list(const word_and_context_list *alist){stop_list=alist;} void set_alert_list(const word_and_context_list *alist){alert_list=alist;} - /** create an emptry feature recorder set. If disabled, create a disabled recorder. */ - feature_recorder_set(uint32_t flags_); /** Initialize a feature_recorder_set. Previously this was a constructor, but it turns out that * virtual functions for the create_name_factory aren't honored in constructors. @@ -74,35 +98,56 @@ * tells each feature file about its histograms (among other * things) */ - void init(const feature_file_names_t &feature_files, - const std::string &input_fname,const std::string &outdir); + void init(const feature_file_names_t &feature_files); void flush_all(); void close_all(); bool has_name(std::string name) const; /* does the named feature exist? */ - void set_flag(uint32_t f){flags|=f;} - void clear_flag(uint32_t f){flags|=f;} - void add_histogram(const histogram_def &def); // adds it to a local set or to the specific feature recorder + /* flags */ + void set_flag(uint32_t f); + void unset_flag(uint32_t f); + bool flag_set(uint32_t f) const {return flags & f;} + bool flag_notset(uint32_t f) const {return !(flags & f);} + uint32_t get_flags() const {return flags;} + typedef void (*xml_notifier_t)(const std::string &xmlstring); - void dump_histograms(void *user,feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier); - virtual feature_recorder *create_name_factory(const std::string &outdir_, - const std::string &input_fname_,const std::string &name_); + void add_histogram(const histogram_def &def); // adds it to a local set or to the specific feature recorder + void dump_histograms(void *user,feature_recorder::dump_callback_t cb, xml_notifier_t xml_error_notifier) const; + virtual feature_recorder *create_name_factory(const std::string &name_); virtual void create_name(const std::string &name,bool create_stop_also); - virtual const std::string &get_outdir(){ return outdir;} void add_stats(const std::string &bucket,double seconds); - typedef void (*stat_callback_t)(void *user,const std::string &name,uint64_t calls,double seconds); - void get_stats(void *user,stat_callback_t stat_callback); - void dump_name_count_stats(dfxml_writer &writer); + typedef int (*stat_callback_t)(void *user,const std::string &name,uint64_t calls,double seconds); + void get_stats(void *user,stat_callback_t stat_callback) const; + void dump_name_count_stats(dfxml_writer &writer) const; + + /**************************************************************** + *** SQLite3 interface + ****************************************************************/ + + + virtual void db_send_sql(BEAPI_SQLITE3 *db3,const char **stmts, ...) 
; + virtual BEAPI_SQLITE3 *db_create_empty(const std::string &name) ; + void db_create_table(const std::string &name) ; + void db_create() ; + void db_transaction_begin() ; + void db_transaction_commit() ; // commit current transaction + void db_close() ; // + + /**************************************************************** + *** External Functions + ****************************************************************/ + // Management of previously seen data virtual bool check_previously_processed(const uint8_t *buf,size_t bufsize); // NOTE: // only virtual functions may be called by plugins! - virtual feature_recorder *get_name(const std::string &name); - virtual feature_recorder *get_alert_recorder(); + virtual feature_recorder *get_name(const std::string &name) const; + virtual feature_recorder *get_alert_recorder() const; + virtual void get_feature_file_list(std::vector &ret); // clears ret and fills with a list of feature file names }; diff -Nru tcpflow-1.4.4+repack1/src/be13_api/feature_recorder_sql.cpp tcpflow-1.4.5+repack1/src/be13_api/feature_recorder_sql.cpp --- tcpflow-1.4.4+repack1/src/be13_api/feature_recorder_sql.cpp 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/feature_recorder_sql.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,374 @@ +/* + * Feature recorder mods for writing features into an SQLite3 database. + */ + +/* http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/ */ + +#include "config.h" + +#include +#include +#include +#include +#include + +#include "bulk_extractor_i.h" +#include "histogram.h" + +/* + * Time results with ubnist1 on R4: + * no SQL - 79 seconds + * no pragmas - 651 seconds + * "PRAGMA synchronous = OFF", - 146 second + * "PRAGMA synchronous = OFF", "PRAGMA journal_mode=MEMORY", - 79 seconds + * + * Time with domexusers: + * no SQL - + */ + + +#if defined(HAVE_LIBSQLITE3) && defined(HAVE_SQLITE3_H) +#define USE_SQLITE3 +#endif +#define SQLITE_EXTENSION ".sqlite" + +#ifndef SQLITE_DETERMINISTIC +#define SQLITE_DETERMINISTIC 0 +#endif + +static int debug = 0; + +#ifdef USE_SQLITE3 +static const char *schema_db[] = { + "PRAGMA synchronous = OFF", + "PRAGMA journal_mode=MEMORY", + //"PRAGMA temp_store=MEMORY", // did not improve performance + "PRAGMA cache_size = 200000", + "CREATE TABLE IF NOT EXISTS db_info (schema_ver INTEGER, bulk_extractor_ver INTEGER)", + "INSERT INTO db_info (schema_ver, bulk_extractor_ver) VALUES (1,1)", + "CREATE TABLE IF NOT EXISTS be_features (tablename VARCHAR,comment TEXT)", + "CREATE TABLE IF NOT EXISTS be_config (name VARCHAR,value VARCHAR)", + 0}; + +/* Create a feature table and note that it has been created in be_features */ +static const char *schema_tbl[] = { + "CREATE TABLE IF NOT EXISTS f_%s (offset INTEGER(12), path VARCHAR, feature_eutf8 TEXT, feature_utf8 TEXT, context_eutf8 TEXT)", + "CREATE INDEX IF NOT EXISTS f_%s_idx1 ON f_%s(offset)", + "CREATE INDEX IF NOT EXISTS f_%s_idx2 ON f_%s(feature_eutf8)", + "CREATE INDEX IF NOT EXISTS f_%s_idx3 ON f_%s(feature_utf8)", + "INSERT INTO be_features (tablename,comment) VALUES ('f_%s','')", + 0}; + +/* This creates the base histogram. 
Note that the SQL fails if the histogram exists */ +static const char *schema_hist[] = { + "CREATE TABLE h_%s (count INTEGER(12), feature_utf8 TEXT)", + "CREATE INDEX h_%s_idx1 ON h_%s(count)", + "CREATE INDEX h_%s_idx2 ON h_%s(feature_utf8)", + 0}; + +/* This performs the histogram operation */ +static const char *schema_hist1[] = { + "INSERT INTO h_%s select COUNT(*),feature_utf8 from f_%s GROUP BY feature_utf8", + 0}; + +static const char *schema_hist2[] = { + "INSERT INTO h_%s select sum(count),BEHIST(feature_utf8) from h_%s where BEHIST(feature_utf8)!='' GROUP BY BEHIST(feature_utf8)", + 0}; + +#endif +const char *feature_recorder::db_insert_stmt = "INSERT INTO f_%s (offset,path,feature_eutf8,feature_utf8,context_eutf8) VALUES (?1, ?2, ?3, ?4, ?5)"; +static const char *begin_transaction[] = {"BEGIN TRANSACTION",0}; +static const char *commit_transaction[] = {"COMMIT TRANSACTION",0}; +void feature_recorder::besql_stmt::insert_feature(const pos0_t &pos, + const std::string &feature, + const std::string &feature8, const std::string &context) +{ +#ifdef USE_SQLITE3 + assert(stmt!=0); + cppmutex::lock lock(Mstmt); // grab a lock + const std::string &path = pos.str(); + sqlite3_bind_int64(stmt, 1, pos.imageOffset()); // offset + sqlite3_bind_text(stmt, 2, path.data(), path.size(), SQLITE_STATIC); // path + sqlite3_bind_text(stmt, 3, feature.data(), feature.size(), SQLITE_STATIC); + sqlite3_bind_text(stmt, 4, feature8.data(), feature8.size(), SQLITE_STATIC); + sqlite3_bind_text(stmt, 5, context.data(), context.size(), SQLITE_STATIC); + if (sqlite3_step(stmt) != SQLITE_DONE) { + fprintf(stderr,"sqlite3_step failed\n"); + } + sqlite3_reset(stmt); +#endif +}; + +feature_recorder::besql_stmt::besql_stmt(BEAPI_SQLITE3 *db3,const char *sql):Mstmt(),stmt() +{ +#ifdef USE_SQLITE3 + assert(db3!=0); + assert(sql!=0); + sqlite3_prepare_v2(db3,sql, strlen(sql), &stmt, NULL); + assert(stmt!=0); +#endif +} + +feature_recorder::besql_stmt::~besql_stmt() +{ +#ifdef USE_SQLITE3 + assert(stmt!=0); + sqlite3_finalize(stmt); + stmt = 0; +#endif +} + +void feature_recorder_set::db_send_sql(BEAPI_SQLITE3 *db,const char **stmts, ...) 
+{ +#ifdef USE_SQLITE3 + assert(db!=0); + for(int i=0;stmts[i];i++){ + char *errmsg = 0; + char buf[65536]; + + va_list ap; + va_start(ap,stmts); + vsnprintf(buf,sizeof(buf),stmts[i],ap); + va_end(ap); + if(debug) std::cerr << "SQL: " << buf << "\n"; + if(sqlite3_exec(db,buf,NULL,NULL,&errmsg) != SQLITE_OK ) { + fprintf(stderr,"Error executing '%s' : %s\n",buf,errmsg); + exit(1); + } + } +#endif +} + +void feature_recorder_set::db_create_table(const std::string &name) +{ +#ifdef USE_SQLITE3 + assert(name.size()>0); + db_send_sql(db3,schema_tbl,name.c_str(),name.c_str()); +#endif +} + +BEAPI_SQLITE3 *feature_recorder_set::db_create_empty(const std::string &name) +{ +#ifdef USE_SQLITE3 + assert(name.size()>0); + std::string dbfname = outdir + "/" + name + SQLITE_EXTENSION; + if(debug) std::cerr << "create_feature_database " << dbfname << "\n"; + BEAPI_SQLITE3 *db=0; + if (sqlite3_open_v2(dbfname.c_str(), &db, + SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_FULLMUTEX, + 0)!=SQLITE_OK) { + std::cerr << "Cannot create database '" << dbfname << "': " << sqlite3_errmsg(db) << "\n"; + sqlite3_close(db); + exit(1); + } + return db; +#else + return 0; +#endif +} + +#pragma GCC diagnostic ignored "-Wmissing-noreturn" +void feature_recorder_set::db_create() +{ +#ifdef USE_SQLITE3 + assert(db3==0); + db3 = db_create_empty("report"); + db_send_sql(db3,schema_db); +#else + std::cerr << "*** CANNOT CREATE SQLITE3 DATABASE ***\n"; + std::cerr << "*** Compiled without libsqlite ***\n"; + assert(0 && debug); // prevent debug from being not used +#endif +} + +void feature_recorder_set::db_close() +{ +#ifdef USE_SQLITE3 + if(db3){ + if(debug) std::cerr << "db_close()\n"; + sqlite3_close(db3); + db3 = 0; + } +#endif +} + +void feature_recorder_set::db_transaction_begin() +{ + cppmutex::lock lock(Min_transaction); + if(!in_transaction){ + db_send_sql(db3,begin_transaction); + in_transaction = true; + } +} + +void feature_recorder_set::db_transaction_commit() +{ + cppmutex::lock lock(Min_transaction); + if(in_transaction){ + db_send_sql(db3,commit_transaction); + in_transaction = false; + } else { + std::cerr << "No transaction to commit\n"; + } +} + +/* Hook for writing feature to SQLite3 database */ +void feature_recorder::db_write0(const pos0_t &pos0,const std::string &feature,const std::string &context) +{ + /** + * Note: this is not very efficient, passing through a quoted feature and then unquoting it. + * We could make this more efficient. + */ + std::string *feature8 = HistogramMaker::convert_utf16_to_utf8(feature_recorder::unquote_string(feature)); + assert(bs!=0); + bs->insert_feature(pos0,feature, + feature8 ? *feature8 : feature, + flag_set(feature_recorder::FLAG_NO_CONTEXT) ? 
"" : context); + if (feature8) delete feature8; +} + +/* Hook for writing histogram + */ +#ifdef USE_SQLITE3 +static int callback_counter(void *param, int argc, char **argv, char **azColName) +{ + int *counter = reinterpret_cast(param); + (*counter)++; + return 0; +} + +static void behist(sqlite3_context *ctx,int argc,sqlite3_value**argv) +{ + const histogram_def *def = reinterpret_cast(sqlite3_user_data(ctx)); + if(debug) std::cerr << "behist feature=" << def->feature << " suffix=" + << def->suffix << " argc=" << argc << "value = " << sqlite3_value_text(argv[0]) << "\n"; + std::string new_feature(reinterpret_cast(sqlite3_value_text(argv[0]))); + if (def->reg.search(new_feature,&new_feature,0,0)) { + sqlite3_result_text(ctx,new_feature.c_str(),new_feature.size(),SQLITE_TRANSIENT); + } +} +#endif + +void feature_recorder::dump_histogram_db(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const +{ +#ifdef USE_SQLITE3 + /* First check to see if there exists a feature histogram summary. If not, make it */ + std::string query = "SELECT name FROM sqlite_master WHERE type='table' AND name='h_" + def.feature +"'"; + char *errmsg=0; + int rowcount=0; + if (sqlite3_exec(fs.db3,query.c_str(),callback_counter,&rowcount,&errmsg)){ + std::cerr << "sqlite3: " << errmsg << "\n"; + return; + } + if (rowcount==0){ + const char *feature = def.feature.c_str(); + fs.db_send_sql(fs.db3,schema_hist, feature, feature); // creates the histogram + fs.db_send_sql(fs.db3,schema_hist1, feature, feature); // creates the histogram + } +#ifdef HAVE_SQLITE3_CREATE_FUNCTION_V2 + /* Now create the summarized histogram for the regex, if it is not existing, but only if we have + * sqlite3_create_function_v2 + */ + if (def.pattern.size()>0){ + /* Create the database where we will add the histogram */ + std::string hname = def.feature + "_" + def.suffix; + + /* Remove any "-" characters if present */ + for(size_t i=0;i &ret){} + +int main(int argc,char **argv) +{ + const char *dbfile = "test.sql3"; + char *errmsg = 0; + sqlite3 *db=0; + + feature_recorder_set fs(0,my_hasher); + + unlink(dbfile); + fs.db_create(); + if(1){ + /* Create an email table */ + fs.db_create_table("email"); + + /* Lets throw a million features into the table as a test */ + //sqlite3_exec(db,"BEGIN TRANSACTION",NULL,NULL,&errmsg); + beapi_sql_stmt s(db,"email"); + for(int i=0;i<1000000;i++){ + pos0_t p; + pos0_t p1 = p+i; + + if(i%10000==0) printf("i=%d\n",i); + + char feature[64]; + snprintf(feature,sizeof(feature),"user%d@company.com",i); + char context[64]; + snprintf(context,sizeof(context),"this is the context user%d@company.com yes it is!",i); + //insert_statement(stmt,p1,feature,context); + } + //sqlite3_exec(db,"COMMIT TRANSACTION",NULL,NULL,&errmsg); + } + fs.db_close(); +} +#endif + diff -Nru tcpflow-1.4.4+repack1/src/be13_api/.gitignore tcpflow-1.4.5+repack1/src/be13_api/.gitignore --- tcpflow-1.4.4+repack1/src/be13_api/.gitignore 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/.gitignore 2015-08-26 03:35:59.000000000 +0000 @@ -19,3 +19,6 @@ .deps .dirstamp _deps +a.out +*.sql3 +stand diff -Nru tcpflow-1.4.4+repack1/src/be13_api/histogram.cpp tcpflow-1.4.5+repack1/src/be13_api/histogram.cpp --- tcpflow-1.4.4+repack1/src/be13_api/histogram.cpp 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/histogram.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -3,6 +3,8 @@ * Maintain a histogram for Unicode strings provided as UTF-8 and UTF-16 encodings. 
* Track number of each coding provided. * + * TK: Reimplement top-n with a priority queue. + * http://www.cplusplus.com/reference/queue/priority_queue/ */ #include "config.h" @@ -15,8 +17,9 @@ ostream & operator << (ostream &os, const HistogramMaker::FrequencyReportVector &rep){ for(HistogramMaker::FrequencyReportVector::const_iterator i = rep.begin(); i!=rep.end();i++){ - os << "n=" << i->tally.count << "\t" << validateOrEscapeUTF8(i->value, true, true); - if(i->tally.count16>0) os << "\t(utf16=" << i->tally.count16<<")"; + const HistogramMaker::ReportElement &r = *(*i); + os << "n=" << r.tally.count << "\t" << validateOrEscapeUTF8(r.value, true, true); + if(r.tally.count16>0) os << "\t(utf16=" << r.tally.count16<<")"; os << "\n"; } return os; @@ -26,24 +29,31 @@ { FrequencyReportVector *rep = new FrequencyReportVector(); for(HistogramMap::const_iterator it = h.begin(); it != h.end(); it++){ - rep->push_back(ReportElement(it->first,it->second)); + rep->push_back(new ReportElement(it->first,it->second)); } sort(rep->begin(),rep->end(),ReportElement::compare); return rep; } +/* This would be better done with a priority queue */ HistogramMaker::FrequencyReportVector *HistogramMaker::makeReport(int topN) const { - HistogramMaker::FrequencyReportVector *r2 = makeReport(); // gets a new report + HistogramMaker::FrequencyReportVector *r2 = makeReport(); // gets a new report HistogramMaker::FrequencyReportVector::iterator i = r2->begin(); while(topN>0 && i!=r2->end()){ // iterate through the first set i++; topN--; } + + /* Delete the elements we won't use */ + for(HistogramMaker::FrequencyReportVector::iterator j=i;j!=r2->end();j++){ + delete (*j); + } r2->erase(i,r2->end()); return r2; } +/* static */ bool HistogramMaker::looks_like_utf16(const std::string &str,bool &little_endian) { if((uint8_t)str[0]==0xff && (uint8_t)str[1]==0xfe){ @@ -73,6 +83,56 @@ } /** + * Converts a utf16 with a byte order to utf8, returning an ALLOCATED STRING if conversion is + * successful, and returning 0 if it is not. + */ +/* static */ +std::string *HistogramMaker::convert_utf16_to_utf8(const std::string &key,bool little_endian) +{ + /* re-image this string as UTF16*/ + std::wstring utf16; + for(size_t i=0;isize()>0) { + size_t nullpos = tempKey->find('\000'); + if(nullpos==string::npos) break; + tempKey->erase(nullpos,1); + } + } catch(utf8::invalid_utf16){ + /* Exception; bad UTF16 encoding */ + delete tempKey; + tempKey = 0; // give up on temp key; otherwise its invalidated below + return 0; + } + return tempKey; +} + +std::string *HistogramMaker::convert_utf16_to_utf8(const std::string &key) +{ + bool little_endian=false; + if(looks_like_utf16(key,little_endian)){ + return convert_utf16_to_utf8(key,little_endian); + } + return 0; +} + +std::string *HistogramMaker::make_utf8(const std::string &key) +{ + std::string *utf8 = convert_utf16_to_utf8(key); + if(utf8==0) utf8 = new std::string(key); + return utf8; +} + +/** * Takes a string (the key) and adds it to the histogram. * automatically determines if the key is UTF-16 and converts * it to UTF8 if so. 
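The TK note at the top of histogram.cpp suggests replacing the current top-N report, which sorts the full report and then deletes the unused tail, with a priority queue. A sketch of that approach, using simplified (count, value) pairs as stand-ins for ReportElement pointers:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <utility>
    #include <vector>

    typedef std::pair<uint64_t, std::string> elem_t;            // (count, value)

    struct min_count_on_top {                                   // makes the queue a min-heap by count
        bool operator()(const elem_t &a, const elem_t &b) const { return a.first > b.first; }
    };

    // Keep only the n largest elements instead of sorting the whole report.
    static std::vector<elem_t> top_n(const std::vector<elem_t> &all, size_t n)
    {
        std::priority_queue<elem_t, std::vector<elem_t>, min_count_on_top> heap;
        for (size_t i = 0; i < all.size(); i++) {
            heap.push(all[i]);
            if (heap.size() > n) heap.pop();                    // drop the current minimum
        }
        std::vector<elem_t> out;
        while (!heap.empty()) { out.push_back(heap.top()); heap.pop(); }
        std::reverse(out.begin(), out.end());                   // largest count first
        return out;
    }

    int main()
    {
        std::vector<elem_t> counts;
        counts.push_back(elem_t(12, "alice@example.com"));
        counts.push_back(elem_t(3,  "bob@example.com"));
        counts.push_back(elem_t(7,  "carol@example.com"));
        std::vector<elem_t> best = top_n(counts, 2);
        for (size_t i = 0; i < best.size(); i++)
            std::cout << "n=" << best[i].first << "\t" << best[i].second << "\n";
        return 0;
    }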
@@ -96,31 +156,11 @@ bool found_utf16 = false; bool little_endian=false; if(looks_like_utf16(*keyToAdd,little_endian)){ - /* re-image this string as UTF16*/ - found_utf16 = true; - std::wstring utf16; - for(size_t i=0;isize()>0) { - size_t nullpos = tempKey->find('\000'); - if(nullpos==string::npos) break; - tempKey->erase(nullpos,1); - } - keyToAdd = tempKey; - } catch(utf8::invalid_utf16){ - /* Exception; bad UTF16 encoding */ - delete tempKey; - tempKey = 0; // give up on temp key; otherwise its invalidated below - } + tempKey = convert_utf16_to_utf8(*keyToAdd,little_endian); + if(tempKey){ + keyToAdd = tempKey; + found_utf16 = true; + } } /* If any conversion is necessary AND we have not converted key from UTF-16 to UTF-8, diff -Nru tcpflow-1.4.4+repack1/src/be13_api/histogram.h tcpflow-1.4.5+repack1/src/be13_api/histogram.h --- tcpflow-1.4.4+repack1/src/be13_api/histogram.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/histogram.h 2015-08-26 03:35:59.000000000 +0000 @@ -81,13 +81,18 @@ */ struct ReportElement { ReportElement(std::string aValue,histogramTally aTally):value(aValue),tally(aTally){ } - std::string value; // UTF-8 - histogramTally tally; - static bool compare(const ReportElement &e1,const ReportElement &e2) { + const std::string value; // UTF-8 + histogramTally tally; + static bool compare_ref(const ReportElement &e1,const ReportElement &e2) { if (e1.tally.count > e2.tally.count) return true; if (e1.tally.count < e2.tally.count) return false; return e1.value < e2.value; } + static bool compare(const ReportElement *e1,const ReportElement *e2) { + if (e1->tally.count > e2->tally.count) return true; + if (e1->tally.count < e2->tally.count) return false; + return e1->value < e2->value; + } virtual ~ReportElement(){}; }; @@ -104,19 +109,25 @@ */ static bool looks_like_utf16(const std::string &str,bool &little_endian); + /* These all allocate a string that must be freed */ + + static std::string *convert_utf16_to_utf8(const std::string &str); + static std::string *convert_utf16_to_utf8(const std::string &str,bool little_endian); + static std::string *make_utf8(const std::string &key); + HistogramMaker(uint32_t flags_):h(),flags(flags_){} void clear(){h.clear();} void add(const std::string &key); // adds a string to the histogram count - /** A FrequencyReportVector is a vector of report elements when the report is generatedn. + /** A FrequencyReportVector is a vector of report elements when the report is generated. */ - typedef std::vector FrequencyReportVector; + typedef std::vector FrequencyReportVector; /** makeReport() makes a report and returns a * FrequencyReportVector. */ FrequencyReportVector *makeReport() const; // return a report with all of them FrequencyReportVector *makeReport(int topN) const; // returns just the topN - virtual ~HistogramMaker(){} + virtual ~HistogramMaker(){}; }; std::ostream & operator <<(std::ostream &os,const HistogramMaker::FrequencyReportVector &rep); diff -Nru tcpflow-1.4.4+repack1/src/be13_api/Makefile tcpflow-1.4.5+repack1/src/be13_api/Makefile --- tcpflow-1.4.4+repack1/src/be13_api/Makefile 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/Makefile 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,8 @@ +CPPFLAGS=-I.. -I../.. -I. 
-I/usr/local/include -I/opt/local/include -L/usr/local/lib -L/opt/local/lib +all: + (cd ..; $(MAKE)) + + +stand: feature_sql +feature_sql: feature_recorder_sql.cpp beregex.cpp + g++ -o stand feature_recorder_sql.cpp beregex.cpp histogram.cpp unicode_escape.cpp -DSTAND -lsqlite3 $(CPPFLAGS) -lssl -ltre -lcrypto diff -Nru tcpflow-1.4.4+repack1/src/be13_api/Makefile.defs tcpflow-1.4.5+repack1/src/be13_api/Makefile.defs --- tcpflow-1.4.4+repack1/src/be13_api/Makefile.defs 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/Makefile.defs 2015-08-26 03:35:59.000000000 +0000 @@ -9,6 +9,7 @@ be13_api/feature_recorder.h \ be13_api/feature_recorder_set.cpp \ be13_api/feature_recorder_set.h \ + be13_api/feature_recorder_sql.cpp \ be13_api/histogram.h \ be13_api/histogram.cpp \ be13_api/net_ethernet.h \ diff -Nru tcpflow-1.4.4+repack1/src/be13_api/plugin.cpp tcpflow-1.4.5+repack1/src/be13_api/plugin.cpp --- tcpflow-1.4.4+repack1/src/be13_api/plugin.cpp 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/plugin.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -46,35 +46,6 @@ *** misc support ****************************************************************/ -#ifndef HAVE_ERR -#include -// noreturn attribute to avoid warning with GCC on Linux -static void err(int eval,const char *fmt,...) __attribute__ ((noreturn)); -static void err(int eval,const char *fmt,...) -{ - va_list ap; - va_start(ap,fmt); - vfprintf(stderr,fmt,ap); - va_end(ap); - fprintf(stderr,": %s\n",strerror(errno)); - exit(eval); -} -#endif - -#ifndef HAVE_ERRX -#include -// noreturn attribute to avoid warning with GCC on Linux -static void errx(int eval,const char *fmt,...) __attribute__ ((noreturn)); -static void errx(int eval,const char *fmt,...) -{ - va_list ap; - va_start(ap,fmt); - vfprintf(stderr,fmt,ap); - fprintf(stderr,"%s\n",strerror(errno)); - va_end(ap); - exit(eval); -} -#endif /**************************************************************** *** SCANNER PLUG-IN SYSTEM @@ -158,7 +129,7 @@ * As part of scanner loading: * - pass configuration to the scanner * - feature files that the scanner requires - * - Histograms that the scanner makes + * - Histograms that the scanner makes (see feature_recorder_set) * This is called before scanners are enabled or disabled, so the pcap handlers * need to be set afterwards */ @@ -173,7 +144,9 @@ * we use static values so that the sbuf is not constantly being created and destroyed. */ static const sbuf_t sbuf; - static feature_recorder_set fs(feature_recorder_set::SET_DISABLED); // dummy + static feature_recorder_set fs(feature_recorder_set::SET_DISABLED,feature_recorder_set::null_hasher, + feature_recorder_set::NO_INPUT, + feature_recorder_set::NO_OUTDIR); // dummy // // Each scanner's params are stored in a scanner_def object that @@ -207,7 +180,8 @@ /* Figure out the function name */ size_t extloc = fn.rfind('.'); if(extloc==std::string::npos){ - errx(1,"Cannot find '.' in %s",fn.c_str()); + fprintf(stderr,"Cannot find '.' 
in %s",fn.c_str()); + exit(1); } std::string func_name = fn.substr(0,extloc); size_t slashloc = func_name.rfind('/'); @@ -215,26 +189,36 @@ slashloc = func_name.rfind('\\'); if(slashloc!=std::string::npos) func_name = func_name.substr(slashloc+1); - std::cout << "Loading: " << fn << " (" << func_name << ")\n"; + if(debug) std::cout << "Loading: " << fn << " (" << func_name << ")\n"; scanner_t *scanner = 0; #if defined(HAVE_DLOPEN) void *lib=dlopen(fn.c_str(), RTLD_LAZY); if(lib==0){ - errx(1,"dlopen: %s\n",dlerror()); + fprintf(stderr,"dlopen: %s\n",dlerror()); + exit(1); } /* Resolve the symbol */ scanner = (scanner_t *)dlsym(lib, func_name.c_str()); - if(scanner==0) errx(1,"dlsym: %s\n",dlerror()); + if(scanner==0){ + fprintf(stderr,"dlsym: %s\n",dlerror()); + exit(1); + } #elif defined(HAVE_LOADLIBRARY) /* Use Win32 LoadLibrary function */ /* See http://msdn.microsoft.com/en-us/library/ms686944(v=vs.85).aspx */ HINSTANCE hinstLib = LoadLibrary(TEXT(fn.c_str())); - if(hinstLib==0) errx(1,"LoadLibrary(%s) failed",fn.c_str()); + if(hinstLib==0){ + fprintf(stderr,"LoadLibrary(%s) failed",fn.c_str()); + exit(1); + } scanner = (scanner_t *)GetProcAddress(hinstLib,func_name.c_str()); - if(scanner==0) errx(1,"GetProcAddress(%s) failed",func_name.c_str()); + if(scanner==0){ + fprintf(stderr,"GetProcAddress(%s) failed",func_name.c_str()); + exit(1); + } #else std::cout << " ERROR: Support for loadable libraries not enabled\n"; return; @@ -253,7 +237,8 @@ { DIR *dirp = opendir(dirname.c_str()); if(dirp==0){ - err(1,"Cannot open directory %s:",dirname.c_str()); + fprintf(stderr,"Cannot open directory %s:",dirname.c_str()); + exit(1); } struct dirent *dp; while ((dp = readdir(dirp)) != NULL){ @@ -420,18 +405,12 @@ } } -/**************************************************************** - *** PHASE HISTOGRAM (formerly phase 3): Create the histograms - ****************************************************************/ +/************************************ + *** HELP and option processing *** + ************************************/ -/** - * Note currently we have two kinds of histograms: - * post-processing histograms specified by the histogram library, and in-memory histograms. - * that are really only used by scan_bulk. - */ - -/* option processing */ /* Get the config and build the help strings at the same time! 
*/ + std::stringstream scanner_info::helpstream; void scanner_info::get_config(const scanner_info::config_t &c, const std::string &n,std::string *val,const std::string &help) @@ -598,6 +577,8 @@ const pos0_t &pos0 = sp.sbuf.pos0; class feature_recorder_set &fs = sp.fs; + fs.heartbeat(); // note that we are alive + { /* note the maximum depth that we've seen */ cppmutex::lock lock(max_depth_seenM); diff -Nru tcpflow-1.4.4+repack1/src/be13_api/README.md tcpflow-1.4.5+repack1/src/be13_api/README.md --- tcpflow-1.4.4+repack1/src/be13_api/README.md 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/README.md 2015-08-26 03:35:59.000000000 +0000 @@ -61,11 +61,8 @@ Summary: -$ git checkout -b newbranch -$ git checkout master -$ git merge newbranch -$ git branch -d newbranch +$ git checkout -b tmp; git checkout master; git merge tmp; git branch -d tmp -or: +Followed by: -$ git checkout -b tmp ; git checkout master ; git merge tmp ; git branch -d tmp ; git push git@github.com:simsong/be13_api.git master +$ git push git@github.com:simsong/be13_api.git master diff -Nru tcpflow-1.4.4+repack1/src/be13_api/sbuf.h tcpflow-1.4.5+repack1/src/be13_api/sbuf.h --- tcpflow-1.4.4+repack1/src/be13_api/sbuf.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/sbuf.h 2015-08-26 03:35:59.000000000 +0000 @@ -59,19 +59,22 @@ * in a 64-bit number. */ -inline int64_t stoi64(std::string str){ +inline int64_t stoi64(std::string str) +{ int64_t val(0); std::istringstream ss(str); ss >> val; return val; } + class pos0_t { public: - std::string path; /* forensic path of decoders*/ - uint64_t offset; /* location of buf[0] */ + const std::string path; /* forensic path of decoders*/ + const uint64_t offset; /* location of buf[0] */ explicit pos0_t():path(""),offset(0){} pos0_t(std::string s):path(s),offset(0){} + pos0_t(std::string s,uint64_t o):path(s),offset(o){} pos0_t(const pos0_t &obj):path(obj.path),offset(obj.offset){ } std::string str() const { // convert to a string, with offset included std::stringstream ss; @@ -110,26 +113,26 @@ } return desc; } + uint64_t imageOffset() const { // return the offset from start of disk + if(path.size()>0) return stoi64(path); + return offset; + } + /** * Return a new position that's been shifted by an offset */ pos0_t shift(int64_t s) const { if(s==0) return *this; - pos0_t ret; size_t p = path.find('-'); if(p==std::string::npos){ // no path - ret.path=""; - ret.offset = offset + s; - return ret; + return pos0_t("",offset+s); } /* Figure out the value of the shift */ int64_t baseOffset = stoi64(path.substr(0,p-1)); std::stringstream ss; ss << (baseOffset+s) << path.substr(p); - ret.path = ss.str(); - ret.offset = offset; - return ret; + return pos0_t(ss.str(),offset); } }; @@ -143,18 +146,15 @@ /** Append a string (subdir). * The current offset is a prefix to the subdir. */ -inline class pos0_t operator +(pos0_t pos0,const std::string &subdir) { +inline class pos0_t operator +(pos0_t pos,const std::string &subdir) { std::stringstream ss; - ss << pos0.offset; - pos0.path += (pos0.path.size()>0 ? "-" : "") + ss.str() + "-" + subdir; - pos0.offset = 0; - return pos0; + ss << pos.path << (pos.path.size()>0 ? 
"-" : "") << pos.offset << "-" << subdir; + return pos0_t(ss.str(),0); }; /** Adding an offset */ -inline class pos0_t operator +(pos0_t pos0,int64_t delta) { - pos0.offset += delta; - return pos0; +inline class pos0_t operator +(pos0_t pos,int64_t delta) { + return pos0_t(pos.path,pos.offset+delta); }; /** \name Comparision operations diff -Nru tcpflow-1.4.4+repack1/src/be13_api/utils.h tcpflow-1.4.5+repack1/src/be13_api/utils.h --- tcpflow-1.4.4+repack1/src/be13_api/utils.h 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/utils.h 2015-08-26 03:35:59.000000000 +0000 @@ -48,14 +48,18 @@ #endif #ifndef HAVE_LOCALTIME_R +#ifdef __MINGW32__ +#undef localtime_r +#endif void localtime_r(time_t *t,struct tm *tm); #endif - -// gmtime.h definition moved to bulk_extractor_i.h -//#ifndef HAVE_GMTIME_R -//void gmtime_r(time_t *t,struct tm *tm); -//#endif +#ifndef HAVE_GMTIME_R +#ifdef __MINGW32__ +#undef gmtime_r +#endif +void gmtime_r(time_t *t,struct tm *tm); +#endif int64_t get_filesize(int fd); diff -Nru tcpflow-1.4.4+repack1/src/be13_api/word_and_context_list.cpp tcpflow-1.4.5+repack1/src/be13_api/word_and_context_list.cpp --- tcpflow-1.4.4+repack1/src/be13_api/word_and_context_list.cpp 2014-01-10 05:19:21.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/be13_api/word_and_context_list.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -1,7 +1,11 @@ +/** + * class word_and_context_list reads from disk and maintains in memory + * a data structure that is used for the stop list and alert list. + */ + #include "config.h" #include "word_and_context_list.h" #include "beregex.h" -//#include "feature_recorder.h" void word_and_context_list::add_regex(const std::string &pat) { @@ -36,12 +40,6 @@ while(getline(i,line)){ line_counter++; if(line.size()==0) continue; -// if(line_counter==1 && line.size()>3 -// && line[0]==feature_recorder::UTF8_BOM[0] -// && line[1]==feature_recorder::UTF8_BOM[1] -// && line[2]==feature_recorder::UTF8_BOM[2]){ -// line = line.substr(3); // remove the UTF8 BOM -// } if(line[0]=='#') continue; // it's a comment if((*line.end())=='\r'){ line.erase(line.end()); /* remove the last character if it is a \r */ diff -Nru tcpflow-1.4.4+repack1/src/datalink.cpp tcpflow-1.4.5+repack1/src/datalink.cpp --- tcpflow-1.4.4+repack1/src/datalink.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/datalink.cpp 2015-08-26 03:34:50.000000000 +0000 @@ -245,7 +245,7 @@ #ifdef DLT_LINUX_SLL { dl_linux_sll, DLT_LINUX_SLL }, #endif -#ifndef WIN32 +#if defined(USE_WIFI) && !defined(WIN32) { dl_ieee802_11_radio, DLT_IEEE802_11 }, { dl_ieee802_11_radio, DLT_IEEE802_11_RADIO }, { dl_prism, DLT_PRISM_HEADER}, diff -Nru tcpflow-1.4.4+repack1/src/datalink_wifi.cpp tcpflow-1.4.5+repack1/src/datalink_wifi.cpp --- tcpflow-1.4.4+repack1/src/datalink_wifi.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/datalink_wifi.cpp 2015-08-26 03:34:50.000000000 +0000 @@ -42,7 +42,9 @@ void dl_prism(u_char *user, const struct pcap_pkthdr *h, const u_char *p) { +#ifdef DLT_PRISM_HEADER theWcap.handle_packet(&TFCB::theTFCB,DLT_PRISM_HEADER,h,p); +#endif } diff -Nru tcpflow-1.4.4+repack1/src/dfxml/.gitignore tcpflow-1.4.5+repack1/src/dfxml/.gitignore --- tcpflow-1.4.4+repack1/src/dfxml/.gitignore 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/.gitignore 2015-08-26 03:35:59.000000000 +0000 @@ -1,4 +1,5 @@ *.o +*.lo *.pyc *~ .deps diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/allocation_counter.py 
tcpflow-1.4.5+repack1/src/dfxml/python/allocation_counter.py --- tcpflow-1.4.4+repack1/src/dfxml/python/allocation_counter.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/allocation_counter.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +""" +For a disk image or DFXML file, this program produces a cross-tabulation of the allocation state of each file's inode and name. +""" + +__version__ = "0.1.1" +#Version 0.2.0: +# * Tabular output in HTML +# * Tabular output in LaTeX + +import Objects +import make_differential_dfxml + +import collections +import logging +import sys +import xml.etree.ElementTree as ET +import os + +_logger = logging.getLogger(os.path.basename(__file__)) + +def main(): + counter = collections.defaultdict(lambda: 0) + prev_obj = None + for (event, obj) in Objects.iterparse(args.input_image): + if isinstance(obj, Objects.FileObject): + if args.ignore_virtual_files and make_differential_dfxml.ignorable_name(obj.filename): + continue + counter[(obj.alloc_inode, obj.alloc_name)] += 1 + + #Inspect weird data + if args.debug and obj.alloc_inode is None and obj.alloc_name is None: + _logger.debug("Encountered a file with all-null allocation.") + _logger.debug("Event: %r." % event) + _logger.debug("Previous object: %s." % ET.tostring(prev_obj.to_Element())) + _logger.debug("Current object: %s." % ET.tostring(obj.to_Element())) + prev_obj = obj + print(repr(counter)) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--ignore-virtual-files", action="store_true", help="Use the same file-ignoring rules as make_differential_dfxml.py.") + parser.add_argument("-d", "--debug", action="store_true", help="Enable debug printing.") + parser.add_argument("input_image", help="Disk image, or DFXML file.") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + main() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/break_out_diffs_by_anno.py tcpflow-1.4.5+repack1/src/dfxml/python/break_out_diffs_by_anno.py --- tcpflow-1.4.4+repack1/src/dfxml/python/break_out_diffs_by_anno.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/break_out_diffs_by_anno.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +""" +This program reads a DFXML file with differential annotations and produces a table. + +Columns: FileObject annotation (is it a new file? renamed? etc.). +Rows: Counts of instances of a property being changed per FileObject annotation. One row per FileObject direct-child element. +""" + +__version__ = "0.1.0" + +import Objects +import sys +import collections + +def main(): + #Key: (annotation, histogram) + hist = collections.defaultdict(int) + for (event, obj) in Objects.iterparse(sys.argv[1]): + if event != "end" or not isinstance(obj, Objects.FileObject): + continue + #Loop through annotations + for anno in obj.annos: + #Loop through diffs + for diff in obj.diffs: + hist[(anno, diff)] += 1 + + annos = Objects.FileObject._diff_attr_names.keys() + print(""" + + + + +""") + for anno in annos: + print(" " % anno) + print(""" + + + + +""") + for diff in sorted(Objects.FileObject._all_properties): + print(" ") + if diff in Objects.FileObject._incomparable_properties: + continue + print(" " % diff) + for anno in annos: + print(" " % hist[(anno,diff)]) + print(" ") + print(""" + +
+""") + +if __name__ == "__main__": + main() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/cat_fileobjects.py tcpflow-1.4.5+repack1/src/dfxml/python/cat_fileobjects.py --- tcpflow-1.4.4+repack1/src/dfxml/python/cat_fileobjects.py 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/cat_fileobjects.py 2015-08-26 03:35:59.000000000 +0000 @@ -1,15 +1,18 @@ #!/usr/bin/env python3 #Make a new DFXML file of all fileobjects in an input DFXML file. -__version__ = "0.2.0" +__version__ = "0.2.1" import sys import xml.etree.ElementTree as ET import dfxml import logging +import os + +_logger = logging.getLogger(os.path.basename(__file__)) if sys.version < "3": - logging.error("Due to Unicode issues with Python 2's ElementTree, Python 3 and up is required.\n") + _logger.error("Due to Unicode issues with Python 2's ElementTree, Python 3 and up is required.\n") exit(1) def main(): @@ -36,15 +39,15 @@ xs = [] for fi in dfxml.iter_dfxml(xmlfile=open(args.filename, "rb"), preserve_elements=True): - logging.debug("Processing: %s" % str(fi)) + _logger.debug("Processing: %s" % str(fi)) if args.cache: xs.append(fi.xml_element) else: - logging.debug("Printing without cache: %s" % str(fi)) + _logger.debug("Printing without cache: %s" % str(fi)) print(dfxml.ET_tostring(fi.xml_element, encoding="unicode")) if args.cache: for x in xs: - logging.debug("Printing with cache: %s" % str(fi)) + _logger.debug("Printing with cache: %s" % str(fi)) print(dfxml.ET_tostring(x, encoding="unicode")) print("""""") diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/cat_partitions.py tcpflow-1.4.5+repack1/src/dfxml/python/cat_partitions.py --- tcpflow-1.4.4+repack1/src/dfxml/python/cat_partitions.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/cat_partitions.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +""" +Writes to stdout a DFXML document that is the concatenation of each input DFXML document. + +Assumes the input DFXML has at most one volume per document. + +Partition numbers, offsets, and byte run img_offset attributes are all overwritten with the correct value assuming the partition was carved at the offset passed on the command line. + +That is, this command: + + $0 32256:fiout.xml 1073741824:fiout.xml + +will create a single DFXML file with two volumes and their file objects contained. +""" + +__version__ = "0.1.1" + +import Objects +import logging +import os +import sys +import xml.etree.ElementTree as ET + +_logger = logging.getLogger(os.path.basename(__file__)) + +def main(): + d = Objects.DFXMLObject(version="1.1.0") + + d.command_line = " ".join(sys.argv) + + _offsets_and_pxml_paths = [] + for (lxfno, lxf) in enumerate(args.labeled_xml_file): + lxf_parts = lxf.split(":") + if len(lxf_parts) != 2 or not lxf_parts[0].isdigit(): + raise ValueError("Malformed argument in labeled_xml_file. Expecting space-delimited list of ':'. This entry doesn't work: %r." % lxf) + offset = int(lxf_parts[0]) + path = lxf_parts[1] + _offsets_and_pxml_paths.append((offset,path)) + offsets_and_pxml_paths = sorted(_offsets_and_pxml_paths) + + for (pxml_path_index, (offset, pxml_path)) in enumerate(offsets_and_pxml_paths): + _logger.debug("Running on path %r." % pxml_path) + pdo = Objects.parse(pxml_path) + + building_volume = None + #Fetch or build volume we'll append + if len(pdo.volumes) > 1: + raise ValueError("An input DFXML document has multiple volumes; this script assumes each input document only has one. The document here has %d: %r." 
% (len(pdo.volumes), pxml_path)) + elif len(pdo.volumes) == 0: + v = Objects.VolumeObject() + building_volume = True + else: + v = pdo.volumes[0] + building_volume = False + + v.partition_offset = offset + + #Accumulate namespaces + for (prefix, url) in pdo.iter_namespaces(): + d.add_namespace(prefix, url) + + for obj in pdo: + #Force-update image offsets in byte runs + for brs_prop in ["data_brs", "name_brs", "inode_brs"]: + if hasattr(obj, brs_prop): + brs = getattr(obj, brs_prop) + if brs is None: + continue + for br in brs: + if not br.fs_offset is None: + br.img_offset = br.fs_offset + offset + #For files, set partition identifier and attach to partition + if isinstance(obj, Objects.FileObject): + obj.partition = pxml_path_index + 1 + if building_volume: + v.append(obj) + + #Collect the constructed and/or updated volume + d.append(v) + + d.print_dfxml() + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", help="Enable debug printing", action="store_true") + parser.add_argument("--image-path", help="Path to the source image file to record in the resulting DFXML.") + parser.add_argument("labeled_xml_file", help="List of DFXML files, each colon-prefixed with the partition's offset in bytes (e.g. '32256:fiout.dfxml')", nargs="+") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + main() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/ChangeLog tcpflow-1.4.5+repack1/src/dfxml/python/ChangeLog --- tcpflow-1.4.4+repack1/src/dfxml/python/ChangeLog 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/ChangeLog 2015-08-26 03:35:59.000000000 +0000 @@ -1,3 +1,40 @@ +2014-06-04 + + * hash_sectors.py: Add optional tail padding. + +2014-05-07 + + * Objects.py: Corrected buffering return rate for "fill" byte runs. + +2014-04-17 + + * Demos: Added a new demonstration program showing construction of a MACCr timeline with the Objects interface. Unit tests updated. + * Objects.py: Updated documentation. + +2014-03-07 + + * Objects.py: API change - switched "meta" byte run facet to "inode", keeping in line with "alloc_inode". Unit tests updated. + +2014-03-06 + + * Objects.py: Created. New object-oriented bindings for DFXML. Primary usage difference is better in-memory manipulation, including entire DFXML document creation without using a DFXML file; property getters and setters, instead of function calls; and a different approach to serializers and de-serializers from DFXML files. + * make_differential_dfxml.py: Created. Revised approach to taking differences of disk images. Creates a differential DFXML file, or an in-memory DFXML Object if imported as a library. + * summarize_differential_dfxml.py: Created. Reports on disk image differences, as previously done by idifference.py. + * idifference2.py: Created. A re-implementation of idifference.py, using the new Object bindings, make_differential_dfxml.py, and summarize_differential_dfxml.py. + * Differential analysis scripts: Created allocation_counter.py, break_out_diffs_by_anno.py, report_silent_changes.py + * cat_partitions.py: Created. Concatenates single-partition DFXML files into one DFXML file. + * hash_sectors.py: Created. Hashes sectors of files, storing output in a SQLite database. + * Extractor.py: Created. Library for general file extraction from a disk image. A more modular iextract.py. + * Makefile: Unit tests added. + * test_Objects: Created. Unit test directory for new Object bindings. 
+ * samples: More DFXML samples added for differencing tests. + * Logging: Modules using the logging module now report the file that contained the call to each log message. Previously, all calls were done with the 'root' logger. + * dfxml_tool.py: Unit test now runs on a smaller directory tree. + * idifference.py: Corrected a counting bug. + * dfxml.py: Time objects can now be instantiated from floats. + * dfxml.py: Allocation can now be parsed at a more granular level - inode and name, instead of simply "allocated." + * dfxml.py: DFXML files with "original_fileobject" elements attached to fileobject elements can now be parsed. + 2013-11-02 * idifference.py: Imported null-variable tests, and corrected a variable reference, to help idifference to be used as a module diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/demo_mac_timeline_objects.py tcpflow-1.4.5+repack1/src/dfxml/python/demo_mac_timeline_objects.py --- tcpflow-1.4.4+repack1/src/dfxml/python/demo_mac_timeline_objects.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/demo_mac_timeline_objects.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# produce a MAC-times timeline using the DFXML Objects interface. +# works under either Python2 or Python3 +import Objects +import sys + +def main(): + if len(sys.argv) < 2: + print("Usage: {} ".format(sys.argv[0])) + exit(1) + + timeline = [] + + for (event, obj) in Objects.iterparse( sys.argv[1] ): + #Only work on FileObjects + if not isinstance(obj, Objects.FileObject): + continue + if not obj.mtime is None: timeline.append([obj.mtime, obj.filename," modified"]) + if not obj.crtime is None: timeline.append([obj.crtime,obj.filename," created"]) + if not obj.ctime is None: timeline.append([obj.ctime, obj.filename," changed"]) + if not obj.atime is None: timeline.append([obj.atime, obj.filename," accessed"]) + + timeline.sort() + + for record in timeline: + print("\t".join( map(str, record)) ) + +if __name__ == "__main__": + main() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/dfxml.py tcpflow-1.4.5+repack1/src/dfxml/python/dfxml.py --- tcpflow-1.4.4+repack1/src/dfxml/python/dfxml.py 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/dfxml.py 2015-08-26 03:35:59.000000000 +0000 @@ -34,13 +34,18 @@ from subprocess import Popen,PIPE import base64 import hashlib +import os import datetime +import logging +_logger = logging.getLogger(os.path.basename(__file__)) + __version__ = "1.0.1" tsk_virtual_filenames = set(['$FAT1','$FAT2']) +XMLNS_DC = "http://purl.org/dc/elements/1.1/" XMLNS_DFXML = "http://www.forensicswiki.org/wiki/Category:Digital_Forensics_XML" XMLNS_DELTA = "http://www.forensicswiki.org/wiki/Forensic_Disk_Differencing" @@ -337,9 +342,9 @@ #(The check for 13:15 gets the 14th and 15th characters, since the day can be single- or double-digit.) self.datetime_ = rfc822Tdatetime(val) else: - #Maybe the data are a string-wrapped int? + #Maybe the data are a string-wrapped int or float? #If this fails, the string format is completely unexpected, so just raise an error. 
- self.timestamp_ = int(val) + self.timestamp_ = float(val) elif type(val)==int or type(val)==float: self.timestamp_ = val elif isinstance(val, datetime.datetime): @@ -356,7 +361,7 @@ def __str__(self): return self.iso8601() or "" def __repr__(self): - return self.iso8601() or "None" + return repr(self.iso8601()) or "None" def __le__(self,b): if b is None: return None return self.iso8601().__le__(b.iso8601()) @@ -601,7 +606,7 @@ def ext(self): """Extension, as a lowercase string without the leading '.'""" - import os, string + import string (base,ext) = os.path.splitext(self.filename()) if ext == '': return None @@ -699,9 +704,17 @@ return self.name_type()=='r' or self.name_type()==None def inode(self): - """Inode; may be a number or SleuthKit x-y-z formatr""" + """Inode; may be a number or SleuthKit x-y-z format""" return self.tag("inode") + def allocated_inode(self): + """Returns True if the file's inode data structure is allocated, False otherwise. (Does not return None.)""" + return isone(self.tag("alloc_inode")) + + def allocated_name(self): + """Returns True if the file's name data structure is allocated, False otherwise. (Does not return None.)""" + return isone(self.tag("alloc_name")) + def allocated(self): """Returns True if the file is allocated, False if it was not (that is, if it was deleted or is an orphan). @@ -709,7 +722,10 @@ We also need to tolerate the case of the unalloc tag being used. """ if self.filename()=="$OrphanFiles": return False - return isone(self.tag("alloc")) or isone(self.tag("ALLOC")) or not isone(self.tag("unalloc")) + if self.allocated_inode() and self.allocated_name(): + return True + else: + return isone(self.tag("alloc")) or isone(self.tag("ALLOC")) or not isone(self.tag("unalloc")) def compressed(self): if not self.has_tag("compressed") and not self.has_tag("compressed") : return False @@ -1162,10 +1178,28 @@ self.imageobject = imageobject_sax() self.imagefile = imagefile self.flags = flags + self._sax_fi_pointer = None xml_reader.__init__(self) + + @property + def _sax_fi_pointer(self): + """ + This internal field of a fileobject_reader is a simple state machine. A DFXML stream can contain fileobjects which contain original_fileobjects, which require the same parsing mechanisms. This pointer saves on duplicating code with the SAX parser. + + Type: None, or dfxml.fileobject. Type enforced by the setter method. 
+ """ + return self._sax_fi_pointer_ + @_sax_fi_pointer.setter + def _sax_fi_pointer(self, val): + if val is None: + self._sax_fi_pointer_ = None + else: + assert isinstance(val, fileobject) + self._sax_fi_pointer_ = val def _start_element(self, name, attrs): """ Handles the start of an element for the XPAT scanner""" + _logger.debug("fileobject_reader._start_element: name = %r" % name) self.tagstack.append(name) self.cdata = "" # new element, so reset the data if name=="volume": @@ -1180,6 +1214,12 @@ if name=="fileobject": self.fileobject = fileobject_sax(imagefile=self.imagefile) self.fileobject.volume = self.volumeobject + self._sax_fi_pointer = self.fileobject + return + if name=="original_fileobject": + self.fileobject.original_fileobject = fileobject_sax(imagefile=self.imagefile) + #self.original_fileobject.volume = self.volumeobject #TODO + self._sax_fi_pointer = self.fileobject.original_fileobject return if name=='hashdigest': self.hashdigest_type = attrs['type'] @@ -1191,7 +1231,7 @@ def _end_element(self, name): - """Handles the end of an eleement for the XPAT scanner""" + """Handles the end of an element for the XPAT scanner""" assert(self.tagstack.pop()==name) # make sure that the stack matches if name=="volume": self.volumeobject = None @@ -1207,18 +1247,22 @@ self.fi_history.append(self.fileobject) self.fileobject = None return + if name=="original_fileobject": + self._sax_fi_pointer = self.fileobject + return if name=='hashdigest' and len(self.tagstack)>0: top = self.tagstack[-1] # what the hash was for alg = self.hashdigest_type.lower() # name of the hash algorithm used if top=='byte_run': - self.fileobject._byte_runs[-1].hashdigest[alg] = self.cdata - if top=="fileobject": - self.fileobject._tags[alg] = self.cdata # legacy - self.fileobject.hashdigest[alg] = self.cdata + self._sax_fi_pointer._byte_runs[-1].hashdigest[alg] = self.cdata + if top in ["fileobject", "original_fileobject"]: + self._sax_fi_pointer._tags[alg] = self.cdata # legacy + self._sax_fi_pointer.hashdigest[alg] = self.cdata self.cdata = None return - if self.fileobject: # in a file object, all tags are remembered - self.fileobject._tags[name] = self.cdata + + if self._sax_fi_pointer: # in file objects, all tags are remembered + self._sax_fi_pointer._tags[name] = self.cdata self.cdata = None return # Special case: fn diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/Extractor.py tcpflow-1.4.5+repack1/src/dfxml/python/Extractor.py --- tcpflow-1.4.4+repack1/src/dfxml/python/Extractor.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/Extractor.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 + +__version__ = "0.4.0" + +import Objects +import os +import sys +import logging +import hashlib +import copy +import traceback +import dfxml + +_logger = logging.getLogger(os.path.basename(__file__)) + +XMLNS_EXTRACTOR = "#Extractor.py" + +def is_alloc_and_uncompressed(obj): + if obj.compressed: + return False + if not obj.alloc_inode is None and not obj.alloc_name is None: + return obj.alloc_inode and obj.alloc_name + return obj.alloc + +def is_file(obj): + if is_alloc_and_uncompressed(obj) != True: + return False + if obj.filename is None: + return None + return obj.name_type == "r" + +def is_jpeg(obj): + if is_alloc_and_uncompressed(obj) != True: + return False + if obj.filename is None: + return None + if is_file(obj) != True: + return False + return obj.filename.lower().endswith(("jpg","jpeg")) + +def name_with_part_path(fobj): + retval = fobj.filename 
+ if fobj.partition is None: + retval = os.path.join("no_partition", retval) + else: + retval = os.path.join("partition_" + str(fobj.partition), retval) + return retval + +def extract_files(image_path, outdir, dfxml_path=None, file_predicate=is_file, file_name=name_with_part_path, dry_run=None, out_manifest_path=None, err_manifest_path=None, keep_going=False): + """ + @param file_name Unary function. Takes a Objects.FileObject; returns the file path to which this file will be extracted, relative to outdir. So, if outdir="extraction" and the name_with_part_path function of this module is used, the file "/Users/Administrator/ntuser.dat" in partition 1 will be extracted to "extraction/partition_1/Users/Administrator/ntuser.dat". + """ + + extraction_byte_tally = 0 + + _path_for_iterparse = dfxml_path or image_path + + #Set up base manifest to track extracted files + base_manifest = Objects.DFXMLObject() + base_manifest.command_line = " ".join(sys.argv) + base_manifest.version = "1.1.0+" + base_manifest.add_namespace("extractor", XMLNS_EXTRACTOR) + base_manifest.add_namespace("delta", dfxml.XMLNS_DELTA) + base_manifest.sources.append(image_path) + if dfxml_path: + base_manifest.sources.append(dfxml_path) + + #Clone base manifest to all-files' manifest and errors-only manifest + out_manifest = None + if out_manifest_path: + out_manifest = copy.deepcopy(base_manifest) + err_manifest = None + if err_manifest_path: + err_manifest = copy.deepcopy(base_manifest) + + for (event, obj) in Objects.iterparse(_path_for_iterparse): + #Absolute prerequisites: + if not isinstance(obj, Objects.FileObject): + continue + + #Invoker prerequisites + if not file_predicate(obj): + continue + + extraction_entry = Objects.FileObject() + extraction_entry.original_fileobject = obj + + #Construct path where the file will be extracted + extraction_write_path = os.path.join(outdir, file_name(obj)) + + #Extract idempotently + if os.path.exists(extraction_write_path): + _logger.debug("Skipping already-extracted file: %r. Extraction path already exists: %r." % (obj.filename, extraction_write_path)) + continue + + extraction_entry.filename = extraction_write_path + + #Set up checksum verifier + checker = None + checked_byte_tally = 0 + if obj.sha1: + checker = hashlib.sha1() + + extraction_byte_tally += obj.filesize + + any_error = None + tsk_error = None + if not dry_run: + extraction_write_dir = os.path.dirname(extraction_write_path) + if not os.path.exists(extraction_write_dir): + os.makedirs(extraction_write_dir) + _logger.debug("Extracting to: %r." % extraction_write_path) + with open(extraction_write_path, "wb") as extraction_write_fh: + try: + for chunk in obj.extract_facet("content", image_path): + if checker: + checker.update(chunk) + checked_byte_tally += len(chunk) + extraction_write_fh.write(chunk) + + if checked_byte_tally != obj.filesize: + any_error = True + extraction_entry.filesize = checked_byte_tally + extraction_entry.diffs.add("filesize") + _logger.error("File size mismatch on %r." % obj.filename) + _logger.info("Recorded filesize = %r" % obj.filesize) + _logger.info("Extracted bytes = %r" % checked_byte_tally) + if checker and (obj.sha1 != checker.hexdigest()): + any_error = True + extraction_entry.sha1 = checker.hexdigest() + extraction_entry.diffs.add("sha1") + _logger.error("Hash mismatch on %r." % obj.filename) + _logger.info("Recorded SHA-1 = %r" % obj.sha1) + _logger.info("Computed SHA-1 = %r" % checker.hexdigest()) + #_logger.debug("File object: %r." 
% obj) + except Exception as e: + any_error = True + tsk_error = True + extraction_entry.error = "".join(traceback.format_stack()) + if e.args: + extraction_entry.error += "\n" + str(e.args) + if out_manifest: + out_manifest.append(extraction_entry) + if err_manifest and any_error: + err_manifest.append(extraction_entry) + if tsk_error and not keep_going: + _logger.warning("Terminating extraction loop early, due to encountered error.") + break + + #Report + _logger.info("Estimated extraction: %d bytes." % extraction_byte_tally) + if not out_manifest is None: + with open(out_manifest_path, "w") as out_manifest_fh: + out_manifest.print_dfxml(out_manifest_fh) + if not err_manifest is None: + tally = 0 + for obj in err_manifest: + if isinstance(obj, Objects.FileObject): + tally += 1 + _logger.info("Encountered errors extracting %d files." % tally) + with open(err_manifest_path, "w") as err_manifest_fh: + err_manifest.print_dfxml(err_manifest_fh) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument("--dry-run", action="store_true", help="Do not write files to disk. Only verifies computed vs. stored checksums of file content.") + parser.add_argument("-x", "--xml", help="Pre-computed DFXML file. If not supplied, Fiwalk is called on the image argument.") + parser.add_argument("-k", "--keep-going", action="store_true", help="If a SleuthKit process error is encountered in extracting any file (note: this excludes checksum mismatches), the extraction halts unless this flag is passed.") + parser.add_argument("--output-manifest", help="Path for recording DFXML manifest of all extracted files.") + parser.add_argument("--error-manifest", help="Path for recording DFXML manifest of only files extracted with errors.") + parser.add_argument("image", help="Subject disk image from which files will be extracted.") + parser.add_argument("output_directory", help="Target output directory. Can already exist.") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + extract_files(args.image, args.output_directory, args.xml, is_file, name_with_part_path, args.dry_run, args.output_manifest, args.error_manifest, args.keep_going) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/hash_sectors.py tcpflow-1.4.5+repack1/src/dfxml/python/hash_sectors.py --- tcpflow-1.4.4+repack1/src/dfxml/python/hash_sectors.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/hash_sectors.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,184 @@ + +__version__ = "0.3.0" + +import Objects +import logging +import os +import hashlib +import sqlite3 + +_logger = logging.getLogger(os.path.basename(__file__)) + +_nagged_ids = False +_used_ids = set() +_last_id = 1 + +def _generate_id(): + """ + Creates an ID number unique to all DFXML documents ran through write_sector_hashes_to_db in this process. 
+ """ + global _used_ids + global _last_id + while _last_id in _used_ids: + _last_id += 1 + _used_ids.add(_last_id) + return _last_id + +sql_schema_files = """CREATE TABLE files( + obj_id INTEGER NOT NULL, + partition INTEGER, + inode INTEGER, + filename TEXT, + filesize INTEGER +);""" +sql_schema_block_hashes = """CREATE TABLE block_hashes( + obj_id INTEGER NOT NULL, + img_offset INTEGER, + fs_offset INTEGER, + file_offset INTEGER, + len INTEGER NOT NULL, + md5 TEXT, + sha1 TEXT +);""" + +def write_sector_hashes_to_db(raw_image, dfxml_doc, predicate, db_output_path, pad_sectors=False): + """ + Produces sector hashes of all files that fit a predicate. + Predicate function: Takes a FileObject as input; returns True if the FileObject should have its sectors hashed (if possible). + """ + global _used_ids + + if os.path.exists(db_output_path): + raise ValueError("Database output path exists. Aborting - will not overwrite. (Path: %r.)" % db_output_path) + + conn = sqlite3.connect(db_output_path) + conn.isolation_level = "EXCLUSIVE" + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + cursor.execute(sql_schema_files) + cursor.execute(sql_schema_block_hashes) + conn.commit() + + for (obj_no, obj) in enumerate(dfxml_doc): + if not isinstance(obj, Objects.FileObject): + continue + if not predicate(obj): + continue + brs = obj.data_brs + if brs is None: + continue + if obj.id is None: + if not _nagged_ids: + _logger.info("At least one FileObject had a null .id property. Generating IDs.") + _nagged_ids = True + obj.id = _generate_id() + else: + if obj.id in _used_ids: + _logger.warning("ID reuse: %r." % obj.id) + _used_ids.add(obj.id) + try: + file_offset = 0 + cursor.execute("INSERT INTO files(obj_id, partition, inode, filename, filesize) VALUES (?,?,?,?,?);", ( + obj.id, + obj.partition, + obj.inode, + obj.filename, + obj.filesize + )) + found_incomplete_chunk = False + for chunk in brs.iter_contents(raw_image, buffer_size=512): + if found_incomplete_chunk: + _logger.debug("File with unexpected mid-stream incomplete byte run: %r." % obj) + raise ValueError("Found incomplete sector in middle of byte_runs list.") + md5obj = hashlib.md5() + sha1obj = hashlib.sha1() + + md5obj.update(chunk) + sha1obj.update(chunk) + + if pad_sectors and len(chunk) < 512: + found_incomplete_chunk = True + remainder = 512 - len(chunk) + nulls = remainder * b"0" + md5obj.update(nulls) + sha1obj.update(nulls) + + #TODO No img_offset or fs_offset for now; could be done with a little byte_runs offset acrobatics, or a request to restore sector hash records in DFXML. + cursor.execute("INSERT INTO block_hashes(obj_id, img_offset, fs_offset, file_offset, len, md5, sha1) VALUES (?,?,?,?,?,?,?);", ( + obj.id, + None, + None, + file_offset, + len(chunk), + md5obj.hexdigest(), + sha1obj.hexdigest() + )) + + file_offset += len(chunk) + if not obj.filesize is None and file_offset != obj.filesize: + _logger.warning("The hashed blocks' lengths do not sum to the filesize recorded: respectively, %d and %d. File ID %r." % (file_offset, obj.filesize, obj.id)) + except AttributeError as e: + #Some files' contents can't be accessed straightforwardly. Note and skip. + _logger.error(e.args[0] + (" File ID %r." % obj.id)) + _logger.debug("The problem FileObject: %r." % obj) + + #Commit every thousand files + if obj_no % 1000 == 999: + _logger.debug("Committing hashes of object number %d." 
% obj_no) + conn.commit() + conn.commit() + conn.close() + +def is_allocated(fobj): + if fobj.alloc_name and fobj.alloc_inode: + return True + elif fobj.alloc: + return True + return False + +def is_new_file(fobj): + if not fobj.annos is None: + return "new" in fobj.annos + return None + +def is_mod_file(fobj): + if not fobj.annos is None: + return "modified" in fobj.annos + return None + +def is_new_or_mod_file(fobj): + return is_new_file(fobj) or is_mod_file(fobj) + +def main(): + predicates = { + "all": (lambda x: True), + "allocated": is_allocated, + "new": is_new_file, + "mod": is_mod_file, + "newormod": is_new_or_mod_file + } + if args.predicate is None: + args.predicate = "new" + if args.predicate not in predicates: + raise ValueError("--predicate must be from this list: %r. Received: %r." % (predicates.keys(), args.predicate)) + + if args.xml: + d = Objects.parse(args.xml) + else: + d = Objects.parse(args.disk_image) + write_sector_hashes_to_db(args.disk_image, d, is_allocated, args.db_output, args.pad) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Walks a file system and outputs sector hashes of all files matching a predicate. Can be used as a library for the function write_sector_hashes_to_db.") + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument("-x", "--xml", help="Pre-computed DFXML.") + parser.add_argument("--predicate", help="Condition for selecting files to sector hash. One of 'new', 'allocated', 'all', 'mod'(ified), 'newormod'. Default 'allocated'.") + parser.add_argument("--pad", help="Pad non-full sectors with null bytes.", action="store_true") + parser.add_argument("disk_image") + parser.add_argument("db_output") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + main() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/idifference2.py tcpflow-1.4.5+repack1/src/dfxml/python/idifference2.py --- tcpflow-1.4.4+repack1/src/dfxml/python/idifference2.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/idifference2.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,97 @@ +#/usr/bin/env python3 + +""" +DEVELOPMENT NOTE: This implementation will soon be replaced by what is currently idifference2.py, after a period of testing by users. If idifference2.py does not meet your needs, but idifference.py does, please let one of the maintainers know (email addresses in the Git history or the python/ChangeLog file). +""" + +__version__ = "2.0.0alpha" + +import Objects +import make_differential_dfxml +import summarize_differential_dfxml + +import logging +import os + +_logger = logging.getLogger(os.path.basename(__file__)) + +def ignorable_name(fn): + """Filter out recognized pseudo-file names, accomodating user request for including dotdirs.""" + if fn is None: + return False + if args.include_dotdirs and os.path.basename(fn) in [".", ".."]: + return False + return make_differential_dfxml.ignorable_name(fn) + +if __name__=="__main__": + import argparse + + parser = argparse.ArgumentParser(description='%prog [options] file1 file2 (files can be xml or image files)') + returningsoon = parser.add_argument_group("Returning soon", "Some of the options in idifference were not carried forward in the reimplementation. 
Please feel free to request these features be re-implemented if you need them.") + parser.add_argument("-d","--debug",help="Enable debug printing",action='store_true') + parser.add_argument("-x","--xml",help="Specify output file for DFXML manifest of differences",dest="xmlfilename") + parser.add_argument("--include-dotdirs",help="Include files with names ending in '/.' and '/..'",action="store_true", default=False) + parser.add_argument("--sort-by", help="Sorts reported file lists. Pass one of these arguments: \"times\" or \"paths\".") + parser.add_argument("--summary",help="output summary statistics of file system changes",action="store_true", default=False) + parser.add_argument("--timestamp",help="output all times in Unix timestamp format; otherwise use ISO 8601",action="store_true") + + returningsoon.add_argument("-n","--notimeline",help="do not generate a timeline",action="store_true") + returningsoon.add_argument("-T","--tararchive",help="create tar archive file of new/changed files",dest="tarfile") + returningsoon.add_argument("-Z","--zipfile",help="create ZIP64 archive file of new/changed files",dest="zipfile") + returningsoon.add_argument("--html",help="specify output in HTML",action="store_true") + returningsoon.add_argument("--noatime",help="Do not include atime changes",action="store_true") + returningsoon.add_argument("--imagefile",help="specifies imagefile or file2 is an XML file and you are archiving") + + parser.add_argument("infiles", nargs="+") + + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + if len(args.infiles) != 2: + raise NotImplementedError("Sorry, but this version of idifference can only run on two disk images, not a longer sequence. Please feel free to request longer sequences be re-implemented if you need it.") + + if args.tarfile: + raise NotImplementedError("Sorry, but the tarring argument was not carried forward in the re-implementation. Please feel free to request this feature be re-implemented if you need it.") + + if args.zipfile: + raise NotImplementedError("Sorry, but the zipping argument was not carried forward in the re-implementation. Please feel free to request this feature be re-implemented if you need it.") + #TODO The Extractor program should get a Zip-handling function to handle this flag. + + if args.html: + raise NotImplementedError("Sorry, but the HTML output argument was not carried forward in the re-implementation. Please feel free to request this feature be re-implemented if you need it.") + + if args.noatime: + raise NotImplementedError("Sorry, but the ignore-atime argument was not carried forward in the re-implementation. Please feel free to request this feature be re-implemented if you need it.") + + if args.notimeline: + raise NotImplementedError("Sorry, but the notimeline argument was not carried forward in the re-implementation. Please feel free to request this feature be re-implemented if you need it.") + + if args.imagefile: + raise NotImplementedError("Sorry, but the imagefile argument was not carried forward in the re-implementation. Please feel free to request this feature be re-implemented if you need it.") + + pre = None + post = None + for infile in args.infiles: + pre = post + post = infile + + _logger.info(">>> Reading %s." 
% infile) + + if not pre is None: + diffdfxml = make_differential_dfxml.make_differential_dfxml( + pre, + post, + diff_mode="idifference", + ignore_filename_function=ignorable_name + ) + if args.xmlfilename: + _logger.debug("Opening temp file for writing.") + with open(args.xmlfilename, "w") as fh: + diffdfxml.print_dfxml(output_fh=fh) + summarize_differential_dfxml.report( + diffdfxml, + sort_by=args.sort_by, + summary=args.summary, + timestamp=args.timestamp + ) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/idifference.py tcpflow-1.4.5+repack1/src/dfxml/python/idifference.py --- tcpflow-1.4.4+repack1/src/dfxml/python/idifference.py 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/idifference.py 2015-08-26 03:35:59.000000000 +0000 @@ -1,6 +1,8 @@ #!/usr/bin/env python """idifference. +DEVELOPMENT NOTE: This implementation will soon be replaced by what is currently idifference2.py, after a period of testing by users. If idifference2.py does not meet your needs, but idifference.py does, please let one of the maintainers know (email addresses in the Git history or the python/ChangeLog file). + Generates a report about what's different between two disk images. Process: @@ -18,6 +20,7 @@ import sys,fiwalk,dfxml,time import copy +import logging if sys.version_info < (3,1): raise RuntimeError("idifference.py now requires Python 3.1 or above") @@ -169,6 +172,10 @@ def process_fi(self,fi): global options + # Filter out specific filenames create by TSK that are not of use + if ignore_filename(fi.filename(), self.include_dotdirs): + return + dprint("processing %s" % str(fi)) # See if the filename changed its hash code @@ -176,10 +183,6 @@ if not fi.allocated(): return # only look at allocated files - # Filter out specific filenames create by TSK that are not of use - if ignore_filename(fi.filename(), self.include_dotdirs): - return - # Remember the file for the next generation self.new_fnames[fi.filename()] = fi self.new_inodes[(fi.partition(), fi.inode())] = fi @@ -577,6 +580,8 @@ (options,args) = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if options.debug else logging.INFO) + if len(args)<1: parser.print_help() sys.exit(1) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/make_differential_dfxml.py tcpflow-1.4.5+repack1/src/dfxml/python/make_differential_dfxml.py --- tcpflow-1.4.4+repack1/src/dfxml/python/make_differential_dfxml.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/make_differential_dfxml.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 + +""" +make_differential_dfxml.py + +Takes two DFXML files as input. +Produces a differential DFXML file as output. + +This program's main purpose is matching files correctly. It only performs enough analysis to determine that a fileobject has changed at all. (This is half of the work done by idifference.py.) +""" + +__version__ = "0.10.1" + +import Objects +import logging +import xml.etree.ElementTree as ET +import os +import sys +import collections +import dfxml + +_logger = logging.getLogger(os.path.basename(__file__)) + +def _lower_ftype_str(vo): + """The string labels of file system names might differ by something small like the casing. 
Normalize the labels by lower-casing them.""" + Objects._typecheck(vo, Objects.VolumeObject) + f = vo.ftype_str + if isinstance(f, str): f = f.lower() + return f + +def ignorable_name(fn): + """Filter out recognized pseudo-file names.""" + if fn is None: + return False + return os.path.basename(fn) in [".", "..", "$FAT1", "$FAT2", "$OrphanFiles"] + +def make_differential_dfxml(pre, post, **kwargs): + """ + Takes as input two paths to DFXML files. Returns a DFXMLObject. + @param pre String. + @param post String. + @param diff_mode Optional. One of "all" or "idifference". + @param retain_unchanged Optional. Boolean. + @param ignore_properties Optional. Set. + @param annotate_matches Optional. Boolean. True -> matched file objects get a "delta:matched='1'" attribute. + @param rename_requires_hash Optional. Boolean. True -> all matches require matching SHA-1's, if present. + @param ignore_filename_function Optional. Function, string -> Boolean. Returns True if a file name (which can be null) should be ignored. + @param glom_byte_runs Optional. Boolean. Joins contiguous-region byte runs together in FileObject byte run lists. + """ + + diff_mode = kwargs.get("diff_mode", "all") + retain_unchanged = kwargs.get("retain_unchanged", False) + ignore_properties = kwargs.get("ignore_properties", set()) + annotate_matches = kwargs.get("annotate_matches", False) + rename_requires_hash = kwargs.get("rename_requires_hash", False) + ignore_filename_function = kwargs.get("ignore_filename_function", ignorable_name) + glom_byte_runs = kwargs.get("glom_byte_runs", False) + + _expected_diff_modes = ["all", "idifference"] + if diff_mode not in _expected_diff_modes: + raise ValueError("Differencing mode should be in: %r." % _expected_diff_modes) + diff_mask_set = set() + diff_ignore_set = set() + + if diff_mode == "idifference": + diff_mask_set |= set([ + "atime", + "byte_runs", + "crtime", + "ctime", + "filename", + "filesize", + "md5", + "mtime", + "sha1" + ]) + diff_ignore_set |= ignore_properties + _logger.debug("diff_mask_set = " + repr(diff_mask_set)) + _logger.debug("diff_ignore_set = " + repr(diff_ignore_set)) + + + #d: The container DFXMLObject, ultimately returned. + d = Objects.DFXMLObject(version="1.1.0") + d.command_line = " ".join(sys.argv) + d.add_namespace("delta", dfxml.XMLNS_DELTA) + d.dc["type"] = "Disk image difference set" + + #The list most of this function is spent on building + fileobjects_changed = [] + + #Unmodified files; only retained if requested. 
+ fileobjects_unchanged = [] + + #Key: (partition, inode, filename); value: FileObject + old_fis = None + new_fis = None + + #Key: (partition, inode, filename); value: FileObject list + old_fis_unalloc = None + new_fis_unalloc = None + + #Key: Partition byte offset within the disk image, paired with the file system type + #Value: VolumeObject + old_volumes = None + new_volumes = None + matched_volumes = dict() + + #Populated in distinct (offset, file system type as string) encounter order + volumes_encounter_order = dict() + + for infile in [pre, post]: + + _logger.debug("infile = %r" % infile) + old_fis = new_fis + new_fis = dict() + + old_volumes = new_volumes + new_volumes = dict() + #Fold in the matched volumes - we're just discarding the deleted volumes + for k in matched_volumes: + old_volumes[k] = matched_volumes[k] + matched_volumes = dict() + + old_fis_unalloc = new_fis_unalloc + new_fis_unalloc = collections.defaultdict(list) + + d.sources.append(infile) + + for (i, (event, new_obj)) in enumerate(Objects.iterparse(infile)): + if isinstance(new_obj, Objects.DFXMLObject): + #Inherit desired properties from the source DFXMLObject. + + #Inherit namespaces + for (prefix, url) in new_obj.iter_namespaces(): + d.add_namespace(prefix, url) + + continue + elif isinstance(new_obj, Objects.VolumeObject): + if event == "end": + #This algorithm doesn't yet need to know when a volume is concluded. On to the next object. + continue + + offset = new_obj.partition_offset + if offset is None: + raise AttributeError("To perform differencing with volumes, the elements must have a . Either re-generate your DFXML with partition offsets, or run this program again with the --ignore-volumes flag.") + + #Use the lower-case volume spelling + ftype_str = _lower_ftype_str(new_obj) + + #Re-capping the general differential analysis algorithm: + #0. If the volume is in the new list, something's gone wrong. + if (offset, ftype_str) in new_volumes: + _logger.debug("new_obj.partition_offset = %r." % offset) + _logger.warning("Encountered a volume that starts at an offset as another volume, in the same disk image. This analysis is based on the assumption that that doesn't happen. Check results that depend on partition mappings.") + + #1. If the volume is in the old list, pop it out of the old list - it's matched. + if old_volumes and (offset, ftype_str) in old_volumes: + _logger.debug("Found a volume in post image, at offset %r." % offset) + old_obj = old_volumes.pop((offset, ftype_str)) + new_obj.original_volume = old_obj + new_obj.compare_to_original() + matched_volumes[(offset, ftype_str)] = new_obj + + #2. If the volume is NOT in the old list, add it to the new list. + else: + _logger.debug("Found a new volume, at offset %r." % offset) + new_volumes[(offset, ftype_str)] = new_obj + volumes_encounter_order[(offset, ftype_str)] = len(new_volumes) + ((old_volumes and len(old_volumes)) or 0) + len(matched_volumes) + + #3. Afterwards, the old list contains deleted volumes. + + #Record the ID + new_obj.id = volumes_encounter_order[(offset, ftype_str)] + + #Move on to the next object + continue + elif not isinstance(new_obj, Objects.FileObject): + #The rest of this loop compares only file objects. 
+ continue + + if ignorable_name(new_obj.filename): + continue + + #Simplify byte runs if requested + if glom_byte_runs: + if new_obj.byte_runs: + temp_byte_runs = Objects.ByteRuns() + for run in new_obj.byte_runs: + temp_byte_runs.glom(run) + new_obj.byte_runs = temp_byte_runs + + #Normalize the partition number + if new_obj.volume_object is None: + new_obj.partition = None + else: + vo = new_obj.volume_object + fts = _lower_ftype_str(vo) + new_obj.partition = volumes_encounter_order[(vo.partition_offset, fts)] + + #Define the identity key of this file -- affected by the --ignore argument + _key_partition = None if "partition" in ignore_properties else new_obj.partition + _key_inode = None if "inode" in ignore_properties else new_obj.inode + _key_filename = None if "filename" in ignore_properties else new_obj.filename + key = (_key_partition, _key_inode, _key_filename) + + #Ignore unallocated content comparisons until a later loop. The unique identification of deleted files needs a little more to work. + if not new_obj.alloc: + new_fis_unalloc[key].append(new_obj) + continue + + #The rest of this loop is irrelevant until the second file. + if old_fis is None: + new_fis[key] = new_obj + continue + + + if key in old_fis: + #Extract the old fileobject and check for changes + old_obj = old_fis.pop(key) + new_obj.original_fileobject = old_obj + new_obj.compare_to_original() + + #_logger.debug("Diffs: %r." % _diffs) + _diffs = new_obj.diffs - diff_ignore_set + #_logger.debug("Diffs after ignore-set: %r." % _diffs) + if diff_mask_set: + _diffs &= diff_mask_set + #_logger.debug("Diffs after mask-set: %r." % _diffs) + + if len(_diffs) > 0: + #_logger.debug("Remaining diffs: " + repr(_diffs)) + fileobjects_changed.append(new_obj) + else: + #Unmodified file; only keep if requested. + if retain_unchanged: + fileobjects_unchanged.append(new_obj) + else: + #Store the new object + new_fis[key] = new_obj + + #The rest of the files loop is irrelevant until the second file. + if old_fis is None: + continue + + + _logger.debug("len(old_fis) = %d" % len(old_fis)) + _logger.debug("len(new_fis) = %d" % len(new_fis)) + _logger.debug("len(fileobjects_changed) = %d" % len(fileobjects_changed)) + + #Identify renames - only possible if 1-to-1. Many-to-many renames are just left as new and deleted files. 
+ _logger.debug("Detecting renames...") + fileobjects_renamed = [] + def _make_name_map(d): + """Returns a dictionary, mapping (partition, inode) -> {filename}.""" + retdict = collections.defaultdict(lambda: set()) + for (partition, inode, filename) in d.keys(): + retdict[(partition, inode)].add(filename) + return retdict + old_inode_names = _make_name_map(old_fis) + new_inode_names = _make_name_map(new_fis) + for key in new_inode_names.keys(): + (partition, inode) = key + + if len(new_inode_names[key]) != 1: + continue + if not key in old_inode_names: + continue + if len(old_inode_names[key]) != 1: + continue + if rename_requires_hash: + #Peek at the set elements by doing a quite-ephemeral list cast + old_obj = old_fis[(partition, inode, list(old_inode_names[key])[0])] + new_obj = new_fis[(partition, inode, list(new_inode_names[key])[0])] + if old_obj.sha1 != new_obj.sha1: + continue + + #Found a match if we're at this point in the loop + old_name = old_inode_names[key].pop() + new_name = new_inode_names[key].pop() + old_obj = old_fis.pop((partition, inode, old_name)) + new_obj = new_fis.pop((partition, inode, new_name)) + new_obj.original_fileobject = old_obj + new_obj.compare_to_original() + fileobjects_renamed.append(new_obj) + _logger.debug("len(old_fis) -> %d" % len(old_fis)) + _logger.debug("len(new_fis) -> %d" % len(new_fis)) + _logger.debug("len(fileobjects_changed) -> %d" % len(fileobjects_changed)) + _logger.debug("len(fileobjects_renamed) = %d" % len(fileobjects_renamed)) + + #Identify files that just changed inode number - basically, doing the rename detection again + _logger.debug("Detecting inode number changes...") + def _make_inode_map(d): + """Returns a dictionary, mapping (partition, filename) -> inode.""" + retdict = dict() + for (partition, inode, filename) in d.keys(): + if (partition, filename) in retdict: + _logger.warning("Multiple instances of the file path %r were found in partition %r; this violates an assumption of this program, that paths are unique within partitions." % (filename, partition)) + retdict[(partition, filename)] = inode + return retdict + old_name_inodes = _make_inode_map(old_fis) + new_name_inodes = _make_inode_map(new_fis) + for key in new_name_inodes.keys(): + if not key in old_name_inodes: + continue + (partition, name) = key + old_obj = old_fis.pop((partition, old_name_inodes[key], name)) + new_obj = new_fis.pop((partition, new_name_inodes[key], name)) + new_obj.original_fileobject = old_obj + new_obj.compare_to_original() + fileobjects_changed.append(new_obj) + _logger.debug("len(old_fis) -> %d" % len(old_fis)) + _logger.debug("len(new_fis) -> %d" % len(new_fis)) + _logger.debug("len(fileobjects_changed) -> %d" % len(fileobjects_changed)) + #And that's the end of the allocated-only, per-volume analysis. + + #We may be able to match files that aren't allocated against files we think are deleted + _logger.debug("Detecting modifications from unallocated files...") + fileobjects_deleted = [] + for key in new_fis_unalloc: + #1 partition; 1 inode number; 1 name, repeated: Too ambiguous to compare. + if len(new_fis_unalloc[key]) != 1: + continue + + if key in old_fis_unalloc: + if len(old_fis_unalloc[key]) == 1: + #The file was unallocated in the previous image, too. + old_obj = old_fis_unalloc[key].pop() + new_obj = new_fis_unalloc[key].pop() + new_obj.original_fileobject = old_obj + new_obj.compare_to_original() + #The file might not have changed. It's interesting if it did, though. 
+ + _diffs = new_obj.diffs - diff_mask_set + #_logger.debug("Diffs: %r." % _diffs) + _diffs = new_obj.diffs - diff_ignore_set + #_logger.debug("Diffs after ignore-set: %r." % _diffs) + if diff_mask_set: + _diffs &= diff_mask_set + #_logger.debug("Diffs after mask-set: %r." % _diffs) + if len(_diffs) > 0: + _logger.debug("Remaining diffs: " + repr(_diffs)) + fileobjects_changed.append(new_obj) + elif retain_unchanged: + fileobjects_unchanged.append(new_obj) + elif key in old_fis: + #Identified a deletion. + old_obj = old_fis.pop(key) + new_obj = new_fis_unalloc[key].pop() + new_obj.original_fileobject = old_obj + new_obj.compare_to_original() + fileobjects_deleted.append(new_obj) + _logger.debug("len(old_fis) -> %d" % len(old_fis)) + _logger.debug("len(new_fis) -> %d" % len(new_fis)) + _logger.debug("len(fileobjects_changed) -> %d" % len(fileobjects_changed)) + _logger.debug("len(fileobjects_deleted) -> %d" % len(fileobjects_deleted)) + + #After deletion matching is performed, one might want to look for files migrating to other partitions. + #However, since between-volume migration creates a new deleted file, this algorithm instead ignores partition migrations. + #AJN TODO Thinking about it a little more, I can't suss out a reason against trying this match. It's complicated if we try looking for reallocations in new_fis, strictly from new_fis_unalloc. + + #TODO We might also want to match the unallocated objects based on metadata addresses. Unfortunately, that requires implementation of additional byte runs, which hasn't been fully designed yet in the DFXML schema. + + #Begin output. + #First, annotate the volume objects. + for key in new_volumes: + v = new_volumes[key] + v.annos.add("new") + for key in old_volumes: + v = old_volumes[key] + v.annos.add("deleted") + for key in matched_volumes: + v = matched_volumes[key] + if len(v.diffs) > 0: + v.annos.add("modified") + + #Build list of FileObject appenders, child volumes of the DFXML Document. + #Key: Partition number, or None + #Value: Reference to the VolumeObject corresponding with that partition number. None -> the DFXMLObject. + appenders = dict() + for volume_dict in [new_volumes, matched_volumes, old_volumes]: + for (offset, ftype_str) in volume_dict: + if (offset, ftype_str) in appenders: + raise ValueError("This pair is already in the appenders dictionary, which was supposed to be distinct: " + repr((offset, ftype_str)) + ".") + v = volume_dict[(offset, ftype_str)] + appenders[volumes_encounter_order[(offset, ftype_str)]] = v + d.append(v) + + #Add in the default appender, the DFXML Document itself. + appenders[None] = d + + content_diffs = set(["md5", "sha1", "mtime"]) + + def _maybe_match_attr(obj): + """Just adds the 'matched' annotation when called.""" + if annotate_matches: + obj.annos.add("matched") + + #Populate DFXMLObject. + for key in new_fis: + #TODO If this script ever does a series of >2 DFXML files, these diff additions need to be removed for the next round. 
+ fi = new_fis[key] + fi.annos.add("new") + appenders[fi.partition].append(fi) + for key in new_fis_unalloc: + for fi in new_fis_unalloc[key]: + fi.annos.add("new") + appenders[fi.partition].append(fi) + for fi in fileobjects_deleted: + #Independently flag for name, content, and metadata modifications + if len(fi.diffs - content_diffs) > 0: + fi.annos.add("changed") + if len(content_diffs.intersection(fi.diffs)) > 0: + fi.annos.add("modified") + if "filename" in fi.diffs: + fi.annos.add("renamed") + fi.annos.add("deleted") + _maybe_match_attr(fi) + appenders[fi.partition].append(fi) + for key in old_fis: + ofi = old_fis[key] + nfi = Objects.FileObject() + nfi.original_fileobject = ofi + nfi.annos.add("deleted") + appenders[ofi.partition].append(nfi) + for key in old_fis_unalloc: + for ofi in old_fis_unalloc[key]: + nfi = Objects.FileObject() + nfi.original_fileobject = ofi + nfi.annos.add("deleted") + appenders[ofi.partition].append(nfi) + for fi in fileobjects_renamed: + #Independently flag for content and metadata modifications + if len(content_diffs.intersection(fi.diffs)) > 0: + fi.annos.add("modified") + if len(fi.diffs - content_diffs) > 0: + fi.annos.add("changed") + fi.annos.add("renamed") + _maybe_match_attr(fi) + appenders[fi.partition].append(fi) + for fi in fileobjects_changed: + #Independently flag for content and metadata modifications + if len(content_diffs.intersection(fi.diffs)) > 0: + fi.annos.add("modified") + if len(fi.diffs - content_diffs) > 0: + fi.annos.add("changed") + _maybe_match_attr(fi) + appenders[fi.partition].append(fi) + for fi in fileobjects_unchanged: + _maybe_match_attr(fi) + appenders[fi.partition].append(fi) + + #Output + return d + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument("--idifference-diffs", action="store_true", help="Only consider the modifications idifference had considered (names, hashes, timestamps).") + parser.add_argument("-i", "--ignore", action="append", help="Object property to ignore in all difference operations. E.g. pass '-i inode' to ignore inode differences when comparing directory trees on the same file system.") + parser.add_argument("--rename-with-hash", action="store_true", help="Require that renamed files must match on a content hash.") + parser.add_argument("--retain-unchanged", action="store_true", help="Output unchanged files in the resulting DFXML file.", default=False) + parser.add_argument("--annotate-matches", action="store_true", help="Add a 'dfxml:matched' Boolean attribute to every produced object. Useful for some counting purposes, but not always needed.", default=False) + parser.add_argument("--simplify-byte-runs", action="store_true", help="Join contiguous byte run elements together, if their attributes align.", default=False) + parser.add_argument("infiles", nargs="+") + args = parser.parse_args() + + #TODO Add --vignore to ignore volume properties, like ftype_str to compare only file system offsets for partitions + #TODO Switch --ignore to --fignore + #TODO Add --ignore-volumes. It should (probably) strip all volume information from each file. 
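(A minimal sketch of driving this entry point from Python rather than the command line; the module name and input filenames here are illustrative assumptions, while the keyword arguments mirror the call wired up below:)

    import make_differential_dfxml as mdd  # hypothetical module name for the script above

    d = mdd.make_differential_dfxml(
        "before.dfxml",   # placeholder input
        "after.dfxml",    # placeholder input
        diff_mode="all",
        retain_unchanged=False,
        ignore_properties=set(),
        annotate_matches=False,
        rename_requires_hash=False
    )
    print(d.to_dfxml())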
+ + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + if len(args.infiles) != 2: + raise ValueError("This script requires exactly two DFXML files as input.") + + pre = None + post = None + + if len(args.infiles) > 2: + raise NotImplementedError("This program only analyzes two files at the moment.") + + ignore_properties = set() + if not args.ignore is None: + for i in args.ignore: + ignore_properties.add(i) + + for infile in args.infiles: + pre = post + post = infile + if not pre is None: + print(make_differential_dfxml( + pre, + post, + diff_mode="idifference" if args.idifference_diffs else "all", + retain_unchanged=args.retain_unchanged, + ignore_properties=ignore_properties, + annotate_matches=args.annotate_matches, + rename_requires_hash=args.rename_with_hash + ).to_dfxml()) + diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/Makefile tcpflow-1.4.5+repack1/src/dfxml/python/Makefile --- tcpflow-1.4.4+repack1/src/dfxml/python/Makefile 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/Makefile 2015-08-26 03:35:59.000000000 +0000 @@ -3,8 +3,17 @@ check-cat_fileobjects \ check-dfxml_tool \ check-idifference-dfxml \ - check-mac_timelines + check-mac_timelines \ + check-Objects + @echo "" @echo "Tests passed!" + @echo "Clean up the test results with 'make clean'." + +doc: \ + Objects.html + +Objects.html: Objects.py + python3.3 -m pydoc -w Objects check-dfxml: ./test_dfxml.sh @@ -21,9 +30,15 @@ check-mac_timelines: ./test_mac_timelines.sh -clean: +check-Objects: + $(MAKE) -C test_Objects + +clean: clean-Objects rm -f dfxml_tool_*xml rm -f cat_test_*.dfxml rm -f idifference_test.txt rm -f idifference_test*.dfxml rm -f demo_mac_timeline*.txt + +clean-Objects: + $(MAKE) -C test_Objects clean diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/Objects.py tcpflow-1.4.5+repack1/src/dfxml/python/Objects.py --- tcpflow-1.4.4+repack1/src/dfxml/python/Objects.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/Objects.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,2638 @@ + +""" +This file re-creates the major DFXML classes with an emphasis on type safety, serializability, and de-serializability. + +With this module, reading disk images or DFXML files is done with the parse or iterparse functions. Writing DFXML files can be done with the DFXMLObject.print_dfxml function. +""" + +__version__ = "0.2.2" + +#Remaining roadmap to 1.0.0: +# * Documentation. +# * User testing. +# * Compatibility with the DFXML schema, version >1.1.0. + +import logging +import re +import copy +import xml.etree.ElementTree as ET +import subprocess +import dfxml +import os +import sys + +_logger = logging.getLogger(os.path.basename(__file__)) + +#Contains: (namespace, local name) qualified XML element name pairs +_warned_elements = set([]) +_warned_byterun_attribs = set([]) + +#Contains: Unexpected 'facet' values on byte_runs elements. +_warned_byterun_facets = set([]) + +#Issue some log statements only once per program invocation. +_nagged_alloc = False +_warned_byterun_badtypecomp = False + +def _ET_tostring(e): + """Between Python 2 and 3, there are some differences in the ElementTree library's tostring() behavior. One, the method balks at the "unicode" encoding in 2. Two, in 2, the XML prototype's output with every invocation. 
This method serves as a wrapper to deal with those issues."""
+    if sys.version_info[0] < 3:
+        tmp = ET.tostring(e, encoding="UTF-8")
+        if tmp[0:2] == "<?":
+            return tmp[ tmp.find("?>\n")+3 : ]
+        else:
+            return tmp
+    else:
+        return ET.tostring(e, encoding="unicode")
+
+def _boolcast(val):
+    """Takes Boolean values, and 0 or 1 in string or integer form, and casts them all to Boolean. Preserves nulls. Balks at everything else."""
+    if val is None:
+        return None
+    if val in [True, False]:
+        return val
+
+    _val = val
+    if val in ["0", "1"]:
+        _val = int(val)
+    if _val in [0, 1]:
+        return _val == 1
+
+    _logger.debug("val = " + repr(val))
+    raise ValueError("Received a not-straightforwardly-Boolean value. Expected some form of 0, 1, True, or False.")
+
+def _bytecast(val):
+    """Casts a value as a byte string. If a character string, assumes a UTF-8 encoding."""
+    if val is None:
+        return None
+    if isinstance(val, bytes):
+        return val
+    return _strcast(val).encode("utf-8")
+
+def _intcast(val):
+    """Casts input integer or string to integer. Preserves nulls. Balks at everything else."""
+    if val is None:
+        return None
+    if isinstance(val, int):
+        return val
+
+    if isinstance(val, str):
+        if val[0] == "-":
+            if val[1:].isdigit():
+                return int(val)
+        else:
+            if val.isdigit():
+                return int(val)
+
+    _logger.debug("val = " + repr(val))
+    raise ValueError("Received a non-int-castable value. Expected an integer or an integer as a string.")
+
+def _read_differential_annotations(annodict, element, annoset):
+    """
+    Uses the shorthand-to-attribute mappings of annodict to translate attributes of element into annoset.
+    """
+    #_logger.debug("annoset, before: %r." % annoset)
+    #Start with inverting the dictionary
+    _d = { annodict[k].replace("delta:",""):k for k in annodict }
+    #_logger.debug("Inverted dictionary: _d = %r" % _d)
+    for attr in element.attrib:
+        #_logger.debug("Looking for differential annotations: %r" % element.attrib)
+        (ns, an) = _qsplit(attr)
+        if an in _d and ns == dfxml.XMLNS_DELTA:
+            #_logger.debug("Found; adding %r." % _d[an])
+            annoset.add(_d[an])
+    #_logger.debug("annoset, after: %r." % annoset)
+
+def _qsplit(tagname):
+    """Requires string input. Returns namespace and local tag name as a pair. I could've sworn this was a basic implementation gimme, but ET.QName ain't it."""
+    _typecheck(tagname, str)
+    if tagname[0] == "{":
+        i = tagname.rfind("}")
+        return ( tagname[1:i], tagname[i+1:] )
+    else:
+        return (None, tagname)
+
+def _strcast(val):
+    if val is None:
+        return None
+    return str(val)
+
+def _typecheck(obj, classinfo):
+    if not isinstance(obj, classinfo):
+        _logger.info("obj = " + repr(obj))
+        if isinstance(classinfo, tuple):
+            raise TypeError("Expecting object to be one of the types %r." % (classinfo,))
+        else:
+            raise TypeError("Expecting object to be of type %r."
% classinfo)
+
+class DFXMLObject(object):
+    def __init__(self, *args, **kwargs):
+        self.command_line = kwargs.get("command_line")
+        self.version = kwargs.get("version")
+        self.sources = kwargs.get("sources", [])
+        self.dc = kwargs.get("dc", dict())
+
+        self._namespaces = dict()
+        self._volumes = []
+        self._files = []
+
+        input_volumes = kwargs.get("volumes") or []
+        input_files = kwargs.get("files") or []
+        for v in input_volumes:
+            self.append(v)
+        for f in input_files:
+            self.append(f)
+
+        #Add default namespaces
+        self.add_namespace("", dfxml.XMLNS_DFXML)
+        self.add_namespace("dc", dfxml.XMLNS_DC)
+
+    def __iter__(self):
+        """Yields all VolumeObjects, recursively their FileObjects, and the FileObjects directly attached to this DFXMLObject, in that order."""
+        for v in self._volumes:
+            yield v
+            for f in v:
+                yield f
+        for f in self._files:
+            yield f
+
+    def add_namespace(self, prefix, url):
+        self._namespaces[prefix] = url
+        ET.register_namespace(prefix, url)
+
+    def append(self, value):
+        if isinstance(value, VolumeObject):
+            self._volumes.append(value)
+        elif isinstance(value, FileObject):
+            self._files.append(value)
+        else:
+            _logger.debug("value = %r" % value)
+            raise TypeError("Expecting a VolumeObject or a FileObject. Got instead this type: %r." % type(value))
+
+    def iter_namespaces(self):
+        """Yields (prefix, url) pairs of each namespace registered in this DFXMLObject."""
+        for prefix in self._namespaces:
+            yield (prefix, self._namespaces[prefix])
+
+    def populate_from_Element(self, e):
+        if "version" in e.attrib:
+            self.version = e.attrib["version"]
+
+        for elem in e.findall(".//*"):
+            (ns, ln) = _qsplit(elem.tag)
+            if ln == "command_line":
+                self.command_line = elem.text
+            elif ln == "image_filename":
+                self.sources.append(elem.text)
+
+    def print_dfxml(self, output_fh=sys.stdout):
+        """Memory-efficient DFXML document printer. However, it assumes the whole element tree is already constructed."""
+        pe = self.to_partial_Element()
+        dfxml_wrapper = _ET_tostring(pe)
+        dfxml_foot = "</dfxml>"
+        #Check for an empty element
+        if dfxml_wrapper.strip()[-3:] == " />":
+            dfxml_head = dfxml_wrapper.strip()[:-3] + ">"
+        elif dfxml_wrapper.strip()[-2:] == "/>":
+            dfxml_head = dfxml_wrapper.strip()[:-2] + ">"
+        else:
+            dfxml_head = dfxml_wrapper.strip()[:-len(dfxml_foot)]
+
+        output_fh.write("""<?xml version="1.0"?>\n""")
+        output_fh.write(dfxml_head)
+        output_fh.write("\n")
+        _logger.debug("Writing %d volume objects." % len(self._volumes))
+        for v in self._volumes:
+            v.print_dfxml(output_fh)
+            output_fh.write("\n")
+        _logger.debug("Writing %d file objects." % len(self._files))
+        for f in self._files:
+            e = f.to_Element()
+            output_fh.write(_ET_tostring(e))
+            output_fh.write("\n")
+        output_fh.write(dfxml_foot)
+        output_fh.write("\n")
+
+    def to_Element(self):
+        outel = self.to_partial_Element()
+        for v in self._volumes:
+            tmpel = v.to_Element()
+            outel.append(tmpel)
+        for f in self._files:
+            tmpel = f.to_Element()
+            outel.append(tmpel)
+        return outel
+
+    def to_dfxml(self):
+        """Serializes the entire DFXML document tree into a string. Then returns that string. RAM-intensive. Most will want to use print_dfxml() instead"""
+        return _ET_tostring(self.to_Element())
+
+    def to_partial_Element(self):
+        outel = ET.Element("dfxml")
+
+        tmpel0 = ET.Element("metadata")
+        for key in sorted(self.dc):
+            _typecheck(key, str)
+            if ":" in key:
+                raise ValueError("Dublin Core key-value entries should have keys without the colon character.
If this causes an interesting namespace issue for you, please report it as a bug.") + tmpel1 = ET.Element("dc:" + key) + tmpel1.text = self.dc[key] + tmpel0.append(tmpel1) + outel.append(tmpel0) + + if self.command_line: + tmpel0 = ET.Element("creator") + tmpel1 = ET.Element("execution_environment") + tmpel2 = ET.Element("command_line") + tmpel2.text = self.command_line + tmpel1.append(tmpel2) + tmpel0.append(tmpel1) + outel.append(tmpel0) + + if len(self.sources) > 0: + tmpel0 = ET.Element("source") + for source in self.sources: + tmpel1 = ET.Element("image_filename") + tmpel1.text = source + tmpel0.append(tmpel1) + outel.append(tmpel0) + + if self.version: + outel.attrib["version"] = self.version + + #Apparently, namespace setting is only available with the write() function, which is memory-impractical for significant uses of DFXML. + #Ref: http://docs.python.org/3.3/library/xml.etree.elementtree.html#xml.etree.ElementTree.ElementTree.write + for prefix in self._namespaces: + attrib_name = "xmlns" + if prefix != "": + attrib_name += ":" + prefix + outel.attrib[attrib_name] = self._namespaces[prefix] + + return outel + + @property + def command_line(self): + return self._command_line + + @command_line.setter + def command_line(self, value): + self._command_line = _strcast(value) + + @property + def dc(self): + """The Dublin Core dictionary of key-value pairs for this document. Typically, "type" is "Hash List", or "Disk Image". Keys should be strings not containing colons, values should be strings. If this causes an issue for you, please report it as a bug.""" + return self._dc + + @dc.setter + def dc(self, value): + _typecheck(value, dict) + self._dc = value + + @property + def files(self): + """List of file objects directly attached to this DFXMLObject. No setter for now.""" + return self._files + + @property + def namespaces(self): + raise AttributeError("The namespaces dictionary should not be directly accessed; instead, use .iter_namespaces().") + + @property + def sources(self): + return self._sources + + @sources.setter + def sources(self, value): + if not value is None: + _typecheck(value, list) + self._sources = value + + @property + def version(self): + return self._version + + @version.setter + def version(self, value): + self._version = _strcast(value) + + @property + def volumes(self): + """List of volume objects directly attached to this DFXMLObject. No setter for now.""" + return self._volumes + + +class RegXMLObject(object): + def __init__(self, *args, **kwargs): + self.metadata = kwargs.get("metadata") + self.creator = kwargs.get("creator") + self.source = kwargs.get("source") + self.version = kwargs.get("version") + self._hives = [] + self._cells = [] + self._namespaces = dict() + input_hives = kwargs.get("hives") or [] # In case kwargs["hives"] = None. + input_cells = kwargs.get("cells") or [] + for hive in input_hives: + self.append(hive) + for cell in input_cells: + self.append(cells) + + def __iter__(self): + """Yields all HiveObjects, recursively their CellObjects, and the CellObjects directly attached to this RegXMLObject, in that order.""" + for h in self._hives: + yield h + for c in h: + yield c + for c in self._cells: + yield c + + def append(self, value): + if isinstance(value, HiveObject): + self._hives.append(value) + elif isinstance(value, CellObject): + self._cells.append(value) + else: + _logger.debug("value = %r" % value) + raise TypeError("Expecting a HiveObject or a CellObject. Got instead this type: %r." 
% type(value))
+
+    def print_regxml(self, output_fh=sys.stdout):
+        """Serializes and prints the entire object, without constructing the whole tree."""
+        regxml_wrapper = _ET_tostring(self.to_partial_Element())
+        #_logger.debug("regxml_wrapper = %r." % regxml_wrapper)
+        regxml_foot = "</regxml>"
+        #Check for an empty element
+        if regxml_wrapper.strip()[-3:] == " />":
+            regxml_head = regxml_wrapper.strip()[:-3] + ">"
+        elif regxml_wrapper.strip()[-2:] == "/>":
+            regxml_head = regxml_wrapper.strip()[:-2] + ">"
+        else:
+            regxml_head = regxml_wrapper.strip()[:-len(regxml_foot)]
+
+        output_fh.write(regxml_head)
+        output_fh.write("\n")
+        for hive in self._hives:
+            hive.print_regxml(output_fh)
+        output_fh.write(regxml_foot)
+        output_fh.write("\n")
+
+    def to_Element(self):
+        outel = self.to_partial_Element()
+
+        for hive in self._hives:
+            tmpel = hive.to_Element()
+            outel.append(tmpel)
+
+        for cell in self._cells:
+            tmpel = cell.to_Element()
+            outel.append(tmpel)
+
+        return outel
+
+    def to_partial_Element(self):
+        """
+        Creates the wrapping RegXML element. No hives, no cells. Saves on creating an entire Element tree in memory.
+        """
+        outel = ET.Element("regxml")
+
+        if self.version:
+            outel.attrib["version"] = self.version
+
+        return outel
+
+    def to_regxml(self):
+        """Serializes the entire RegXML document tree into a string. Returns that string. RAM-intensive. Most will want to use print_regxml() instead."""
+        return _ET_tostring(self.to_Element())
+
+
+class VolumeObject(object):
+
+    _all_properties = set([
+        "annos",
+        "allocated_only",
+        "block_count",
+        "block_size",
+        "byte_runs",
+        "first_block",
+        "ftype",
+        "ftype_str",
+        "last_block",
+        "partition_offset",
+        "original_volume",
+        "sector_size"
+    ])
+
+    _diff_attr_names = {
+        "new":"delta:new_volume",
+        "deleted":"delta:deleted_volume",
+        "modified":"delta:modified_volume",
+        "matched":"delta:matched"
+    }
+
+    #TODO There may be need in the future to compare the annotations as well. It complicates make_differential_dfxml too much for now.
+    _incomparable_properties = set([
+        "annos"
+    ])
+
+    def __init__(self, *args, **kwargs):
+        self._files = []
+        self._annos = set()
+        self._diffs = set()
+
+        for prop in VolumeObject._all_properties:
+            if prop in ["annos", "files"]:
+                continue
+            setattr(self, prop, kwargs.get(prop))
+
+    def __iter__(self):
+        """Yields all FileObjects directly attached to this VolumeObject."""
+        for f in self._files:
+            yield f
+
+    def __repr__(self):
+        parts = []
+        for prop in VolumeObject._all_properties:
+            #Skip outputting the files list.
+            if prop == "files":
+                continue
+            val = getattr(self, prop)
+            if not val is None:
+                parts.append("%s=%r" % (prop, val))
+        return "VolumeObject(" + ", ".join(parts) + ")"
+
+    def append(self, value):
+        _typecheck(value, FileObject)
+        self._files.append(value)
+
+    def compare_to_original(self):
+        self._diffs = self.compare_to_other(self.original_volume, True)
+
+    def compare_to_other(self, other, ignore_original=False):
+        """Returns a set of all the properties found to differ."""
+        _typecheck(other, VolumeObject)
+        diffs = set()
+        for prop in VolumeObject._all_properties:
+            if prop in VolumeObject._incomparable_properties:
+                continue
+            if ignore_original and prop == "original_volume":
+                continue
+
+            #_logger.debug("getattr(self, %r) = %r" % (prop, getattr(self, prop)))
+            #_logger.debug("getattr(other, %r) = %r" % (prop, getattr(other, prop)))
+
+            #Allow file system type to be case-insensitive
+            if prop == "ftype_str":
+                o = getattr(other, prop)
+                if o: o = o.lower()
+                s = getattr(self, prop)
+                if s: s = s.lower()
+                if s != o:
+                    diffs.add(prop)
+            else:
+                if getattr(self, prop) != getattr(other, prop):
+                    diffs.add(prop)
+        return diffs
+
+    def populate_from_Element(self, e):
+        global _warned_elements
+        _typecheck(e, (ET.Element, ET.ElementTree))
+        #_logger.debug("e = %r" % e)
+
+        #Read differential annotations
+        _read_differential_annotations(VolumeObject._diff_attr_names, e, self.annos)
+
+        #Split into namespace and tagname
+        (ns, tn) = _qsplit(e.tag)
+        assert tn in ["volume", "original_volume"]
+
+        #Look through direct-child elements to populate run array
+        for ce in e.findall("./*"):
+            #_logger.debug("ce = %r" % ce)
+            (cns, ctn) = _qsplit(ce.tag)
+            #_logger.debug("cns = %r" % cns)
+            #_logger.debug("ctn = %r" % ctn)
+            if ctn == "byte_runs":
+                self.byte_runs = ByteRuns()
+                self.byte_runs.populate_from_Element(ce)
+            elif ctn == "original_volume":
+                self.original_volume = VolumeObject()
+                self.original_volume.populate_from_Element(ce)
+            elif ctn in VolumeObject._all_properties:
+                #_logger.debug("ce.text = %r" % ce.text)
+                setattr(self, ctn, ce.text)
+                #_logger.debug("getattr(self, %r) = %r" % (ctn, getattr(self, ctn)))
+            else:
+                if (cns, ctn) not in _warned_elements:
+                    _warned_elements.add((cns, ctn))
+                    _logger.warning("Unsure what to do with this element in a VolumeObject: %r" % ce)
+
+    def print_dfxml(self, output_fh=sys.stdout):
+        pe = self.to_partial_Element()
+        dfxml_wrapper = _ET_tostring(pe)
+
+        if len(pe) == 0 and len(self._files) == 0:
+            output_fh.write(dfxml_wrapper)
+            return
+
+        dfxml_foot = "</volume>"
+
+        #Deal with an empty element being printed as <volume/>
+        if len(pe) == 0:
+            replaced_dfxml_wrapper = dfxml_wrapper.replace(" />", ">")
+            dfxml_head = replaced_dfxml_wrapper
+        else:
+            dfxml_head = dfxml_wrapper.strip()[:-len(dfxml_foot)]
+
+        output_fh.write(dfxml_head)
+        output_fh.write("\n")
+        _logger.debug("Writing %d file objects for this volume." % len(self._files))
+        for f in self._files:
+            e = f.to_Element()
+            output_fh.write(_ET_tostring(e))
+            output_fh.write("\n")
+        output_fh.write(dfxml_foot)
+        output_fh.write("\n")
+
+    def to_Element(self):
+        outel = self.to_partial_Element()
+        for f in self._files:
+            tmpel = f.to_Element()
+            outel.append(tmpel)
+        return outel
+
+    def to_partial_Element(self):
+        """Returns the volume element with its properties, except for the child fileobjects.
Properties are appended in DFXML schema order.""" + outel = ET.Element("volume") + + annos_whittle_set = copy.deepcopy(self.annos) + diffs_whittle_set = copy.deepcopy(self.diffs) + + #Add differential annotations + for annodiff in VolumeObject._diff_attr_names: + if annodiff in annos_whittle_set: + outel.attrib[VolumeObject._diff_attr_names[annodiff]] = "1" + annos_whittle_set.remove(annodiff) + if len(annos_whittle_set) > 0: + _logger.warning("Failed to export some differential annotations: %r." % annos_whittle_set) + + if self.byte_runs: + outel.append(self.byte_runs.to_Element()) + + def _append_el(prop, value): + tmpel = ET.Element(prop) + _keep = False + if not value is None: + tmpel.text = str(value) + _keep = True + if prop in self.diffs: + tmpel.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(prop) + _keep = True + if _keep: + outel.append(tmpel) + + def _append_str(prop): + value = getattr(self, prop) + _append_el(prop, value) + + def _append_bool(prop): + value = getattr(self, prop) + if not value is None: + value = "1" if value else "0" + _append_el(prop, value) + + for prop in [ + "partition_offset", + "sector_size", + "block_size", + "ftype", + "ftype_str", + "block_count", + "first_block", + "last_block" + ]: + _append_str(prop) + + #Output the one Boolean property + _append_bool("allocated_only") + + #Output the original volume's properties + if not self.original_volume is None or "original_volume" in diffs_whittle_set: + #Skip FileObject list, if any + if self.original_volume is None: + tmpel = ET.Element("delta:original_volume") + else: + tmpel = self.original_volume.to_partial_Element() + tmpel.tag = "delta:original_volume" + + if "original_volume" in diffs_whittle_set: + tmpel.attrib["delta:changed_property"] = "1" + + outel.append(tmpel) + + if len(diffs_whittle_set) > 0: + _logger.warning("Did not annotate all of the differing properties of this volume. Remaining properties: %r." % diffs_whittle_set) + + return outel + + @property + def allocated_only(self): + return self._allocated_only + + @allocated_only.setter + def allocated_only(self, val): + self._allocated_only = _boolcast(val) + + @property + def annos(self): + """Set of differential annotations. 
Expected members are the keys of this class's _diff_attr_names dictionary.""" + return self._annos + + @annos.setter + def annos(self, val): + _typecheck(val, set) + self._annos = val + + @property + def block_count(self): + return self._block_count + + @block_count.setter + def block_count(self, val): + self._block_count = _intcast(val) + + @property + def block_size(self): + return self._block_size + + @block_size.setter + def block_size(self, val): + self._block_size = _intcast(val) + + @property + def diffs(self): + return self._diffs + + @property + def first_block(self): + return self._first_block + + @first_block.setter + def first_block(self, val): + self._first_block = _intcast(val) + + @property + def ftype(self): + return self._ftype + + @ftype.setter + def ftype(self, val): + self._ftype = _intcast(val) + + @property + def ftype_str(self): + return self._ftype_str + + @ftype_str.setter + def ftype_str(self, val): + self._ftype_str = _strcast(val) + + @property + def last_block(self): + return self._last_block + + @last_block.setter + def last_block(self, val): + self._last_block = _intcast(val) + + @property + def original_volume(self): + return self._original_volume + + @original_volume.setter + def original_volume(self, val): + if not val is None: + _typecheck(val, VolumeObject) + self._original_volume= val + + @property + def partition_offset(self): + return self._partition_offset + + @partition_offset.setter + def partition_offset(self, val): + self._partition_offset = _intcast(val) + + @property + def sector_size(self): + return self._sector_size + + @sector_size.setter + def sector_size(self, val): + self._sector_size = _intcast(val) + +class HiveObject(object): + def __init__(self, *args, **kwargs): + self._cells = [] + + def __iter__(self): + """Yields all CellObjects directly attached to this VolumeObject.""" + for c in self._cells: + yield c + + def append(self, value): + _typecheck(value, CellObject) + self._cells.append(value) + + def print_regxml(self, output_fh=sys.stdout): + for cell in self._cells: + output_fh.write(cell.to_regxml()) + output_fh.write("\n") + + def to_Element(self): + outel = ET.Element("hive") + for cell in self._cells: + tmpel = cell.to_Element() + outel.append(tmpel) + return outel + +class ByteRun(object): + + _all_properties = set([ + "img_offset", + "fs_offset", + "file_offset", + "fill", + "len" + ]) + + def __init__(self, *args, **kwargs): + for prop in ByteRun._all_properties: + setattr(self, prop, kwargs.get(prop)) + + def __add__(self, other): + """ + Joins two ByteRun objects into a single run if possible. Returns a new object of the concatenation if successful, None if not. 
+ """ + _typecheck(other, ByteRun) + #Don't glom fills of different values + if self.fill != other.fill: + return None + + if None in [self.len, other.len]: + return None + + for prop in ["img_offset", "fs_offset", "file_offset"]: + if None in [getattr(self, prop), getattr(other, prop)]: + continue + if getattr(self, prop) + self.len == getattr(other, prop): + retval = copy.deepcopy(self) + retval.len += other.len + return retval + return None + + def __eq__(self, other): + #Check type + if other is None: + return False + if not isinstance(other, ByteRun): + if not _warned_byterun_badtypecomp: + _logger.warning("A ByteRun comparison was called against a non-ByteRun object: " + repr(other) + ".") + _warned_byterun_badtypecomp = True + return False + + #Check values + return \ + self.img_offset == other.img_offset and \ + self.fs_offset == other.fs_offset and \ + self.file_offset == other.file_offset and \ + self.fill == other.fill and \ + self.len == other.len + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + for prop in ByteRun._all_properties: + val = getattr(self, prop) + if not val is None: + parts.append("%s=%r" % (prop, val)) + return "ByteRun(" + ", ".join(parts) + ")" + + def populate_from_Element(self, e): + _typecheck(e, (ET.Element, ET.ElementTree)) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn == "byte_run" + + copied_attrib = copy.deepcopy(e.attrib) + + #Populate run properties from element attributes + for prop in ByteRun._all_properties: + if prop in copied_attrib: + val = copied_attrib.get(prop) + if not val is None: + setattr(self, prop, val) + del copied_attrib[prop] + #Note remaining properties + for prop in copied_attrib: + if prop not in _warned_byterun_attribs: + _warned_byterun_attribs.add(prop) + _logger.warning("No instructions present for processing this attribute found on a byte run: %r." % prop) + + def to_Element(self): + outel = ET.Element("byte_run") + for prop in ByteRun._all_properties: + val = getattr(self, prop) + if not val is None: + outel.attrib[prop] = str(val) + return outel + + @property + def file_offset(self): + return self._file_offset + + @file_offset.setter + def file_offset(self, val): + self._file_offset = _intcast(val) + + @property + def fill(self): + """There is an implicit assumption that the fill character is encoded as UTF-8.""" + return self._fill + + @fill.setter + def fill(self, val): + self._fill = _bytecast(val) + + @property + def fs_offset(self): + return self._fs_offset + + @fs_offset.setter + def fs_offset(self, val): + self._fs_offset = _intcast(val) + + @property + def img_offset(self): + return self._img_offset + + @img_offset.setter + def img_offset(self, val): + self._img_offset = _intcast(val) + + @property + def len(self): + return self._len + + @len.setter + def len(self, val): + self._len = _intcast(val) + +class ByteRuns(object): + """ + A list-like object for ByteRun objects. 
+ """ + #Must define these methods to adhere to the list protocol: + #__len__ + #__getitem__ + #__setitem__ + #__delitem__ + #__iter__ + #append + # + #Refs: + #http://www.rafekettler.com/magicmethods.html + #http://stackoverflow.com/a/8841520 + + _facet_values = [None, "data", "inode", "name"] + + def __init__(self, run_list=None, **kwargs): + self._facet = kwargs.get("facet") + self._listdata = [] + if isinstance(run_list, list): + for run in run_list: + self.append(run) + + def __delitem__(self, key): + del self._listdata[key] + + def __eq__(self, other): + """Compares the byte run lists and the facet (allowing a null facet to match "data").""" + #Check type + if other is None: + return False + _typecheck(other, ByteRuns) + + if self.facet != other.facet: + if set([self.facet, other.facet]) != set([None, "data"]): + return False + if len(self) != len(other): + #_logger.debug("len(self) = %d" % len(self)) + #_logger.debug("len(other) = %d" % len(other)) + return False + for (sbr_index, sbr) in enumerate(self): + obr = other[sbr_index] + #_logger.debug("sbr_index = %d" % sbr_index) + #_logger.debug("sbr = %r" % sbr) + #_logger.debug("obr = %r" % obr) + if sbr != obr: + return False + return True + + def __getitem__(self, key): + return self._listdata.__getitem__(key) + + def __iter__(self): + return iter(self._listdata) + + def __len__(self): + return self._listdata.__len__() + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + for run in self: + parts.append(repr(run)) + maybe_facet = "" + if self.facet: + maybe_facet = "facet=%r, " % self.facet + return "ByteRuns(" + maybe_facet + "run_list=[" + ", ".join(parts) + "])" + + def __setitem__(self, key, value): + _typecheck(value, ByteRun) + self._listdata[key] = value + + def append(self, value): + """ + Appends a ByteRun object to this container's list. + """ + _typecheck(value, ByteRun) + self._listdata.append(value) + + def glom(self, value): + """ + Appends a ByteRun object to this container's list, after attempting to join the run with the last run already stored. + """ + _typecheck(value, ByteRun) + if len(self._listdata) == 0: + self.append(value) + else: + last_run = self._listdata[-1] + maybe_new_run = last_run + value + if maybe_new_run is None: + self.append(value) + else: + self._listdata[-1] = maybe_new_run + + def iter_contents(self, raw_image, buffer_size=1048576, sector_size=512, errlog=None, statlog=None): + """ + Generator. Yields contents, as byte strings one block at a time, given a backing raw image path. Relies on The SleuthKit's img_cat, so contents can be extracted from any disk image type that TSK supports. + @param buffer_size The maximum size of the byte strings yielded. + @param sector_size The size of a disk sector in the raw image. Required by img_cat. + """ + if not isinstance(raw_image, str): + raise TypeError("iter_contents needs the string path to the image file. Received: %r." % raw_image) + + stderr_fh = None + if not errlog is None: + stderr_fh = open(errlog, "wb") + + status_fh = None + if not statlog is None: + status_fh = open(errlog, "wb") + + #The exit status of the last img_cat. 
+        last_status = None
+
+        try:
+            for run in self:
+                if run.len is None:
+                    raise AttributeError("Byte runs can't be extracted if a run length is undefined.")
+
+                len_to_read = run.len
+
+                #If we have a fill character, just pump out that character
+                if not run.fill is None and len(run.fill) > 0:
+                    while len_to_read > 0:
+                        #This multiplication and slice should handle multi-byte fill characters, in case that ever comes up.
+                        yield (run.fill * buffer_size)[ : min(len_to_read, buffer_size)]
+                        len_to_read -= buffer_size
+                    #Next byte run
+                    continue
+
+                if run.img_offset is None:
+                    raise AttributeError("Byte runs can't be extracted if missing a fill character and image offset.")
+
+                cmd = ["img_cat"]
+                cmd.append("-b")
+                cmd.append(str(sector_size))
+                cmd.append("-s")
+                cmd.append(str(run.img_offset//sector_size))
+                cmd.append("-e")
+                cmd.append(str( (run.img_offset + run.len)//sector_size))
+                cmd.append(raw_image)
+                p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=stderr_fh)
+
+                #Do the buffered read
+                while len_to_read > 0:
+                    buffer_data = p.stdout.read(buffer_size)
+                    yield_data = buffer_data[ : min(len_to_read, buffer_size)]
+                    if len(yield_data) > 0:
+                        yield yield_data
+                    else:
+                        #Let the subprocess terminate so we can see the exit status
+                        p.wait()
+                        last_status = p.returncode
+                        if last_status != 0:
+                            raise subprocess.CalledProcessError(last_status, " ".join(cmd), "img_cat failed.")
+                    len_to_read -= buffer_size
+        except Exception as e:
+            #Cleanup in an exception
+            if not stderr_fh is None:
+                stderr_fh.close()
+
+            if not status_fh is None:
+                if isinstance(e, subprocess.CalledProcessError):
+                    status_fh.write(e.returncode)
+                else:
+                    status_fh.write("1")
+                status_fh.close()
+            raise e
+
+        #Cleanup when all's gone well.
+        if not status_fh is None:
+            if not last_status is None:
+                status_fh.write(last_status)
+            status_fh.close()
+        if not stderr_fh is None:
+            stderr_fh.close()
+
+    def populate_from_Element(self, e):
+        _typecheck(e, (ET.Element, ET.ElementTree))
+
+        #Split into namespace and tagname
+        (ns, tn) = _qsplit(e.tag)
+        assert tn == "byte_runs"
+
+        if "facet" in e.attrib:
+            self.facet = e.attrib["facet"]
+
+        #Look through direct-child elements to populate run array
+        for ce in e.findall("./*"):
+            (cns, ctn) = _qsplit(ce.tag)
+            if ctn == "byte_run":
+                nbr = ByteRun()
+                nbr.populate_from_Element(ce)
+                self.append(nbr)
+
+    def to_Element(self):
+        outel = ET.Element("byte_runs")
+        for run in self:
+            tmpel = run.to_Element()
+            outel.append(tmpel)
+        if self.facet:
+            outel.attrib["facet"] = self.facet
+        return outel
+
+    @property
+    def facet(self):
+        """Expected to be null, "data", "inode", or "name". See FileObject.data_brs, FileObject.inode_brs, and FileObject.name_brs."""
+        return self._facet
+
+    @facet.setter
+    def facet(self, val):
+        if not val is None:
+            _typecheck(val, str)
+            if val not in ByteRuns._facet_values:
+                raise ValueError("A ByteRuns facet must be one of these: %r. Received: %r." % (ByteRuns._facet_values, val))
+        self._facet = val
+
+re_precision = re.compile(r"(?P<num>\d+)(?P<unit>(|m|n)s|d)?")
+class TimestampObject(object):
+    """
+    Encodes the "dftime" type. Wraps around dfxml.dftime, closely enough that this might just get folded into that class.
+
+    TimestampObjects implement a vs-null comparison workaround as in the SAS family of products: Null, for ordering purposes, is considered to be a value less than negative infinity.
+ """ + + timestamp_name_list = ["mtime", "atime", "ctime", "crtime", "dtime", "bkup_time"] + + def __init__(self, *args, **kwargs): + self.name = kwargs.get("name") + self.prec = kwargs.get("prec") + #_logger.debug("type(args) = %r" % type(args)) + #_logger.debug("args = %r" % (args,)) + if len(args) == 0: + self.time = None + elif len(args) == 1: + self.time = args[0] + else: + raise ValueError("Unexpected arguments. Whole args tuple: %r." % (args,)) + + self._timestamp = None + + def __eq__(self, other): + #Check type + if other is None: + return False + _typecheck(other, TimestampObject) + + if self.name != other.name: + return False + if self.prec != other.prec: + return False + if self.time != other.time: + return False + return True + + def __ge__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return False + else: + self._comparison_sanity_check(other) + return self.time.__ge__(other.time) + + def __gt__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return False + else: + self._comparison_sanity_check(other) + return self.time.__gt__(other.time) + + def __le__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return True + else: + self._comparison_sanity_check(other) + return self.time.__le__(other.time) + + def __lt__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return True + else: + self._comparison_sanity_check(other) + return self.time.__lt__(other.time) + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + if self.name: + parts.append("name=%r" % self.name) + if self.prec: + parts.append("prec=%r" % (self.prec,)) + if self.time: + parts.append("%r" % self.time) + return "TimestampObject(" + ", ".join(parts) + ")" + + def __str__(self): + if self.time: + return str(self.time) + else: + return self.__repr__() + + def _comparison_sanity_check(self, other): + if None in (self.time, other.time): + raise ValueError("Can't compare TimestampObjects: %r, %r." % self, other) + + def populate_from_Element(self, e): + _typecheck(e, (ET.Element, ET.ElementTree)) + if "prec" in e.attrib: + self.prec = e.attrib["prec"] + self.time = e.text + (ns, tn) = _qsplit(e.tag) + self.name = tn + + def to_Element(self): + _typecheck(self.name, str) + outel = ET.Element(self.name) + if self.prec: + outel.attrib["prec"] = "%d%s" % self.prec + if self.time: + outel.text = str(self.time) + return outel + + @property + def name(self): + """The type of timestamp - modified (mtime), accessed (atime), etc.""" + return self._name + + @name.setter + def name(self, value): + if not value is None: + if not value in TimestampObject.timestamp_name_list: + raise ValueError("The timestamp name must be in this list: %r. Received: %r." % (TimestampObject.timestamp_name_list, value)) + self._name = value + + @property + def prec(self): + """ + A pair, (resolution, unit); unit is a second (s), millisecond, nanosecond, or day (d). The default unit is "s". Can be passed as a string or a duple. 
+ """ + return self._prec + + @prec.setter + def prec(self, value): + if value is None: + self._prec = None + return self._prec + elif isinstance(value, tuple) and \ + len(value) == 2 and \ + isinstance(value[0], int) and \ + isinstance(value[1], str): + self._prec = value + return self._prec + + m = re_precision.match(value) + md = m.groupdict() + tup = (int(md["num"]), md.get("unit") or "s") + #_logger.debug("tup = %r" % (tup,)) + self._prec = tup + + @property + def time(self): + """ + The actual timestamp. A DFXML.dftime object. This class might be superfluous and end up collapsing into that... + """ + return self._time + + @time.setter + def time(self, value): + if value is None: + self._time = None + else: + checked_value = dfxml.dftime(value) + #_logger.debug("checked_value.timestamp() = %r" % checked_value.timestamp()) + self._time = checked_value + #Propagate timestamp value to other formats + self._timestamp = self._time.timestamp() + + @property + def timestamp(self): + """A Unix floating-point timestamp, as time.mktime returns. Currently, there is no setter for this property.""" + return self._timestamp + + +class FileObject(object): + """ + This class provides property accesses, an XML serializer (ElementTree-based), and a deserializer. + The properties interface is NOT function calls, but simple accesses. That is, the old _fileobject_ style: + + assert isinstance(fi, dfxml.fileobject) + fi.mtime() + + is now replaced with: + + assert isinstance(fi, Objects.FileObject) + fi.mtime + """ + + _all_properties = set([ + "alloc", + "alloc_inode", + "alloc_name", + "annos", + "atime", + "bkup_time", + "byte_runs", + "compressed", + "crtime", + "ctime", + "data_brs", + "dtime", + "error", + "filename", + "filesize", + "gid", + "id", + "inode", + "inode_brs", + "link_target", + "libmagic", + "md5", + "meta_type", + "mode", + "mtime", + "name_brs", + "name_type", + "nlink", + "original_fileobject", + "orphan", + "parent_object", + "partition", + "seq", + "sha1", + "uid", + "unalloc", + "unused", + "used" + ]) + + _br_facet_to_property = { + "data":"data_brs", + "inode":"inode_brs", + "name":"name_brs" + } + + #TODO There may be need in the future to compare the annotations as well. It complicates make_differential_dfxml too much for now. + _incomparable_properties = set([ + "annos", + "byte_runs", + "id", + "unalloc", + "unused" + ]) + + _diff_attr_names = { + "new":"delta:new_file", + "deleted":"delta:deleted_file", + "renamed":"delta:renamed_file", + "changed":"delta:changed_file", + "modified":"delta:modified_file", + "matched":"delta:matched" + } + + def __init__(self, *args, **kwargs): + #Prime all the properties + for prop in FileObject._all_properties: + if prop == "annos": + continue + setattr(self, prop, kwargs.get(prop)) + self._annos = set() + self._diffs = set() + + def __eq__(self, other): + if other is None: + return False + _typecheck(other, FileObject) + for prop in FileObject._all_properties: + if prop in FileObject._incomparable_properties: + continue + if getattr(self, prop) != getattr(other, prop): + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + + for prop in sorted(FileObject._all_properties): + #Save data byte runs for the end, as theirs lists can get really long. 
+ if prop not in ["byte_runs", "data_brs"]: + value = getattr(self, prop) + if not value is None: + parts.append("%s=%r" % (prop, value)) + + if self.data_brs: + parts.append("data_brs=%r" % self.byte_runs) + + return "FileObject(" + ", ".join(parts) + ")" + + def compare_to_original(self): + self._diffs = self.compare_to_other(self.original_fileobject, True) + + def compare_to_other(self, other, ignore_original=False): + _typecheck(other, FileObject) + + diffs = set() + + for propname in FileObject._all_properties: + if propname in FileObject._incomparable_properties: + continue + if ignore_original and propname == "original_fileobject": + continue + oval = getattr(other, propname) + sval = getattr(self, propname) + if oval is None and sval is None: + continue + if oval != sval: + #_logger.debug("propname, oval, sval: %r, %r, %r" % (propname, oval, sval)) + diffs.add(propname) + + return diffs + + def extract_facet(self, facet, image_path=None, buffer_size=1048576, partition_offset=None, sector_size=512, errlog=None, statlog=None, icat_threshold = 268435456): + """ + Generator. Extracts the facet with a SleuthKit tool, yielding chunks of the data. + + @param buffer_size The facet data is yielded in chunks of at most this parameter's size. Default 1MiB. + @param partition_offset The offset of the file's containing partition, in bytes. Needed for icat. If not given, the FileObject's VolumeObject will be used. If that's also absent, icat can't be used, and img_cat will instead be tried as a fallback (which means byte runs must be in the DFXML). + @param icat_threshold icat incurs extensive, non-sequential IO overhead to walk the filesystem to reach the facet's byte runs. img_cat can be called on each byte run reported in the DFXML file, but on fragmented files this incurs overhead in process spawning. Facets larger than this threshold are extracted with icat. Default 256MiB. Force icat by setting this to -1; force img_cat with infinity (float("inf")). + """ + + _image_path = image_path + if _image_path is None: + raise ValueError("The backing image path must be supplied.") + + _partition_offset = partition_offset + if _partition_offset is None: + if self.volume_object: + _partition_offset = self.volume_object.partition_offset + + #Try using icat; needs inode number and volume offset. We're additionally requiring the filesize be known. + #TODO The icat needs a little more experimentation. + if False and facet == "content" and \ + not self.filesize is None and \ + self.filesize >= icat_threshold and \ + not self.inode is None and \ + not _partition_offset is None: + _logger.debug("Extracting with icat: %r." 
% self) + + #Set up logging if desired + stderr_fh = sys.stderr + if not errlog is None: + stderr_fh = open(errlog, "wb") + + status_fh = None + if not statlog is None: + status_fh = open(errlog, "w") + + #Set up icat process + cmd = ["icat"] + cmd.append("-b") + cmd.append(str(sector_size)) + cmd.append("-o") + cmd.append(str(self.volume_object.partition_offset//sector_size)) + if not self.volume_object.ftype_str is None: + cmd.append("-f") + cmd.append(self.volume_object.ftype_str) + cmd.append(image_path) + cmd.append(str(self.inode)) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=stderr_fh) + + #Do a buffered read + len_to_read = self.filesize + while len_to_read > 0: + buffer_data = p.stdout.read(buffer_size) + yield_data = buffer_data[ : min(len_to_read, buffer_size)] + if len(yield_data) > 0: + yield yield_data + else: + #Let the subprocess terminate so we can see the exit status + p.wait() + last_status = p.returncode + + #Log the status if requested + if not status_fh is None: + status_fh.write(last_status) + + #Act on a bad status + if last_status != 0: + raise subprocess.CalledProcessError(last_status, " ".join(cmd), "icat failed.") + len_to_read -= buffer_size + + #Clean up file handles + if status_fh: status_fh.close() + if stderr_fh: stderr_fh.close() + + elif not self.byte_runs is None: + for chunk in self.byte_runs.iter_contents(_image_path, buffer_size, sector_size, errlog, statlog): + yield chunk + + def populate_from_Element(self, e): + """Populates this FileObject's properties from an ElementTree Element. The Element need not be retained.""" + global _warned_elements + _typecheck(e, (ET.Element, ET.ElementTree)) + + #_logger.debug("FileObject.populate_from_Element(%r)" % e) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn in ["fileobject", "original_fileobject", "parent_object"] + + #Map "delta:" attributes of s into the self.annos set + #_logger.debug("self.annos, before: %r." % self.annos) + _read_differential_annotations(FileObject._diff_attr_names, e, self.annos) + #_logger.debug("self.annos, after: %r." % self.annos) + + #Look through direct-child elements for other properties + for ce in e.findall("./*"): + (cns, ctn) = _qsplit(ce.tag) + #_logger.debug("Populating from child element: %r." % ce.tag) + + #Inherit any marked changes + for attr in ce.attrib: + #_logger.debug("Inspecting attr for diff. annos: %r." % attr) + (ns, an) = _qsplit(attr) + if an == "changed_property" and ns == dfxml.XMLNS_DELTA: + #_logger.debug("Identified changed property: %r." % ctn) + #TODO There may be a more elegant way of handling the hashes and any other attribute-dependent element-to-property mapping. Probably involving XPath. + if ctn == "hashdigest": + if "type" not in ce.attrib: + raise AttributeError("Attribute 'type' not found. Every hashdigest element should have a 'type' attribute to identify the hash type.") + self.diffs.add(ce.attrib["type"].lower()) + elif ctn == "byte_runs": + facet = ce.attrib.get("facet") + prop = FileObject._br_facet_to_property.get(facet, "data_brs") + self.diffs.add(prop) + else: + self.diffs.add(ctn) + + if ctn == "byte_runs": + #byte_runs might be for file contents, the inode/MFT entry, or the directory entry naming the file. Use the facet attribute to determine which. If facet is absent, assume they're data byte runs. 
+ if "facet" in ce.attrib: + if ce.attrib["facet"] not in FileObject._br_facet_to_property: + if not ce.attrib["facet"] in _warned_byterun_facets: + _warned_byterun_facets.add(ce.attrib["facet"]) + _logger.warning("byte_runs facet %r was unexpected. Will not interpret this element.") + else: + brs = ByteRuns() + brs.populate_from_Element(ce) + brs.facet = ce.attrib["facet"] + setattr(self, FileObject._br_facet_to_property[brs.facet], brs) + else: + self.byte_runs = ByteRuns() + self.byte_runs.populate_from_Element(ce) + elif ctn == "hashdigest": + if ce.attrib["type"].lower() == "md5": + self.md5 = ce.text + elif ce.attrib["type"].lower() == "sha1": + self.sha1 = ce.text + elif ctn == "original_fileobject": + self.original_fileobject = FileObject() + self.original_fileobject.populate_from_Element(ce) + elif ctn == "parent_object": + self.parent_object = FileObject() + self.parent_object.populate_from_Element(ce) + elif ctn in ["atime", "bkup_time", "crtime", "ctime", "dtime", "mtime"]: + setattr(self, ctn, TimestampObject()) + getattr(self, ctn).populate_from_Element(ce) + elif ctn in FileObject._all_properties: + setattr(self, ctn, ce.text) + else: + if (cns, ctn) not in _warned_elements: + _warned_elements.add((cns, ctn)) + _logger.warning("Uncertain what to do with this element: %r" % ce) + + def populate_from_stat(self, s): + """Populates FileObject fields from a stat() call.""" + import os + _typecheck(s, os.stat_result) + + self.mode = s.st_mode + self.inode = s.st_ino + self.nlink = s.st_nlink + self.uid = s.st_uid + self.gid = s.st_gid + self.filesize = s.st_size + #s.st_dev is ignored for now. + + if "st_mtime" in dir(s): + self.mtime = s.st_mtime + + if "st_atime" in dir(s): + self.atime = s.st_atime + + if "st_ctime" in dir(s): + self.ctime = s.st_ctime + + if "st_birthtime" in dir(s): + self.crtime = s.st_birthtime + + def to_Element(self): + """Creates an ElementTree Element with elements in DFXML schema order.""" + outel = ET.Element("fileobject") + + annos_whittle_set = copy.deepcopy(self.annos) + diffs_whittle_set = copy.deepcopy(self.diffs) + + for annodiff in FileObject._diff_attr_names: + if annodiff in annos_whittle_set: + outel.attrib[FileObject._diff_attr_names[annodiff]] = "1" + annos_whittle_set.remove(annodiff) + if len(annos_whittle_set) > 0: + _logger.warning("Failed to export some differential annotations: %r." % annos_whittle_set) + + def _anno_change(el): + if el.tag in self.diffs: + el.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(el.tag) + + def _anno_hash(el): + if el.attrib["type"] in self.diffs: + el.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(el.attrib["type"]) + + def _anno_byte_runs(el): + if "facet" in el.attrib: + prop = FileObject._br_facet_to_property[el.attrib["facet"]] + else: + prop = "data_brs" + if prop in self.diffs: + el.attrib["delta:changed_property"] = "1" + #_logger.debug("diffs_whittle_set = %r." 
% diffs_whittle_set) + diffs_whittle_set.remove(prop) + + #Recall that Element text must be a string + def _append_str(name, value): + """Note that empty elements should be created if the element was removed.""" + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element(name) + if not value is None: + tmpel.text = str(value) + _anno_change(tmpel) + outel.append(tmpel) + + def _append_time(name, value): + """Note that empty elements should be created if the element was removed.""" + if not value is None or name in diffs_whittle_set: + if not value is None and value.time: + tmpel = value.to_Element() + else: + tmpel = ET.Element(name) + _anno_change(tmpel) + outel.append(tmpel) + + def _append_bool(name, value): + """Note that empty elements should be created if the element was removed.""" + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element(name) + if not value is None: + tmpel.text = str(1 if value else 0) + _anno_change(tmpel) + outel.append(tmpel) + + _using_facets = False + def _append_byte_runs(name, value): + """The complicated part here is setting the "data" facet on the byte runs, because we assume that no facet definitions means that for this file, there's only the one byte_runs list for data.""" + #_logger.debug("_append_byte_runs(%r, %r)" % (name, value)) + if value or name in diffs_whittle_set: + if value: + tmpel = value.to_Element() + if "facet" in tmpel.attrib: + _using_facets = True + else: + tmpel = ET.Element("byte_runs") + propname_to_facet = { + "data_brs": "data", + "inode_brs": "inode", + "name_brs": "name" + } + if name in propname_to_facet: + _using_facets = True + tmpel.attrib["facet"] = propname_to_facet[name] + elif _using_facets: + tmpel.attrib["facet"] = propname_to_facet["data_brs"] + _anno_byte_runs(tmpel) + outel.append(tmpel) + + def _append_object(name, value, namespace_prefix=None): + """name must be the name of a property that has a to_Element() method. namespace_prefix will be prepended as-is to the element tag.""" + obj = value + if obj or name in diffs_whittle_set: + if obj: + tmpel = obj.to_Element() + else: + tmpel = ET.Element(name) + #Set the tag name here for properties like parent_object, a FileObject without being wholly a FileObject. + if namespace_prefix: + tmpel.tag = namespace_prefix + name + else: + tmpel.tag = name + _anno_change(tmpel) + outel.append(tmpel) + + def _append_hash(name, value): + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element("hashdigest") + tmpel.attrib["type"] = name + if not value is None: + tmpel.text = value + _anno_hash(tmpel) + outel.append(tmpel) + + #The parent object is a one-off. Duplicating the whole parent is wasteful, so create a shadow object that just outputs the important bits. + if not self.parent_object is None: + parent_object_shadow = FileObject() + parent_object_shadow.inode = self.parent_object.inode + _append_object("parent_object", parent_object_shadow) + + _append_str("filename", self.filename) + _append_str("error", self.error) + _append_str("partition", self.partition) + _append_str("id", self.id) + _append_str("name_type", self.name_type) + _append_str("filesize", self.filesize) + #TODO Define a better flag for if we're going to output elements. 
+ if self.alloc_name is None and self.alloc_inode is None: + _append_bool("alloc", self.alloc) + else: + _append_bool("alloc_inode", self.alloc_inode) + _append_bool("alloc_name", self.alloc_name) + _append_bool("used", self.used) + _append_bool("orphan", self.orphan) + _append_bool("compressed", self.compressed) + _append_str("inode", self.inode) + _append_str("meta_type", self.meta_type) + _append_str("mode", self.mode) + _append_str("nlink", self.nlink) + _append_str("uid", self.uid) + _append_str("gid", self.gid) + _append_time("mtime", self.mtime) + _append_time("ctime", self.ctime) + _append_time("atime", self.atime) + _append_time("crtime", self.crtime) + _append_str("seq", self.seq) + _append_time("dtime", self.dtime) + _append_time("bkup_time", self.bkup_time) + _append_str("link_target", self.link_target) + _append_str("libmagic", self.libmagic) + _append_byte_runs("inode_brs", self.inode_brs) + _append_byte_runs("name_brs", self.name_brs) + _append_byte_runs("data_brs", self.data_brs) + _append_hash("md5", self.md5) + _append_hash("sha1", self.sha1) + _append_object("original_fileobject", self.original_fileobject, "delta:") + + if len(diffs_whittle_set) > 0: + _logger.warning("Did not annotate all of the differing properties of this file. Remaining properties: %r." % diffs_whittle_set) + + return outel + + def to_dfxml(self): + return _ET_tostring(self.to_Element()) + + @property + def alloc(self): + """Note that setting .alloc will affect the value of .unalloc, and vice versa. The last one to set wins.""" + global _nagged_alloc + if not _nagged_alloc: + _logger.warning("The FileObject.alloc property is deprecated. Use .alloc_inode and/or .alloc_name instead. .alloc is proxied as True if alloc_inode and alloc_name are both True.") + _nagged_alloc = True + if self.alloc_inode and self.alloc_name: + return True + else: + return self._alloc + + @alloc.setter + def alloc(self, val): + self._alloc = _boolcast(val) + if not self._alloc is None: + self._unalloc = not self._alloc + + @property + def alloc_inode(self): + return self._alloc_inode + + @alloc_inode.setter + def alloc_inode(self, val): + self._alloc_inode = _boolcast(val) + + @property + def alloc_name(self): + return self._alloc_name + + @alloc_name.setter + def alloc_name(self, val): + self._alloc_name = _boolcast(val) + + @property + def annos(self): + """Set of differential annotations. 
Expected members are the keys of this class's _diff_attr_names dictionary.""" + return self._annos + + @annos.setter + def annos(self, val): + _typecheck(val, set) + self._annos = val + + @property + def atime(self): + return self._atime + + @atime.setter + def atime(self, val): + if val is None: + self._atime = None + elif isinstance(val, TimestampObject): + self._atime = val + else: + checked_val = TimestampObject(val, name="atime") + self._atime = checked_val + + @property + def bkup_time(self): + return self._bkup_time + + @bkup_time.setter + def bkup_time(self, val): + if val is None: + self._bkup_time = None + elif isinstance(val, TimestampObject): + self._bkup_time = val + else: + checked_val = TimestampObject(val, name="bkup_time") + self._bkup_time = checked_val + + @property + def byte_runs(self): + """This property is now a synonym for the data byte runs (.data_brs).""" + return self.data_brs + + @byte_runs.setter + def byte_runs(self, val): + self.data_brs = val + + @property + def compressed(self): + return self._compressed + + @compressed.setter + def compressed(self, val): + self._compressed = _boolcast(val) + + @property + def ctime(self): + return self._ctime + + @ctime.setter + def ctime(self, val): + if val is None: + self._ctime = None + elif isinstance(val, TimestampObject): + self._ctime = val + else: + checked_val = TimestampObject(val, name="ctime") + self._ctime = checked_val + + @property + def crtime(self): + return self._crtime + + @crtime.setter + def crtime(self, val): + if val is None: + self._crtime = None + elif isinstance(val, TimestampObject): + self._crtime = val + else: + checked_val = TimestampObject(val, name="crtime") + self._crtime = checked_val + + @property + def data_brs(self): + """The byte runs that store the file's content.""" + return self._data_brs + + @data_brs.setter + def data_brs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._data_brs = val + + @property + def diffs(self): + """This property intentionally has no setter. To populate, call compare_to_original() after assigning an original_fileobject.""" + return self._diffs + + @property + def dtime(self): + return self._dtime + + @dtime.setter + def dtime(self, val): + if val is None: + self._dtime = None + elif isinstance(val, TimestampObject): + self._dtime = val + else: + checked_val = TimestampObject(val, name="dtime") + self._dtime = checked_val + + @property + def error(self): + return self._error + + @error.setter + def error(self, val): + self._error = _strcast(val) + + @property + def filesize(self): + return self._filesize + + @filesize.setter + def filesize(self, val): + self._filesize = _intcast(val) + + @property + def gid(self): + return self._gid + + @gid.setter + def gid(self, val): + self._gid = _strcast(val) + + @property + def id(self): + return self._id + + @id.setter + def id(self, val): + self._id = _intcast(val) + + @property + def inode(self): + return self._inode + + @inode.setter + def inode(self, val): + self._inode = _intcast(val) + + @property + def libmagic(self): + return self._libmagic + + @libmagic.setter + def libmagic(self, val): + self._libmagic = _strcast(val) + + @property + def inode_brs(self): + """The byte run(s) that represents the file's metadata object (the inode or the MFT entry). In file systems that do not distinguish between inode and directory entry, e.g. 
FAT, .inode_brs should be equivalent to .name_brs, if both fields are present.""" + return self._inode_brs + + @inode_brs.setter + def inode_brs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._inode_brs = val + + @property + def meta_type(self): + return self._meta_type + + @meta_type.setter + def meta_type(self, val): + self._meta_type = _intcast(val) + + @property + def mode(self): + """The security mode is represented in the FileObject as a base-10 integer. It is also serialized as a decimal integer.""" + return self._mode + + @mode.setter + def mode(self, val): + self._mode = _intcast(val) + + @property + def mtime(self): + return self._mtime + + @mtime.setter + def mtime(self, val): + if val is None: + self._mtime = None + elif isinstance(val, TimestampObject): + self._mtime = val + else: + checked_val = TimestampObject(val, name="mtime") + self._mtime = checked_val + + @property + def name_brs(self): + """The byte run(s) that represents the file's name object (the directory entry). In file systems that do not distinguish between inode and directory entry, e.g. FAT, .inode_brs should be equivalent to .name_brs, if both fields are present.""" + return self._name_brs + + @name_brs.setter + def name_brs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._name_brs = val + + @property + def name_type(self): + return self._name_type + + @name_type.setter + def name_type(self, val): + if val is None: + self._name_type = val + else: + cast_val = _strcast(val) + if cast_val not in ["-", "p", "c", "d", "b", "r", "l", "s", "h", "w", "v"]: + raise ValueError("Unexpected name_type received: %r (casted to %r)." % (val, cast_val)) + self._name_type = cast_val + + @property + def nlink(self): + return self._nlink + + @nlink.setter + def nlink(self, val): + self._nlink = _intcast(val) + + @property + def orphan(self): + return self._orphan + + @orphan.setter + def orphan(self, val): + self._orphan = _boolcast(val) + + @property + def original_fileobject(self): + return self._original_fileobject + + @original_fileobject.setter + def original_fileobject(self, val): + if not val is None: + _typecheck(val, FileObject) + self._original_fileobject = val + + @property + def partition(self): + return self._partition + + @partition.setter + def partition(self, val): + self._partition = _intcast(val) + + @property + def parent_object(self): + """This object is an extremely sparse FileObject, containing just identifying information. Alternately, it can be an entire object reference to the parent Object, though uniqueness should be checked.""" + return self._parent_object + + @parent_object.setter + def parent_object(self, val): + if not val is None: + _typecheck(val, FileObject) + self._parent_object = val + + @property + def seq(self): + return self._seq + + @seq.setter + def seq(self, val): + self._seq = _intcast(val) + + @property + def uid(self): + return self._uid + + @uid.setter + def uid(self, val): + self._uid = _strcast(val) + + @property + def unalloc(self): + """Note that setting .unalloc will affect the value of .alloc, and vice versa. 
The last one to set wins.""" + return self._unalloc + + @unalloc.setter + def unalloc(self, val): + self._unalloc = _boolcast(val) + if not self._unalloc is None: + self._alloc = not self._unalloc + + @property + def unused(self): + return self._used + + @unused.setter + def unused(self, val): + self._unused = _intcast(val) + if not self._unused is None: + self._used = not self._unused + + @property + def used(self): + return self._used + + @used.setter + def used(self, val): + self._used = _intcast(val) + if not self._used is None: + self._unused = not self._used + + @property + def volume_object(self): + """Reference to the containing volume object. Not meant to be propagated with __repr__ or to_Element().""" + return self._volume_object + + @volume_object.setter + def volume_object(self, val): + if not val is None: + _typecheck(val, VolumeObject) + self._volume_object = val + + +class CellObject(object): + + _all_properties = set([ + "alloc", + "annos", + "byte_runs", + "cellpath", + "mtime", + "name", + "name_type", + "original_cellobject", + "parent_object", + "root" + ]) + + _diff_attr_names = { + "new":"delta:new_cell", + "deleted":"delta:deleted_cell", + "changed":"delta:changed_cell", + "modified":"delta:modified_cell", + "matched":"delta:matched" + } + + #TODO There may be need in the future to compare the annotations as well. + _incomparable_properties = set([ + "annos" + ]) + + def __init__(self, *args, **kwargs): + #These properties must be assigned first for sanity check dependencies + self.name_type = kwargs.get("name_type") + + for prop in CellObject._all_properties: + if prop == "annos": + setattr(self, prop, kwargs.get(prop, set())) + else: + setattr(self, prop, kwargs.get(prop)) + + self._diffs = set() + + def __eq__(self, other): + if other is None: + return False + _typecheck(other, CellObject) + for prop in CellObject._all_properties: + if prop in CellObject._incomparable_properties: + continue + if getattr(self, prop) != getattr(other, prop): + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + + for prop in sorted(list(CellObject._all_properties)): + if not getattr(self, prop) is None: + parts.append("%s=%r" % (prop, getattr(self, prop))) + + return "CellObject(" + ", ".join(parts) + ")" + + def compare_to_original(self): + self._diffs = self.compare_to_other(self.original_cellobject, True) + + def compare_to_other(self, other, ignore_original=False): + _typecheck(other, CellObject) + + diffs = set() + + for propname in CellObject._all_properties: + if propname in CellObject._incomparable_properties: + continue + if ignore_original and propname == "original_cellobject": + continue + oval = getattr(other, propname) + sval = getattr(self, propname) + if oval is None and sval is None: + continue + if oval != sval: + #_logger.debug("propname, oval, sval: %r, %r, %r" % (propname, oval, sval)) + diffs.add(propname) + + return diffs + + def populate_from_Element(self, e): + """Populates this CellObject's properties from an ElementTree Element. 
The Element need not be retained.""" + global _warned_elements + _typecheck(e, (ET.Element, ET.ElementTree)) + + _read_differential_annotations(CellObject._diff_attr_names, e, self.annos) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn in ["cellobject", "original_cellobject", "parent_object"] + + if e.attrib.get("root"): + self.root = e.attrib["root"] + + #Look through direct-child elements for other properties + for ce in e.findall("./*"): + (cns, ctn) = _qsplit(ce.tag) + if ctn == "alloc": + self.alloc = ce.text + elif ctn == "byte_runs": + self.byte_runs = ByteRuns() + self.byte_runs.populate_from_Element(ce) + elif ctn == "cellpath": + self.cellpath = ce.text + elif ctn == "mtime": + self.mtime = TimestampObject() + self.mtime.populate_from_Element(ce) + elif ctn == "name": + self.name = ce.text + elif ctn == "name_type": + self.name_type = ce.text + elif ctn == "original_cellobject": + self.original_cellobject = CellObject() + self.original_cellobject.populate_from_Element(ce) + elif ctn == "parent_object": + self.parent_object = CellObject() + self.parent_object.populate_from_Element(ce) + else: + if (cns, ctn) not in _warned_elements: + _warned_elements.add((cns, ctn)) + _logger.warning("Uncertain what to do with this element: %r" % ce) + + self.sanity_check() + + def sanity_check(self): + if self.name_type and self.name_type != "k": + if self.mtime: + _logger.info("Error occurred sanity-checking this CellObject: %r." % self) + raise ValueError("A Registry Key (node) is the only kind of CellObject that can have a timestamp.") + if self.root: + _logger.info("Error occurred sanity-checking this CellObject: %r." % self) + raise ValueError("A Registry Key (node) is the only kind of CellObject that can have the 'root' attribute.") + + def to_Element(self): + self.sanity_check() + + outel = ET.Element("cellobject") + + annos_whittle_set = copy.deepcopy(self.annos) + diffs_whittle_set = copy.deepcopy(self.diffs) + + for annodiff in CellObject._diff_attr_names: + if annodiff in annos_whittle_set: + outel.attrib[CellObject._diff_attr_names[annodiff]] = "1" + annos_whittle_set.remove(annodiff) + if len(annos_whittle_set) > 0: + _logger.warning("Failed to export some differential annotations: %r." % annos_whittle_set) + + def _anno_change(el): + if el.tag in self.diffs: + el.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(el.tag) + + #Recall that Element text must be a string + def _append_str(name, value): + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element(name) + if not value is None: + tmpel.text = str(value) + _anno_change(tmpel) + outel.append(tmpel) + + def _append_object(name, value): + if not value is None or name in diffs_whittle_set: + if value is None: + tmpel = ET.Element(name) + else: + tmpel = value.to_Element() + _anno_change(tmpel) + outel.append(tmpel) + + #TODO root should be an element too. Revise schema. + if self.root: + outel.attrib["root"] = str(self.root) + + _append_str("cellpath", self.cellpath) + _append_str("name", self.name) + _append_str("name_type", self.name_type) + _append_str("alloc", self.alloc) + _append_object("mtime", self.mtime) + _append_object("byte_runs", self.byte_runs) + _append_object("original_cellobject", self.original_cellobject) + + if len(diffs_whittle_set) > 0: + _logger.warning("Did not annotate all of the differing properties of this file. Remaining properties: %r." 
% diffs_whittle_set) + + return outel + + def to_regxml(self): + return _ET_tostring(self.to_Element()) + + @property + def alloc(self): + return self._alloc + + @alloc.setter + def alloc(self, val): + self._alloc = _boolcast(val) + + @property + def annos(self): + """Set of differential annotations. Expected members are the keys of this class's _diff_attr_names dictionary.""" + return self._annos + + @annos.setter + def annos(self, val): + _typecheck(val, set) + self._annos = val + + @property + def byte_runs(self): + return self._byte_runs + + @byte_runs.setter + def byte_runs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._byte_runs = val + + @property + def cellpath(self): + return self._cellpath + + @cellpath.setter + def cellpath(self, val): + if not val is None: + _typecheck(val, str) + self._cellpath = val + + @property + def diffs(self): + return self._diffs + + @diffs.setter + def diffs(self, value): + _typecheck(value, set) + self._diffs = value + + @property + def mtime(self): + return self._mtime + + @mtime.setter + def mtime(self, val): + if val is None: + self._mtime = None + elif isinstance(val, TimestampObject): + self._mtime = val + else: + self._mtime = TimestampObject(val, name="mtime") + self.sanity_check() + + @property + def name(self): + return self._name + + @name.setter + def name(self, val): + if not val is None: + _typecheck(val, str) + self._name = val + + @property + def name_type(self): + return self._name_type + + @name_type.setter + def name_type(self, val): + if not val is None: + assert val in ["k", "v"] + self._name_type = val + + @property + def original_cellobject(self): + return self._original_cellobject + + @original_cellobject.setter + def original_cellobject(self, val): + if not val is None: + _typecheck(val, CellObject) + self._original_cellobject = val + + @property + def parent_object(self): + """This object is an extremely sparse CellObject, containing just identifying information. Alternately, it can be an entire object reference to the parent Object, though uniqueness should be checked.""" + return self._parent_object + + @parent_object.setter + def parent_object(self, val): + if not val is None: + _typecheck(val, CellObject) + self._parent_object = val + + @property + def root(self): + return self._root + + @root.setter + def root(self, val): + self._root = _boolcast(val) + + +def iterparse(filename, events=("start","end"), dfxmlobject=None): + """ + Generator. Yields a stream of populated DFXMLObjects, VolumeObjects and FileObjects, paired with an event type ("start" or "end"). The DFXMLObject and VolumeObjects do NOT have their child lists populated with this method - that is left to the calling program. + + The event type interface is meant to match the interface of ElementTree's iterparse; this is simply for familiarity's sake. DFXMLObjects and VolumeObjects are yielded with "start" when the stream of VolumeObject or FileObjects begins - that is, they are yielded after being fully constructed up to the potentially-lengthy child object stream. FileObjects are yielded only with "end". + + @param filename: A string + @param events: Events. Optional. A tuple of strings, containing "start" and/or "end". + @param dfxmlobject: A DFXMLObject document. Optional. A DFXMLObject is created and yielded in the object stream if this argument is not supplied. + """ + + #The DFXML stream file handle. 
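The generator is meant to be consumed in a loop over (event, object) pairs, as described in the docstring above. A small sketch, assuming the module is importable as Objects and that "sample.dfxml" is a placeholder for an existing DFXML file (a file name not ending in "xml" is instead treated as a disk image and handed to fiwalk):

    import Objects

    tally = 0
    for (event, obj) in Objects.iterparse("sample.dfxml"):
        # FileObjects are only yielded with the "end" event.
        if event == "end" and isinstance(obj, Objects.FileObject):
            if obj.alloc_inode and obj.alloc_name:
                tally += 1
    print("Allocated files: %d" % tally)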
+ fh = None + subp = None + subp_command = ["fiwalk", "-x", filename] + if filename.endswith("xml"): + fh = open(filename, "rb") + else: + subp = subprocess.Popen(subp_command, stdout=subprocess.PIPE) + fh = subp.stdout + + _events = set() + for e in events: + if not e in ("start","end"): + raise ValueError("Unexpected event type: %r. Expecting 'start', 'end'." % e) + _events.add(e) + + dobj = dfxmlobject or DFXMLObject() + + #The only way to efficiently populate VolumeObjects is to populate the object when the stream has hit its first FileObject. + vobj = None + + #It doesn't seem ElementTree allows fetching parents of Elements that are incomplete (just hit the "start" event). So, build a volume Element when we've hit "", glomming all elements until the first fileobject is hit. + #Likewise with the Element for the DFXMLObject. + dfxml_proxy = None + volume_proxy = None + + #State machine, used to track when the first fileobject of a volume is encountered. + READING_START = 0 + READING_PRESTREAM = 1 #DFXML metadata, pre-Object stream + READING_VOLUMES = 2 + READING_FILES = 3 + READING_POSTSTREAM = 4 #DFXML metadata, post-Object stream (typically the element) + _state = READING_START + + for (ETevent, elem) in ET.iterparse(fh, events=("start-ns", "start", "end")): + #View the object event stream in debug mode + #_logger.debug("(event, elem) = (%r, %r)" % (ETevent, elem)) + #if ETevent in ("start", "end"): + # _logger.debug("_ET_tostring(elem) = %r" % _ET_tostring(elem)) + + #Track namespaces + if ETevent == "start-ns": + dobj.add_namespace(*elem) + continue + + #Split tag name into namespace and local name + (ns, ln) = _qsplit(elem.tag) + + if ETevent == "start": + if ln == "dfxml": + if _state != READING_START: + raise ValueError("Encountered a element, but the parser isn't in its start state. Recursive declarations aren't supported at this time.") + dfxml_proxy = ET.Element(elem.tag) + for k in elem.attrib: + #Note that xmlns declarations don't appear in elem.attrib. + dfxml_proxy.attrib[k] = elem.attrib[k] + _state = READING_PRESTREAM + elif ln == "volume": + if _state == READING_PRESTREAM: + #Cut; yield DFXMLObject now. + dobj.populate_from_Element(dfxml_proxy) + if "start" in _events: + yield ("start", dobj) + #Start populating a new Volume proxy. + volume_proxy = ET.Element(elem.tag) + for k in elem.attrib: + volume_proxy.attrib[k] = elem.attrib[k] + _state = READING_VOLUMES + elif ln == "fileobject": + if _state == READING_PRESTREAM: + #Cut; yield DFXMLObject now. + dobj.populate_from_Element(dfxml_proxy) + if "start" in _events: + yield ("start", dobj) + elif _state == READING_VOLUMES: + #_logger.debug("Encountered a fileobject while reading volume properties. Yielding volume now.") + #Cut; yield VolumeObject now. + if volume_proxy is not None: + vobj = VolumeObject() + vobj.populate_from_Element(volume_proxy) + if "start" in _events: + yield ("start", vobj) + #Reset + volume_proxy.clear() + volume_proxy = None + _state = READING_FILES + elif ETevent == "end": + if ln == "fileobject": + if _state in (READING_PRESTREAM, READING_POSTSTREAM): + #This particular branch can be reached if there are trailing fileobject elements after the volume element. This would happen if a tool needed to represent files (likely reassembled fragments) found outside all the partitions. + #More frequently, we hit this point when there are no volume groupings. 
+ vobj = None + fi = FileObject() + fi.populate_from_Element(elem) + fi.volume_object = vobj + #_logger.debug("fi = %r" % fi) + if "end" in _events: + yield ("end", fi) + #Reset + elem.clear() + elif elem.tag == "dfxml": + if "end" in _events: + yield ("end", dobj) + elif elem.tag == "volume": + if "end" in _events: + yield ("end", vobj) + _state = READING_POSTSTREAM + elif _state == READING_VOLUMES: + #This is a volume property; glom onto the proxy. + if volume_proxy is not None: + volume_proxy.append(elem) + elif _state == READING_PRESTREAM: + if ln in ["metadata", "creator", "source"]: + #This is a direct child of the DFXML document property; glom onto the proxy. + if dfxml_proxy is not None: + dfxml_proxy.append(elem) + + #If we called Fiwalk, double-check that it exited successfully. + if not subp is None: + _logger.debug("Calling wait() to let the Fiwalk subprocess terminate...") #Just reading from subp.stdout doesn't let the process terminate; it only finishes working. + subp.wait() + if subp.returncode != 0: + e = subprocess.CalledProcessError("There was an error running Fiwalk.") + e.returncode = subp.returncode + e.cmd = subp_command + raise e + _logger.debug("...Done.") + +def parse(filename): + """Returns a DFXMLObject populated from the contents of the (string) filename argument.""" + retval = None + appender = None + for (event, obj) in iterparse(filename): + if event == "start": + if isinstance(obj, DFXMLObject): + retval = obj + appender = obj + elif isinstance(obj, VolumeObject): + retval.append(obj) + appender = obj + elif event == "end": + if isinstance(obj, VolumeObject): + appender = retval + elif isinstance(obj, FileObject): + appender.append(obj) + return retval + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + + logging.basicConfig(level=logging.DEBUG) + #Run unit tests + + assert _intcast(-1) == -1 + assert _intcast("-1") == -1 + assert _qsplit("{http://www.w3.org/2001/XMLSchema}all") == ("http://www.w3.org/2001/XMLSchema","all") + assert _qsplit("http://www.w3.org/2001/XMLSchema}all") == (None, "http://www.w3.org/2001/XMLSchema}all") + + + fi = FileObject() + + #Check property setting + fi.mtime = "1999-12-31T23:59:59Z" + _logger.debug("fi = %r" % fi) + + #Check bad property setting + failed = None + try: + fi.mtime = "Not a timestamp" + failed = False + except: + failed = True + _logger.debug("fi = %r" % fi) + _logger.debug("failed = %r" % failed) + assert failed + + t0 = TimestampObject(prec="100ns", name="mtime") + _logger.debug("t0 = %r" % t0) + assert t0.prec[0] == 100 + assert t0.prec[1] == "ns" + t1 = TimestampObject("2009-01-23T01:23:45Z", prec="2", name="atime") + _logger.debug("t1 = %r" % t1) + assert t1.prec[0] == 2 + assert t1.prec[1] == "s" + + print("Unit tests passed.") diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/report_silent_changes.py tcpflow-1.4.5+repack1/src/dfxml/python/report_silent_changes.py --- tcpflow-1.4.4+repack1/src/dfxml/python/report_silent_changes.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/report_silent_changes.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +""" +This program takes a differentially-annotated DFXML file as input, and outputs a DFXML document that contains 'Silent' changes. For instance, a changed checksum with no changed timestamps would be 'Silent.' 
+""" + +__version__ = "0.1.0" + +import os +import logging +import Objects +import make_differential_dfxml + +_logger = logging.getLogger(os.path.basename(__file__)) + +def main(): + d = Objects.DFXMLObject() + current_appender = d + tally = 0 + for (event, obj) in Objects.iterparse(args.infile): + if event == "start": + #Inherit namespaces + if isinstance(obj, Objects.DFXMLObject): + for (prefix, url) in obj.iter_namespaces(): + d.add_namespace(prefix, url) + #Group files by volume + elif isinstance(obj, Objects.VolumeObject): + d.append(obj) + current_appender = obj + elif event == "end": + if isinstance(obj, Objects.VolumeObject): + current_appender = d + elif isinstance(obj, Objects.FileObject): + if "_changed" not in obj.diffs: + if "_modified" in obj.diffs or "_renamed" in obj.diffs: + current_appender.append(obj) + tally += 1 + print(d.to_dfxml()) + _logger.info("Found %d suspiciously-changed files." % tally) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument("infile") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + if not args.infile.endswith("xml"): + raise Exception("Input file should be a DFXML file, and should end with 'xml': %r." % args.infile) + + if not os.path.exists(args.infile): + raise Exception("Input file does not exist: %r." % args.infile) + + main() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/summarize_differential_dfxml.py tcpflow-1.4.5+repack1/src/dfxml/python/summarize_differential_dfxml.py --- tcpflow-1.4.4+repack1/src/dfxml/python/summarize_differential_dfxml.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/summarize_differential_dfxml.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,336 @@ +#!/usr/bin/env python3 + +__version__ = "0.8.2" + +import os +import logging +import Objects +import idifference +import copy +import collections +import make_differential_dfxml +import operator + +_logger = logging.getLogger(os.path.basename(__file__)) + +#Only issue a potentially verbose warning once +_nagged_timestamp_format = False + +class FOCounter(object): + "Counter for FileObjects. Does not count differences (differential annotations)." 
+ + def __init__(self): + self._inodes = set() + self._fo_tally = 0 + self._fo_unalloc_unmatch_tally = 0 + self._fo_allocation_tallies_inode = {True:0, False:0, None:0} + self._fo_allocation_tallies_name = {True:0, False:0, None:0} + + def add(self, obj): + assert isinstance(obj, Objects.FileObject) + self._inodes.add((obj.partition, obj.inode)) + self._fo_tally += 1 + + self._fo_allocation_tallies_inode[obj.alloc_inode] += 1 + self._fo_allocation_tallies_name[obj.alloc_name] += 1 + if not (obj.alloc_name and obj.alloc_inode) and obj.original_fileobject is None: + self._fo_unalloc_unmatch_tally += 1 + + @property + def inode_tally(self): + return len(self._inodes) + + @property + def fo_tally(self): + return self._fo_tally + + @property + def fo_unalloc_unmatch_tally(self): + return self._fo_unalloc_unmatch_tally + + @property + def fo_tally_alloc_inode(self): + return self._fo_allocation_tallies_inode[True] + + @property + def fo_tally_alloc_name(self): + return self._fo_allocation_tallies_name[True] + + @property + def fo_tally_nullalloc_inode(self): + return self._fo_allocation_tallies_inode[None] + + @property + def fo_tally_nullalloc_name(self): + return self._fo_allocation_tallies_name[None] + + @property + def fo_tally_unalloc_inode(self): + return self._fo_allocation_tallies_inode[False] + + @property + def fo_tally_unalloc_name(self): + return self._fo_allocation_tallies_name[False] + +def report(dfxmlobject, sort_by=None, summary=None, timestamp=None): + new_files = [] + deleted_files = [] + deleted_files_matched = [] + deleted_files_unmatched = [] + renamed_files = [] + renamed_files_directory = [] + renamed_files_regular = [] + renamed_files_other = [] + renamed_files_type_changed = [] + renamed_files_type_changes = collections.defaultdict(int) #Key: (old name_type, new name_type); value: counter + renamed_files_content_matches = [] + modified_files = [] + changed_files = [] + unchanged_files = [] + + obj_alloc_counters = [FOCounter(), FOCounter()] + matched_files_tally = 0 + + def _is_matched(obj): + _matched = "matched" in obj.annos + return _matched + + #Group objects by differential annotations + for obj in dfxmlobject: + if isinstance(obj, Objects.FileObject): + if "matched" in obj.annos: + matched_files_tally += 1 + + #_logger.debug("Inspecting %s for changes" % obj) + if "new" in obj.annos: + new_files.append(obj) + elif "deleted" in obj.annos: + deleted_files.append(obj) + if _is_matched(obj): + deleted_files_matched.append(obj) + else: + deleted_files_unmatched.append(obj) + elif "renamed" in obj.annos: + #Count content matches + if obj.original_fileobject.sha1 == obj.sha1: + renamed_files_content_matches.append(obj) + + renamed_files.append(obj) + if obj.name_type != obj.original_fileobject.name_type: + renamed_files_type_changed.append(obj) + renamed_files_type_changes[(obj.original_fileobject.name_type or "", obj.name_type or "")] += 1 + elif obj.name_type == "r": + renamed_files_regular.append(obj) + elif obj.name_type == "d": + renamed_files_directory.append(obj) + else: + renamed_files_other.append(obj) + elif "modified" in obj.annos: + modified_files.append(obj) + elif "changed" in obj.annos: + changed_files.append(obj) + else: + unchanged_files.append(obj) + + #Count files of the post image + if "deleted" in obj.annos: + #Don't count the "Ghost" files created for deleted files that weren't matched between images + if _is_matched(obj): + obj_alloc_counters[1].add(obj) + else: + obj_alloc_counters[1].add(obj) + #Count files of the baseline image + if 
obj.original_fileobject: + obj_alloc_counters[0].add(obj.original_fileobject) + elif isinstance(obj, Objects.VolumeObject): + #TODO + pass + + def _sortkey_singlefi(): + """Return a sorting key function, fit for use in sorted() on a list of FileObjects.""" + def _key_by_path(fi): + return ( + fi.filename or "", + str(fi.mtime) or "n/a", + (fi.original_fileobject and fi.original_fileobject.filename) or "", + (fi.original_fileobject and str(fi.original_fileobject.mtime)) or "n/a" + ) + def _key_by_times(fi): + return ( + str(fi.mtime) or "n/a", + str(fi.crtime) or "n/a", + fi.filename, + (fi.original_fileobject and str(fi.original_fileobject.mtime)) or "n/a", + (fi.original_fileobject and str(fi.original_fileobject.crtime)) or "n/a", + (fi.original_fileobject and fi.original_fileobject.filename) or "" + ) + if sort_by == "path": + return _key_by_path + else: #Default: "times" + return _key_by_times + + def _format_timestamp(t): + """Takes a timestamp, returns a string.""" + if t is None: + return "n/a" + if timestamp: + if t.timestamp: + return str(t.timestamp) + else: + if not _nagged_timestamp_format: + _nagged_timestamp_format = True + _logger.warning("Tried to format a Unix timestamp, but failed.") + return "n/a" + else: + return str(t) + + idifference.h2("New files:") + new_files_sorted = sorted(new_files, key=_sortkey_singlefi()) + res = [(_format_timestamp(obj.mtime), obj.filename or "", obj.filesize) for obj in new_files_sorted] + idifference.table(res) + + idifference.h2("Deleted files:") + deleted_files_sorted = sorted(deleted_files, key=_sortkey_singlefi()) + res = [( + obj.original_fileobject.mtime, + obj.original_fileobject.filename or "", + obj.original_fileobject.filesize + ) for obj in deleted_files_sorted] + idifference.table(res) + + def _sortkey_renames(): + def _key_by_path(fi): + return ( + fi.original_fileobject.filename or "", + fi.filename or "", + str(fi.mtime) or "", + str(fi.original_fileobject.mtime) or "" + ) + def _key_by_times(fi): + return ( + str(fi.mtime) or "n/a", + str(fi.ctime) or "n/a", + str(fi.atime) or "n/a", + str(fi.dtime) or "n/a", + str(fi.crtime) or "n/a", + fi.original_fileobject.filename or "", + fi.filename or "" + ) + if sort_by == "path": + return _key_by_path + else: #Default: "times" + return _key_by_times + + def _enumerated_changes(filelist): + res = [] + for fi in filelist: + diffs_remaining = copy.deepcopy(fi.diffs) + if "filename" in diffs_remaining: + diffs_remaining -= {"filename"} + res.append(("Renamed", "", fi.original_fileobject.filename, "renamed to", fi.filename)) + for timeattr in Objects.TimestampObject.timestamp_name_list: + if timeattr in diffs_remaining: + diffs_remaining -= {timeattr} + res.append(( + fi.filename or "", + "%s changed, " % timeattr, + _format_timestamp(getattr(fi.original_fileobject, timeattr)), + "->", + _format_timestamp(getattr(fi, timeattr)) + )) + for diff in sorted(diffs_remaining): + diffs_remaining -= {diff} + res.append(( + fi.filename or "", + "%s changed, " % diff, + getattr(fi.original_fileobject, diff) or "" + "->", + getattr(fi, diff) or "", + )) + return res + + idifference.h2("Renamed files:") + renamed_files_sorted = sorted(renamed_files, key=_sortkey_renames()) + res = _enumerated_changes(renamed_files_sorted) + idifference.table(res, break_on_change=True) + + idifference.h2("Files with modified contents:") + modified_files_sorted = sorted(modified_files, key=_sortkey_singlefi()) + res = _enumerated_changes(modified_files_sorted) + idifference.table(res, break_on_change=True) + + 
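To make the renamed/modified tables above concrete: a hypothetical renamed file whose diffs set is {"filename", "mtime"} would contribute rows of roughly the following shape from _enumerated_changes (all values invented; the exact timestamp strings come from _format_timestamp above):

    # Hypothetical input: fi.diffs == {"filename", "mtime"}
    # _enumerated_changes([fi]) would then return roughly:
    [
        ("Renamed", "", "docs/old.txt", "renamed to", "docs/new.txt"),
        ("docs/new.txt", "mtime changed, ", "2009-01-23T01:23:45Z", "->", "2010-02-24T02:24:46Z"),
    ]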
idifference.h2("Files with changed properties:") + changed_files_sorted = sorted(changed_files, key=_sortkey_singlefi()) + res = _enumerated_changes(changed_files_sorted) + idifference.table(res, break_on_change=True) + + if summary: + idifference.h2("Summary:") + summ_recs = [ + ("Prior image's file (file object) tally", str(obj_alloc_counters[0].fo_tally)), + (" Inode allocation", ""), + (" Allocated", str(obj_alloc_counters[0].fo_tally_alloc_inode)), + (" Unallocated", str(obj_alloc_counters[0].fo_tally_unalloc_inode)), + (" Unknown", str(obj_alloc_counters[0].fo_tally_nullalloc_inode)), + (" Name allocation", ""), + (" Allocated", str(obj_alloc_counters[0].fo_tally_alloc_name)), + (" Unallocated", str(obj_alloc_counters[0].fo_tally_unalloc_name)), + (" Unknown", str(obj_alloc_counters[0].fo_tally_nullalloc_name)), + (" Unallocated, unmatched", obj_alloc_counters[0].fo_unalloc_unmatch_tally), + ("Prior image's file (inode) tally", str(obj_alloc_counters[0].inode_tally)), + ("Current image's file (file object) tally", str(obj_alloc_counters[1].fo_tally)), + (" Inode allocation", ""), + (" Allocated", str(obj_alloc_counters[1].fo_tally_alloc_inode)), + (" Unallocated", str(obj_alloc_counters[1].fo_tally_unalloc_inode)), + (" Unknown", str(obj_alloc_counters[1].fo_tally_nullalloc_inode)), + (" Name allocation", ""), + (" Allocated", str(obj_alloc_counters[1].fo_tally_alloc_name)), + (" Unallocated", str(obj_alloc_counters[1].fo_tally_unalloc_name)), + (" Unknown", str(obj_alloc_counters[1].fo_tally_nullalloc_name)), + (" Unallocated, unmatched", obj_alloc_counters[1].fo_unalloc_unmatch_tally), + ("Current image's file (inode) tally", str(obj_alloc_counters[1].inode_tally)), + ("Matched files", str(matched_files_tally)), + ("", ""), + ("New files", str(len(new_files))), + ("Deleted files", str(len(deleted_files))), + (" Unmatched", str(len(deleted_files_unmatched))), + (" Matched", str(len(deleted_files_matched))), + ("Renamed files", str(len(renamed_files))), + (" Directories", str(len(renamed_files_directory))), + (" Regular files", str(len(renamed_files_regular))), + (" Other", str(len(renamed_files_other))), + (" Type changed", str(len(renamed_files_type_changed))), + ] + for key in sorted(renamed_files_type_changes.keys()): + summ_recs.append((" %s -> %s" % key, str(renamed_files_type_changes[key]))) + summ_recs += [ + (" Content matches", str(len(renamed_files_content_matches))), + ("Files with modified content", str(len(modified_files))), + ("Files with changed file properties", str(len(changed_files))) + ] + + idifference.table(summ_recs) + +def main(): + global args + dfxmlobject = Objects.parse(args.infile) + report(dfxmlobject, sort_by=args.sort_by, summary=args.summary) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", action="store_true") + parser.add_argument("--sort-by", help="Sorts file lists. Pass one of these arguments: \"times\" or \"path\".") + parser.add_argument("--summary",help="output summary statistics of file system changes",action="store_true", default=False) + parser.add_argument("infile", help="A differential DFXML file. Should include the optional 'delta:matched' attributes for counts to work correctly.") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + if not args.infile.endswith("xml"): + raise Exception("Input file should be a DFXML file, and should end with 'xml': %r." 
% args.infile) + + if not os.path.exists(args.infile): + raise Exception("Input file does not exist: %r." % args.infile) + + main() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_dfxml_tool.sh tcpflow-1.4.5+repack1/src/dfxml/python/test_dfxml_tool.sh --- tcpflow-1.4.4+repack1/src/dfxml/python/test_dfxml_tool.sh 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_dfxml_tool.sh 2015-08-26 03:35:59.000000000 +0000 @@ -25,7 +25,7 @@ iter=0 for x in "${DT_OPTIONS[@]}"; do echo "Iteration $iter: Testing $x" >&2 - "$PYTHON2" dfxml_tool.py $x .. > dfxml_tool_p2_${iter}.dfxml + "$PYTHON2" dfxml_tool.py $x ../src > dfxml_tool_p2_${iter}.dfxml # "$PYTHON3" dfxml_tool.py "--$x" .. > dfxml_tool_p3_${iter}.dfxml iter=$(($iter+1)) done diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_mac_timelines.sh tcpflow-1.4.5+repack1/src/dfxml/python/test_mac_timelines.sh --- tcpflow-1.4.4+repack1/src/dfxml/python/test_mac_timelines.sh 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_mac_timelines.sh 2015-08-26 03:35:59.000000000 +0000 @@ -19,8 +19,17 @@ "$PYTHON3" demo_mac_timeline_iter.py ../samples/simple.xml >demo_mac_timeline_iter_simple_p3.txt test 12 == $(cat demo_mac_timeline_iter_simple_p3.txt | wc -l) +"$PYTHON2" demo_mac_timeline_objects.py ../samples/simple.xml >demo_mac_timeline_objects_simple_p2.txt +test 12 == $(cat demo_mac_timeline_iter_simple_p2.txt | wc -l) + +"$PYTHON3" demo_mac_timeline_objects.py ../samples/simple.xml >demo_mac_timeline_objects_simple_p3.txt +test 12 == $(cat demo_mac_timeline_iter_simple_p3.txt | wc -l) + "$PYTHON3" demo_mac_timeline.py ../samples/difference_test_1.xml >demo_mac_timeline_dt1.txt test 9 == $(cat demo_mac_timeline_dt1.txt | wc -l) "$PYTHON3" demo_mac_timeline_iter.py ../samples/difference_test_1.xml >demo_mac_timeline_iter_dt1.txt test 9 == $(cat demo_mac_timeline_iter_dt1.txt | wc -l) + +"$PYTHON3" demo_mac_timeline_objects.py ../samples/difference_test_1.xml >demo_mac_timeline_objects_dt1.txt +test 9 == $(cat demo_mac_timeline_objects_dt1.txt | wc -l) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/dfxml.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/dfxml.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/dfxml.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/dfxml.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,1689 @@ +#!/usr/bin/env python +# +# dfxml.py +# Digital Forensics XML classes + +"""Digital Forensics XML classes. +This module contains a number of classes for dealing with dfxml files, both using +the XML DOM model and using the EXPAT model. + +The following moduel functions are defined: + isone(x) - returns true if something is equal to 1 (useful for 1 + safeInt(x) - converts something to an int but never raises an exception + +The following classes are defined in this module: + byte_run - the class for representing a run on the disk + dftime - represents time. Can be in either Unix timestamp or ISO8601. + Interconverts as necessary. + fileobject - represents a DFXML fileobject. + + +byte_runs() is function that returns an array of byterun objects. +Each object has the attributes: + file_offset - offset from the beginning of the file + img_offset - offset from the beginning of the image + len - the number of bytes + fs_offset - offset from the beginning of the file system + +where encoding, if present, is 0 for raw, 1 for NTFS compressed. 
+ +""" +import sys +import re +from sys import stderr +from subprocess import Popen,PIPE +import base64 +import hashlib +import os + +import datetime + +import logging +_logger = logging.getLogger(os.path.basename(__file__)) + +__version__ = "1.0.1" + +tsk_virtual_filenames = set(['$FAT1','$FAT2']) + +XMLNS_DC = "http://purl.org/dc/elements/1.1/" +XMLNS_DFXML = "http://www.forensicswiki.org/wiki/Category:Digital_Forensics_XML" +XMLNS_DELTA = "http://www.forensicswiki.org/wiki/Forensic_Disk_Differencing" + +def isone(x): + """Return true if something is one (number or string)""" + try: + return int(x)==1; + except TypeError: + return False + +def safeInt(x): + """Return an integer or False. False is returned, rather than None, because you can + divide False by 3 but you can't divide None by 3. + + NOTE: This function could be written as: + + def safeInt(x): + return int(x) if x else False + + but that doesn't work on older version of Python.""" + if x: return int(x) + return False + +def timestamp2iso8601(ts): + import time + return time.strftime("%FT%TZ",time.gmtime(ts)) + +from datetime import tzinfo,timedelta +class GMTMIN(tzinfo): + def __init__(self,minoffset): # DST starts last Sunday in March + self.minoffset = minoffset + def utcoffset(self, dt): + return timedelta(minutes=self.minoffset) + def dst(self, dt): + return timedelta(0) + def tzname(self,dt): + return "GMT+%02d%02d" % (self.minoffset/60,self.minoffset%60) + +def parse_iso8601(ts): + Z = ts.find('Z') + if Z>0: + return datetime.datetime.strptime(ts[:Z],"%Y-%m-%dT%H:%M:%S") + raise RuntimeError("parse_iso8601: ISO8601 format {} not recognized".format(ts)) + + +rx_iso8601 = re.compile("(\d\d\d\d)-(\d\d)-(\d\d)[T ](\d\d):(\d\d):(\d\d)(\.\d+)?(Z|[-+]\d\d:?\d\d)?") +def iso8601Tdatetime(s): + """SLG's conversion of ISO8601 to datetime""" + m = rx_iso8601.search(s) + if not m: + raise ValueError("Cannot parse: "+s) + # Get the microseconds + try: + microseconds = int(float(m.group(7)) * 1000000) + except TypeError: + microseconds = 0 + # Figure tz offset + offset = None + minoffset = None + if m.group(8): + if m.group(8)=="Z": + minoffset = 0 + elif m.group(8)[0:1] in "-+": + minoffset = int(m.group(8)[0:3]) * 60 + int(m.group(8)[-2:]) + z = s.find("Z") + if z>=0: + offset = 0 + # Build the response + if minoffset: + return datetime.datetime(int(m.group(1)),int(m.group(2)),int(m.group(3)), + int(m.group(4)),int(m.group(5)),int(m.group(6)), + microseconds,GMTMIN(minoffset)) + elif offset: + return datetime.datetime(int(m.group(1)),int(m.group(2)),int(m.group(3)), + int(m.group(4)),int(m.group(5)),int(m.group(6)), + microseconds,GMTMIN(offset)) + else: + return datetime.datetime(int(m.group(1)),int(m.group(2)),int(m.group(3)), + int(m.group(4)),int(m.group(5)),int(m.group(6)), + microseconds) + +#This format is as specified in RFC 822, section 5.1, and matches the adjustments in RFC 1123, section 5.2.14. It appears in email and HTTP headers. +rx_rfc822datetime = re.compile("(?P\d{1,2}) (?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (?P\d{4}) (?P\d\d):(?P\d\d):(?P\d\d) (?PZ|[-+]\d\d:?\d\d)") +three_letter_month_dict = { + "Jan": 1, + "Feb": 2, + "Mar": 3, + "Apr": 4, + "May": 5, + "Jun": 6, + "Jul": 7, + "Aug": 8, + "Sep": 9, + "Oct": 10, + "Nov": 11, + "Dec": 12 +} +def rfc822Tdatetime(s): + """ + AJN's conversion of times occurring in RFC 822 data to datetime. + Follows SLG's pattern. + """ + m = rx_rfc822datetime.search(s) + if not m: + raise ValueError("Cannot parse as an RFC 822 timestamp: %r." 
% s) + mgd = m.groupdict() + # Figure tz offset + offset = None + minoffset = None + match_timezone = mgd.get("timezone") + if match_timezone: + if match_timezone == "Z": + minoffset = 0 + elif match_timezone[0] in "-+": + minoffset = int(match_timezone[0:-2]) * 60 + int(match_timezone[-2:]) + #TODO Find a reason to use the 'offset' variable? (Hour offset, vs. minute offset?) + if minoffset: + return datetime.datetime( + int(mgd["year"]), + three_letter_month_dict[mgd["month"]], + int(mgd["day"]), + int(mgd["hours"]), + int(mgd["minutes"]), + int(mgd["seconds"]), + 0, + GMTMIN(minoffset) + ) + else: + return datetime.datetime( + int(mgd["year"]), + three_letter_month_dict[mgd["month"]], + int(mgd["day"]), + int(mgd["hours"]), + int(mgd["minutes"]), + int(mgd["seconds"]), + 0 + ) + +################################################################ +### +### byte_run class +### + +class byte_run: + """The internal representation for a byte run. + + byte_runs have the following attributes: + .img_offset = offset of the byte run from the image start, in bytes + .len = the length of the run, in bytes (prevoiusly called 'bytes') + .sector_size = sector size of the underlying media + + Originally this was an array, + which is faster than an attributed object. But this approach is more expandable, + and it's only 70% the speed of an array under Python3.0. + + Note that Python 3 removed the __cmp__ class method: + + """ + # declaring slots prevents other attributes from appearing, + # but that prevents the code from working with new XML that has new fields. + # __slots__ = ["file_offset","img_offset","len","fill","sector_size"] + def __init__(self,img_offset=None,len=None,file_offset=None): + self.img_offset = img_offset + self.file_offset = file_offset + self.len = len + self.sector_size = 512 # default + self.hashdigest = dict() # + + def __lt__(self,other): + if self.img_offset is not None and other.img_offset is not None: + return self.img_offset < other.img_offset + elif self.file_offset is not None and other.file_offset is not None: + return self.file_offset < other.file_offset + else: + raise ValueError("Byte run objects are incomparable") + + def __eq__(self,other): + if self.img_offset is not None and other.img_offset is not None: + return self.img_offset == other.img_offset + elif self.file_offset is not None and other.file_offset is not None: + return self.file_offset == other.file_offset + else: + raise ValueError("Byte run objects are incomparable") + + def __str__(self): + try: + return "byte_run[img_offset={0}; file_offset={1} len={2}] ".format( + self.img_offset,self.file_offset,self.len) + except (AttributeError, TypeError): + #Catch attributes that are missing or mis-typed (e.g. NoneType) + pass + try: + return "byte_run[file_offset={0}; fill={1}; len={2}]".format( + self.file_offset,self.fill,self.len) + except AttributeError: + pass + try: + return "byte_run[file_offset={0}; uncompressed_len={1}]".format( + self.file_offset,self.uncompressed_len) + except AttributeError: + return "byte_run"+str(dir(self)) + + def start_sector(self): + return self.img_offset // self.sector_size + + def sector_count(self): + return self.len // self.sector_size + + def has_sector(self,s): + if self.sector_size==0: + raise ValueError("%s: sector_size cannot be 0" % (self)) + try: + return self.img_offset <= s * self.sector_size < self.img_offset+self.len + except AttributeError: + # Doesn't have necessary attributes to answer true. 
+ # Usually this happens with runs of a constant value + return False + + def extra_len(self): + return self.len % self.sector_size + + def decode_xml_attributes(self,attr): + for (key,value) in attr.items(): + try: + setattr(self,key,int(value)) + except ValueError: + setattr(self,key,value) + + + def decode_sax_attributes(self,attr): + for (key,value) in attr.items(): + if key=='bytes': key=='len' # tag changed name; provide backwards compatiability + try: + setattr(self,key,int(value)) + except ValueError: + setattr(self,key,value) + +class ComparableMixin(object): + """ + Comparator "Abstract" class. Classes inheriting this must define a _cmpkey() method. + + Credit to Lennart Regebro for the total implementation of this class, found equivalently from: + http://regebro.wordpress.com/2010/12/13/python-implementing-rich-comparison-the-correct-way/ + http://stackoverflow.com/questions/6907323/comparable-classes-in-python-3/6913420#6913420 + """ + def _compare(self, other, method): + try: + return method(self._cmpkey(), other._cmpkey()) + except (AttributeError, TypeError): + # _cmpkey not implemented, or return different type, + # so I can't compare with "other". + return NotImplemented + + def __lt__(self, other): + return self._compare(other, lambda s, o: s < o) + + def __le__(self, other): + return self._compare(other, lambda s, o: s <= o) + + def __eq__(self, other): + return self._compare(other, lambda s, o: s == o) + + def __ge__(self, other): + return self._compare(other, lambda s, o: s >= o) + + def __gt__(self, other): + return self._compare(other, lambda s, o: s > o) + + def __ne__(self, other): + return self._compare(other, lambda s, o: s != o) + +class dftime(ComparableMixin): + """Represents a DFXML time. Automatically converts between representations and caches the + results as necessary..""" + UTC = GMTMIN(0) + + def ts2datetime(self,ts): + import datetime + return datetime.datetime.utcfromtimestamp(ts).replace(tzinfo=dftime.UTC) + + def __init__(self,val): + #'unicode' is not a type in Python 3; 'basestring' is not a type in Python 2. + if sys.version_info >= (3,0): + _basestring = str + else: + _basestring = basestring + if isinstance(val, str) or isinstance(val,_basestring): + # + #Test for ISO 8601 format - "YYYY-MM-DD" should have hyphen at val[4] + if len(val)>5 and val[4]=="-": + self.iso8601_ = val + elif len(val) > 15 and ":" in val[13:15]: + #Maybe the data are instead the timestamp format found in email headers? + #(The check for 13:15 gets the 14th and 15th characters, since the day can be single- or double-digit.) + self.datetime_ = rfc822Tdatetime(val) + else: + #Maybe the data are a string-wrapped int or float? + #If this fails, the string format is completely unexpected, so just raise an error. 
+ self.timestamp_ = float(val) + elif type(val)==int or type(val)==float: + self.timestamp_ = val + elif isinstance(val, datetime.datetime): + self.datetime_ = val + #TODO Unit-test this with a timezone-less datetime + elif val==None: + self.timestamp_ = None + self.iso8601_ = None + elif isinstance(val, dftime): + #If we instead use .timestamp_, we risk having a timezone conversion error + self.iso8601_ = val.iso8601() + else: + raise ValueError("Unknown type '%s' for DFXML time value" % (str(type(val)))) + def __str__(self): + return self.iso8601() or "" + def __repr__(self): + return repr(self.iso8601()) or "None" + def __le__(self,b): + if b is None: return None + return self.iso8601().__le__(b.iso8601()) + def __gt__(self,b): + if b is None: return None + return self.iso8601().__gt__(b.iso8601()) + def _cmpkey(self): + """Provide a key to use for comparisons; for use with ComparableMixin parent class.""" + return self.timestamp() + + def __eq__(self,b): + if b == None: + #This will always be False - if self were None, we wouldn't be in this __eq__ method. + return False + return self.timestamp()==b.timestamp() + + def iso8601(self): + # Do we have a cached representation? + import time + try: + return self.iso8601_ + except AttributeError: + pass + + # Do we have a datetime representation? + try: + self.iso8601_ = self.datetime_.isoformat() + return self.iso8601_ + except AttributeError: + # We better have a Unix timestamp representation? + self.iso8601_ = time.strftime("%Y-%m-%dT%H:%M:%SZ",time.gmtime(self.timestamp_)) + return self.iso8601_ + + def timestamp(self): + import time + # Do we have a cached representation? + try: + return self.timestamp_ + except AttributeError: + pass + + # Do we have a datetime_ object? + try: + self.timestamp_ = time.mktime(self.datetime_.timetuple()) + return self.timestamp_ + except AttributeError: + self.datetime_ = iso8601Tdatetime(self.iso8601_) + self.timestamp_ = time.mktime(self.datetime_.timetuple()) + return self.timestamp_ + + def datetime(self): + import datetime + # return the datetime from parsing either iso8601 or from parsing timestamp + try: + self.datetime_ = self.ts2datetime(self.timestamp_) + # This needs to be in UTC offset. How annoying. + return self.datetime_ + except AttributeError: + self.datetime_ = iso8601Tdatetime(self.iso8601_) + return self.datetime_ + +class registry_object: + def __init__(self): + self.object_index = {} + self._mtime = None + + """Keep handy a handle on the registry object""" + self.registry_handle = self + + def mtime(self): + return self._mtime + +class registry_cell_object: + def __init__(self): + self._byte_runs = [] + + """This is a pointer to a registry_key_object. The root node has no parent key.""" + self.parent_key = None + + self._name = None + + self._full_path = None + + """Keys have two types: "root" (0x2c,0xac) and not-root. Values have several more types.""" + self._type = None + + """Keep handy a handle on the registry object""" + self.registry_handle = None + + """Name the cell type, for str() and repr().""" + self._cell_type = "(undefined cell object type)" + + """Only applicable to values.""" + self._sha1 = None + + def name(self): + """This is the name of the present key or value.""" + return self._name + + def full_path(self): + """ + This is the full path from the root of the hive, with keys acting like directories and the value name acting like the basename. 
+ + Unlike DFXML, registry paths are delimited with a backslash due to the forward slash being a legal and commonly observed character in cell names. + """ + return self._full_path + + def type(self): + """ + This is the data type of the cell. Keys can be root or not-root; values have several types, like UTF-8, binary, etc. + Presently, this exports as a string representation of the type, not the numeric type code. + """ + return self._type + + def _myname(self): + """This function is called by repr and str, due to (vague memories of) the possibility of an infinite loop if __repr__ calls __self__.""" + if len(self._byte_runs) > 0: + addr = str(self._byte_runs[0].file_offset) + else: + addr = "(unknown)" + return "".join(["<", self._cell_type, " for hive file offset ", addr, ">"]) + + def __repr__(self): + return self._myname() + def __str__(self): + return self._myname() + + def mtime(self): + raise NotImplementedError("registry_cell_object.mtime() not over-ridden!") + + def byte_runs(self): + """Returns a sorted array of byte_run objects.""" + #If this idiom is confusing, see: http://henry.precheur.org/python/copy_list + ret = list(self._byte_runs) + return ret + + def sha1(self): + """ + Return None. Meant to be overwritten. + """ + return None + + def md5(self): + """ + Return None. Meant to be overwritten. + """ + return None + +class registry_key_object(registry_cell_object): + def __init__(self): + registry_cell_object.__init__(self) + self._mtime = None + self.values = {} + self.used = True #TODO Add toggling logic for when hivexml (eventually) processes recovered keys + self._cell_type = "registry_key_object" + def mtime(self): + return self._mtime + def root(self): + if self.type() is None: + return None + return self.type() == "root" + +class registry_value_object(registry_cell_object): + def __init__(self): + registry_cell_object.__init__(self) + self.value_data = None + + self._cell_type = "registry_value_object" + + #TODO Replace to be in line with fileobjects: fileobject.hashdigest is a dictionary + self._hashcache = dict() + + """List for the string-list type of value.""" + self.strings = None + + def mtime(self): + """Return nothing. Alternatively, we might return mtime of parent key in the future.""" + return None + # if self.parent_key: + # return self.parent_key.mtime() + # else: + # return None + + def _hash(self, hashfunc): + """ + Return cached hash, populating cache if necessary. + hashfunc expected values: The functions hashlib.sha1, hashlib.md5. + If self.value_data is None, or there are no strings in a "string-list" type, this should return None. + Interpretation: Registry values of type "string-list" are hashed by feeding each element of the list into the hash .update() function. All other Registry values are fed in the same way, as a 1-element list. + For example, a string type value cell with data "a" fed into this function returns md5("a") (if hashlib.md5 were requested). A string-list type value cell with data ["a","b"] returns md5("ab"). + This is a simplification to deal with Registry string encodings, and may change in the future. + """ + if self._hashcache.get(repr(hashfunc)) is None: + feed_list = [] + if self.type() == "string-list": + feed_list = self.strings + elif not self.value_data is None: + feed_list.append(self.value_data) + #Normalize to hash .update() required type + for (elemindex, elem) in enumerate(feed_list): + if type(elem) == type(""): + #String data take a little extra care: + #"The bytes in your ... 
file are being automatically decoded to Unicode by Python 3 as you read from the file" + #http://stackoverflow.com/a/7778340/1207160 + feed_list[elemindex] = elem.encode("utf-8") + #Hash if there's data to hash + if len(feed_list) > 0: + h = hashfunc() + for elem in feed_list: + h.update(elem) + self._hashcache[repr(hashfunc)] = h.hexdigest() + return self._hashcache.get(repr(hashfunc)) + + def sha1(self): + return self._hash(hashlib.sha1) + + def md5(self): + return self._hash(hashlib.md5) + +class fileobject: + """The base class for file objects created either through XML DOM or EXPAT""" + TIMETAGLIST=['atime','mtime','ctime','dtime','crtime'] + + def __init__(self,imagefile=None): + self.imagefile = imagefile + self.hashdigest = dict() + + def __str__(self): + try: + fn = self.filename() + except KeyError: + fn = "???" + return "fileobject %s byte_runs: %s" % (fn, " ".join([str(x) for x in self.byte_runs()])) + + def partition(self): + """Partion number of the file""" + return self.tag("partition") + + def filename(self): + """Complement name of the file (sometimes called pathname)""" + return self.tag("filename") + + def ext(self): + """Extension, as a lowercase string without the leading '.'""" + import string + (base,ext) = os.path.splitext(self.filename()) + if ext == '': + return None + else: + return ext[1:] + + def filesize(self): + """Size of the file, in bytes""" + return safeInt(self.tag("filesize")) + + def uid(self): + """UID of the file""" + return safeInt(self.tag("uid")) + + def gid(self): + """GID of the file""" + return safeInt(self.tag("gid")) + + def meta_type(self): + """Meta-type of the file""" + return safeInt(self.tag("meta_type")) + + def mode(self): + """Mode of the file""" + return safeInt(self.tag("mode")) + + def ctime(self): + """Metadata Change Time (sometimes Creation Time), as number of seconds + since January 1, 1970 (Unix time)""" + t = self.tag("ctime") + if t: return dftime(t) + return None + + def atime(self): + """Access time, as number of seconds since January 1, 1970 (Unix time)""" + t = self.tag("atime") + if t: return dftime(t) + return None + + def crtime(self): + """CR time, as number of seconds since January 1, 1970 (Unix time)""" + t = self.tag("crtime") + if t: return dftime(t) + return None + + def mtime(self): + """Modify time, as number of seconds since January 1, 1970 (Unix time)""" + t = self.tag("mtime") + if t: return dftime(t) + return None + + def dtime(self): + """ext2 dtime""" + t = self.tag("dtime") + if t: return dftime(t) + return None + + def times(self): + """Return a dictionary of all times that the system has""" + ret = {} + for tag in self.TIMETAGLIST: + if self.has_tag(tag): + try: + ret[tag] = dftime(self.tag(tag)) + except TypeError: + pass + return ret + + def sha1(self): + """Returns the SHA1 in hex""" + return self.tag("sha1") + + def md5(self): + """Returns the MD5 in hex""" + return self.tag("md5") + + def fragments(self): + """Returns number of file fragments""" + return len(self.byte_runs()) + + def name_type(self): + """Return the contents of the name_type tag""" + return self.tag("name_type") + + def is_virtual(self): + """Returns true if the fi entry is a TSK virtual entry""" + return self.filename() in tsk_virtual_filenames + + def is_dir(self): + """Returns true if file is a directory""" + return self.name_type()=='d' + + def is_file(self): + """Returns true if file is a file""" + return self.name_type()=='r' or self.name_type()==None + + def inode(self): + """Inode; may be a number or SleuthKit x-y-z 
format""" + return self.tag("inode") + + def allocated_inode(self): + """Returns True if the file's inode data structure is allocated, False otherwise. (Does not return None.)""" + return isone(self.tag("alloc_inode")) + + def allocated_name(self): + """Returns True if the file's name data structure is allocated, False otherwise. (Does not return None.)""" + return isone(self.tag("alloc_name")) + + def allocated(self): + """Returns True if the file is allocated, False if it was not + (that is, if it was deleted or is an orphan). + Note that we need to be tolerant of mixed case, as it was changed. + We also need to tolerate the case of the unalloc tag being used. + """ + if self.filename()=="$OrphanFiles": return False + if self.allocated_inode() and self.allocated_name(): + return True + else: + return isone(self.tag("alloc")) or isone(self.tag("ALLOC")) or not isone(self.tag("unalloc")) + + def compressed(self): + if not self.has_tag("compressed") and not self.has_tag("compressed") : return False + return isone(self.tag("compressed")) or isone(self.tag("COMPRESSED")) + + def encrypted(self): + if not self.has_tag("encrypted") and not self.has_tag("encrypted") : return False + return isone(self.tag("encrypted")) or isone(self.tag("ENCRYPTED")) + + def file_present(self,imagefile=None): + """Returns true if the file is present in the disk image""" + if self.filesize()==0: + return False # empty files are never present + if imagefile==None: + imagefile=self.imagefile # use this one + for hashname in ['md5','sha1']: + oldhash = self.tag(hashname) + if oldhash: + newhash = hashlib.new(hashname,self.contents(imagefile=imagefile)).hexdigest() + return oldhash==newhash + raise ValueError("Cannot process file "+self.filename()+": no hash in "+str(self)) + + def has_contents(self): + """True if the file has one or more bytes""" + return len(self.byte_runs())>0 + + def has_sector(self,s): + """True if sector s is contained in one of the byte_runs.""" + for run in self.byte_runs(): + if run.has_sector(s): return True + return False + + def libmagic(self): + """Returns libmagic string if the string is specified + in the xml, or None otherwise""" + return self.tag("libmagic") + + def content_for_run(self,run=None,imagefile=None): + """ Returns the content for a specific run. This is a convenience feature + which does not touch the file object if an imagefile is provided.""" + if imagefile is None: imagefile=self.imagefile + if run is None: raise ValueError("content_for_run called without a 'run' argument.") + + if run.len == -1: + return chr(0) * run.len + elif hasattr(run,'fill'): + return chr(run.fill) * run.len + else: + imagefile.seek(run.img_offset) + return imagefile.read(run.len) + + def contents(self,imagefile=None,icat_fallback=True): + """ Returns the contents of all the runs concatenated together. For allocated files + this should be the original file contents. 
""" + if imagefile is None : imagefile=self.imagefile + if imagefile is None : raise ValueError("imagefile is unknown") + if self.encrypted() : raise ValueError("Cannot generate content for encrypted files") + if self.compressed() or imagefile.name.endswith(".aff") or imagefile.name.endswith(".E01"): + if icat_fallback: + # + # For now, compressed files rely on icat rather than python interface + # + offset = safeInt(self.volume.offset) + block_size = safeInt(self.volume.block_size) + if block_size==0: block_size = 512 + inode = self.inode() + if inode : + block_size = 512 + fstype_flag = "" + fstype = self.volume.ftype_str() + if fstype != None: + fstype_flag = '-f' + fstype + cmd = ['icat',fstype_flag,'-b',str(block_size),'-o',str(offset//block_size),imagefile.name,str(inode)] + else: + cmd = ['icat','-b',str(block_size),'-o',str(offset//block_size),imagefile.name,str(inode)] + (data,err) = Popen(cmd, stdout=PIPE,stderr=PIPE).communicate() + # Check for an error + if len(err) > 0 : + #sys.stderr.write("Debug: type(err) = %r.\n" % type(err)) + raise ValueError("icat error (" + str(err).strip() + "): "+" ".join(cmd)) + return data + else : + raise ValueError("Inode missing from file in compressed format.") + raise ValueError("Cannot read raw bytes in compressed disk image") + res = [] + for run in self.byte_runs(): + res.append(self.content_for_run(run=run,imagefile=imagefile)) + return "".join(res) + + def tempfile(self,calcMD5=False,calcSHA1=False): + """Return the contents of imagefile in a named temporary file. If + calcMD5 or calcSHA1 are set TRUE, then the object returned has a + haslib object as self.md5 or self.sha1 with the requested hash.""" + import tempfile + tf = tempfile.NamedTemporaryFile() + if calcMD5: tf.md5 = hashlib.md5() + if calcSHA1: tf.sha1 = hashlib.sha1() + for run in self.byte_runs(): + self.imagefile.seek(run.img_offset) + count = run.len + while count>0: + xfer_len = min(count,1024*1024) # transfer up to a megabyte at a time + buf = self.imagefile.read(xfer_len) + if len(buf)==0: break + tf.write(buf) + if calcMD5: tf.md5.update(buf) + if calcSHA1: tf.sha1.update(buf) + count -= xfer_len + tf.flush() + return tf + + def savefile(self,filename=None): + """Saves the file.""" + with open(filename,"wb") as f: + for run in self.byte_runs(): + self.imagefile.seek(run.img_offset) + count = run.len + while count>0: + xfer_len = min(count,1024*1024) # transfer up to a megabyte at a time + buf = self.imagefile.read(xfer_len) + if len(buf)==0: break + f.write(buf) + count -= xfer_len + + + def frag_start_sector(self,fragment): + return self.byte_runs()[fragment].img_offset / 512 + + def name_type(self): + return self.tag("name_type") + +class fileobject_dom(fileobject): + """file objects created through the DOM. Each object has the XML document + stored in the .doc attribute.""" + def __init__(self,xmldoc,imagefile=None): + fileobject.__init__(self,imagefile=imagefile) + self.doc = xmldoc + + def tag(self,name): + """Returns the wholeText for any given NAME. 
Raises KeyError + if the NAME does not exist.""" + try: + return self.doc.getElementsByTagName(name)[0].firstChild.wholeText + except IndexError: + # Check for a hash tag with legacy API + if name in ['md5','sha1','sha256']: + for e in self.doc.getElementsByTagName('hashdigest'): + if e.getAttribute('type').lower()==name: + return e.firstChild.wholeText + raise KeyError(name+" not in XML") + + def has_tag(self,name) : + try: + temp=self.doc.getElementsByTagName(name)[0].firstChild.wholeText + return True + except IndexError: + # Check for a hash tag with legacy API + if name in ['md5','sha1','sha256']: + for e in self.doc.getElementsByTagName('hashdigest'): + if e.getAttribute('type').lower()==name: + return True + return False + + def byte_runs(self): + """Returns a sorted array of byte_run objects. + """ + ret = [] + try: + for run in self.doc.getElementsByTagName("byte_runs")[0].childNodes: + b = byte_run() + if run.nodeType==run.ELEMENT_NODE: + b.decode_xml_attributes(run.attributes) + ret.append(b) + except IndexError: + pass + ret.sort(key=lambda r:r.file_offset) + return ret + +class saxobject: + # saxobject is a mix-in that makes it easy to turn XML tags into functions. + # If the sax tag is registered, then a function with the tag's name is created. + # Calling the function returns the value for the tag that is stored in the _tags{} + # dictionary. The _tags{} dictionary is filled by the _end_element() method that is defined. + # For fileobjects all tags are remembered. + def __init__(self): + self._tags = {} + def tag(self,name): + """Returns the XML text for a given NAME.""" + return self._tags.get(name,None) + def has_tag(self,name) : return name in self._tags + +def register_sax_tag(tagclass,name): + setattr(tagclass,name,lambda self:self.tag(name)) + + +class fileobject_sax(fileobject,saxobject): + """file objects created through expat. This class is created with a tags array and a set of byte runs.""" + def __init__(self,imagefile=None,xml=None): + fileobject.__init__(self,imagefile=imagefile) + saxobject.__init__(self) + self._byte_runs = [] + def byte_runs(self): + """Returns an array of byte_run objects.""" + return self._byte_runs + + +class volumeobject_sax(saxobject): + """A class that represents the volume.""" + def __init__(self): + if hasattr(saxobject, "__init__"): + saxobject.__init__(self) + self.offset = 0 + self.block_size = 0 + + def __str__(self): + return "volume "+(str(self._tags)) + + def partition_offset(self): + try: + return self.tag('partition_offset') + except KeyError: + return self.tag('Partition_Offset') + +register_sax_tag(volumeobject_sax,'ftype') +register_sax_tag(volumeobject_sax,'ftype_str') +register_sax_tag(volumeobject_sax,'block_count') +register_sax_tag(volumeobject_sax,'first_block') +register_sax_tag(volumeobject_sax,'last_block') + +class imageobject_sax(saxobject): + """A class that represents the disk image""" +register_sax_tag(imageobject_sax,'imagesize') +register_sax_tag(imageobject_sax,'image_filename') + + +class creatorobject_sax(saxobject): + """A class that represents the section of a DFXML file""" +for tag in ['creator','program','version']: + register_sax_tag(creatorobject_sax,tag) + +################################################################ + +################################################################ + +def safe_b64decode(b64data): + """ + This function takes care of the logistics of base64 decoding XML data in Python 2 and 3. + Recall that Python3 requires b64decode operate on bytes, not a string. 
+    Ref:
+    A forum post that noted several encoding differences between Python 2 and 3:
+
+    """
+    if sys.version_info.major == 2:
+        return base64.b64decode(b64data).decode("unicode_escape")
+    elif sys.version_info.major == 3:
+        dtype = str(type(b64data))
+        to_decode = None
+        if dtype == "<class 'str'>":
+            to_decode = b64data.encode("ascii")
+        elif dtype == "<class 'bytes'>":
+            to_decode = b64data
+        return base64.b64decode(to_decode).decode("unicode_escape")
+    else:
+        raise Exception("Not sure how to parse base64 data outside Python versions 2 or 3.")
+
+class xml_reader:
+    def __init__(self):
+        self.cdata = None
+        self.tagstack = ['xml']
+
+    def _char_data(self, data):
+        """Handles XML data"""
+        if self.cdata != None:
+            self.cdata += data
+
+    def process_xml_stream(self,xml_stream,callback,preserve_fis=False):
+        "Run the reader on a given XML input stream"
+        self.callback = callback
+        self.preserve_fis = preserve_fis
+        self.fi_history = []
+        import xml.parsers.expat
+        p = xml.parsers.expat.ParserCreate()
+        p.StartElementHandler = self._start_element
+        p.EndElementHandler = self._end_element
+        p.CharacterDataHandler = self._char_data
+        p.ParseFile(xml_stream)
+
+class regxml_reader(xml_reader):
+    def __init__(self,flags=None):
+        self.flags = flags
+        xml_reader.__init__(self) #TODO wait, shouldn't flags go in here?
+        self.objectstack = []
+        self.registry_object = None
+        self.nonce = 0
+
+    def _start_element(self, name, attrs):
+        """
+        The objectstack conditionally grows, depending on type of element processed
+        * msregistry (hive): Create a new msregistry object, append to objectstack
+        * key (node): Create a new key object, append to objectstack
+        * mtime: The text is going to become a property of the parent element; do not append to objectstack.
+        * value: Create a new value object, append to objectstack.
+        """
+        new_object = None
+        if name in ["msregistry","hive"]:
+            new_object = registry_object()
+            self.objectstack.append(new_object)
+            self.registry_object = new_object
+        elif name in ["key","node"]:
+            new_object = registry_key_object()
+
+            #Note these two tests for root and parent _are_ supposed to be independent tests.
+            if attrs.get("root",None) == "1":
+                new_object._type = "root"
+            else:
+                new_object._type = ""
+
+            if len(self.objectstack) > 1:
+                new_object.parent_key = self.objectstack[-1]
+
+            #Sanity check: root key implies no parent
+            if new_object.type() == "root":
+                assert new_object.parent_key == None
+            #Sanity check: no parent implies root key --OR-- recovered key
+            if new_object.parent_key == None:
+                assert new_object.used == False or new_object.type() == "root"
+
+            #Define new_object.name
+            #Force a name for keys. If the key has no recorded name, apply artificial name prefix to nonce.
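+            #(The nonce is a per-reader counter that is incremented after each use, so every anonymous key still gets a unique name and therefore a unique full_path.)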
+ name_data = attrs.get("name") + if name_data == None: + new_object._name = "__DFXML_NONCE_" + str(self.nonce) + self.nonce += 1 + else: + enc = attrs.get("name_encoding") + if enc == "base64": + new_object._name = safe_b64decode(name_data) + else: + new_object._name = name_data + + if new_object.parent_key == None: + new_object._full_path = "\\" + new_object.name() + # TODO need a name scheme for orphan references, when we start processing orphans + else: + new_object._full_path = new_object.parent_key.full_path() + "\\" + new_object.name() + self.objectstack.append(new_object) + elif name in ["value"]: + new_object = registry_value_object() + new_object.parent_key = self.objectstack[-1] + new_object._type = attrs.get("type",None) + + if new_object.type() == "string-list": + new_object.strings = [] + + #Store decoded name + if attrs.get("default",None) == "1": + new_object._name = "Default" + if attrs.get("name",attrs.get("key",None)) is not None: + #TODO Notify: concurrently set name attribute and default-name flag + pass + else: + enc = attrs.get("name_encoding",attrs.get("key_encoding")) + name_data = attrs.get("name",attrs.get("key",None)) + if enc == "base64": + try: + new_object._name = base64.b64decode(name_data.encode("ascii")).decode("unicode_escape") + except: + sys.stderr.write("name_data={} type={}\n".format(name_data,type(name_data))) + raise + else: + new_object._name = name_data + new_object._full_path = new_object.parent_key.full_path() + "\\" + new_object.name() + + #Store decoded value + new_object.value_data = self.decoded_value(attrs) + self.objectstack.append(new_object) + elif name in ["mtime"]: + self.cdata = "" + elif name in ["string"]: + self.cdata = "" + elif name in ["byte_runs"]: + pass + elif name in ["byte_run"]: + parent = self.objectstack[-1] + parent._byte_runs.append(byte_run(file_offset=attrs.get("file_offset"), len=attrs.get("len"))) + else: + raise ValueError("regxml_reader._start_element: Don't know how to start element %s.\n" % name) + #Give all cell objects a handle on the registry + if new_object != None: + new_object.registry_handle = self.registry_object + + def decoded_value(self, attrs): + value_data = attrs.get("value",None) + if value_data: + # TODO adjust hivexml to not use a plain "encoding" attribute + value_encoding = attrs.get("encoding", attrs.get("value_encoding")) + if value_encoding == "base64": + if sys.version_info.major>2: + value_data = bytes(value_data,encoding='ascii') + return base64.b64decode(value_data) + else: + return value_data + else: + return None + + def _end_element(self, name): + """ + The callback is invoked for each stack-popping operation, except the root. 
+ """ + # TODO sanity-check the objectstack + if name in ["msregistry","hive"]: + pass + elif name in ["key","node"]: + finished_object = self.objectstack.pop() + #Add finished object to object index + if finished_object.full_path() in self.registry_object.object_index: + raise ValueError("regxml_reader._end_element: Same key path found more than once: " + + finished_object.full_path()) + self.registry_object.object_index[finished_object.full_path()] = finished_object + self.callback(finished_object) + elif name in ["mtime"]: + self.objectstack[-1]._mtime = dftime(self.cdata) + self.cdata = None + elif name in ["value"]: + finished_object = self.objectstack.pop() + #TODO Simplify once hivexml is patched to have value/@value instead of value/[cdata] + if finished_object.value_data == None: + finished_object.value_data = self.cdata + self.callback(finished_object) + elif name in ["string"]: + value_object = self.objectstack[-1] + if value_object.strings == None: + raise ValueError("regxml_reader._end_element: parsing error, string element found, but parent's type can't support a string list.") + value_object.strings.append(self.cdata) + self.cdata = None + elif name in ["byte_runs","byte_run"]: + pass + else: + raise ValueError("regxml_reader._end_element: Don't know how to end element %s.\n" % name) + +class fileobject_reader(xml_reader): + """Class which uses the SAX expat-based XML reader. + Reads an FIWALK XML input file and automatically creates + volumeobject_sax and fileobject_sax objects, but just returns the filoeobject + objects..""" + def __init__(self,imagefile=None,flags=None): + self.creator = None + self.volumeobject = None + self.fileobject = None + self.imageobject = imageobject_sax() + self.imagefile = imagefile + self.flags = flags + self._sax_fi_pointer = None + xml_reader.__init__(self) + + @property + def _sax_fi_pointer(self): + """ + This internal field of a fileobject_reader is a simple state machine. A DFXML stream can contain fileobjects which contain original_fileobjects, which require the same parsing mechanisms. This pointer saves on duplicating code with the SAX parser. + + Type: None, or dfxml.fileobject. Type enforced by the setter method. 
+ """ + return self._sax_fi_pointer_ + @_sax_fi_pointer.setter + def _sax_fi_pointer(self, val): + if val is None: + self._sax_fi_pointer_ = None + else: + assert isinstance(val, fileobject) + self._sax_fi_pointer_ = val + + def _start_element(self, name, attrs): + """ Handles the start of an element for the XPAT scanner""" + _logger.debug("fileobject_reader._start_element: name = %r" % name) + self.tagstack.append(name) + self.cdata = "" # new element, so reset the data + if name=="volume": + self.volumeobject = volumeobject_sax() + self.volumeobject.block_size = 512 # reasonable default + self.volumeobject.image = self.imageobject + if "offset" in attrs: + self.volumeobject.offset = int(attrs["offset"]) + return + if name=="block_size": + pass + if name=="fileobject": + self.fileobject = fileobject_sax(imagefile=self.imagefile) + self.fileobject.volume = self.volumeobject + self._sax_fi_pointer = self.fileobject + return + if name=="original_fileobject": + self.fileobject.original_fileobject = fileobject_sax(imagefile=self.imagefile) + #self.original_fileobject.volume = self.volumeobject #TODO + self._sax_fi_pointer = self.fileobject.original_fileobject + return + if name=='hashdigest': + self.hashdigest_type = attrs['type'] + if self.fileobject and (name=="run" or name=="byte_run"): + b = byte_run() + b.decode_sax_attributes(attrs) + self.fileobject._byte_runs.append(b) + return + + + def _end_element(self, name): + """Handles the end of an element for the XPAT scanner""" + assert(self.tagstack.pop()==name) # make sure that the stack matches + if name=="volume": + self.volumeobject = None + return + if name=="block_size" and len(self.tagstack) > 1 : + if self.tagstack[-1] == "volume" : + self.volumeobject.block_size = int(self.cdata) + self.cdata=None + return + if name=="fileobject": + self.callback(self.fileobject) + if self.preserve_fis: + self.fi_history.append(self.fileobject) + self.fileobject = None + return + if name=="original_fileobject": + self._sax_fi_pointer = self.fileobject + return + if name=='hashdigest' and len(self.tagstack)>0: + top = self.tagstack[-1] # what the hash was for + alg = self.hashdigest_type.lower() # name of the hash algorithm used + if top=='byte_run': + self._sax_fi_pointer._byte_runs[-1].hashdigest[alg] = self.cdata + if top in ["fileobject", "original_fileobject"]: + self._sax_fi_pointer._tags[alg] = self.cdata # legacy + self._sax_fi_pointer.hashdigest[alg] = self.cdata + self.cdata = None + return + + if self._sax_fi_pointer: # in file objects, all tags are remembered + self._sax_fi_pointer._tags[name] = self.cdata + self.cdata = None + return + # Special case: fn + # gets put in fn + if name in ['image_filename','imagefile'] and self.tagstack[-1]=='source': + self.imageobject._tags['image_filename'] = self.cdata + +class volumeobject_reader(xml_reader): + """Reads just the section of a DFXML file""" + def __init__(self): + self.volumeobject = False + xml_reader.__init__(self) + self.imageobject = imageobject_sax() + + def _start_element(self, name, attrs): + """ Handles the start of an element for the XPAT scanner""" + self.tagstack.append(name) + if name=="volume": + self.volumeobject = volumeobject_sax() + self.volumeobject.image = self.imageobject + return + if name=="fileobject": + self.cdata = None # don't record this + return + self.cdata = "" # new element; otherwise data is ignored + + def _end_element(self, name): + """Handles the end of an eleement for the XPAT scanner""" + assert(self.tagstack.pop()==name) + if name=="volume": + 
self.callback(self.volumeobject) + self.volumeobject = None + return + if self.tagstack[-1]=='volume' and self.volumeobject: # in the volume + self.volumeobject._tags[name] = self.cdata + self.cdata = None + return + if self.tagstack[-1] in ['fiwalk','dfxml']: + self.imageobject._tags[name] = self.cdata + return + + # Special case: fn gets put in fn + if name in ['image_filename','imagefile'] and self.tagstack[-1]=='source': + self.imageobject._tags['image_filename'] = self.cdata + return + + +class FinishedReadingCreator(Exception): + """Class to indicate that creator object has been read""" + +class creatorobject_reader(xml_reader): + """Reads the section of a DFXML file""" + def __init__(self): + self.creatorobject = False + xml_reader.__init__(self) + + def _start_element(self, name, attrs): + """ Handles the start of an element for the XPAT scanner""" + self.tagstack.append(name) + if name=="creator": + self.creatorobject = creatorobject_sax() + return + if self.creatorobject: + self.cdata = "" # capture cdata for creatorobject + + def _end_element(self, name): + """Handles the end of an eleement for the XPAT scanner""" + assert(self.tagstack.pop()==name) + if name=="creator": + self.callback(self.creatorobject) + self.creatorobject = None + raise FinishedReadingCreator("Done") + if self.tagstack[-1]=='creator' and self.creatorobject: # in the creator + self.creatorobject._tags[name] = self.cdata + self.cdata = None + return + + +def combine_runs(runs): + """Given an array of bytrun elements, combine the runs and return a new array.""" + if runs==[]: return [] + ret = [runs[0]] + for run in runs[1:]: + # if the last one ends where this run begins, just extend + # otherwise append + last = ret[-1] + if last.img_offset+last.len == run.img_offset: + ret[-1] = byte_run(img_offset = last.img_offset, + len = last.len + run.len) + continue + else: + ret.append(run) + return ret + +class extentdb: + """A class to a database of extents and report if they collide. + Currently this is not an efficient implementation, but it could become + more efficient in the future. When it does, every program that uses + this implementation will get faster too! 
Each extent is represented + as a byte_run object""" + def __init__(self,sectorsize=512): + self.db = [] # the database of runs + self.sectorsize = 512 + pass + + def report(self,f): + """Print information about the database""" + f.write("sectorsize: %d\n" % self.sectorsize) + for run in sorted(self.db): + f.write(" [@%8d ; %8d]\n" % (run.img_offset,run.len)) + f.write("total entries in database: %d\n\n" % len(r)) + + def sectors_for_bytes(self,count): + """Returns the number of sectors necessary to hold COUNT bytes""" + return (count+self.sectorsize-1)//self.sectorsize + + def sectors_for_run(self,run): + """Returns an array of the sectors for a given run""" + start_sector = run.img_offset/self.sectorsize + sector_count = self.sectors_for_bytes(run.len) + return range(start_sector,start_sector+sector_count) + + def run_for_sector(self,sector_number,count=1): + """Returns the run for a specified sector, and optionally a count of sectors""" + return byte_run(len=count*self.sectorsize,img_offset=sector_number * self.sectorsize) + + def intersects(self,extent): + """Returns the intersecting extent, or None if there is none""" + if extent.len==0: return True # 0 length intersects with everything + if extent.len<0: raise ValueError("Length cannot be negative:"+str(extent)) + start = extent.img_offset + stop = extent.img_offset+extent.len + for d in self.db: + if d.img_offset <= start < d.img_offset+d.len: return d + if d.img_offset < stop < d.img_offset+d.len: return d + if startdb + warn = "" + if result != want: + warn = " (!)" + print("a=%s b=%s want=%s greater=%s%s" % (da,db,want,result,warn)) + + if options.regress: + print("Testing unicode value parsing.") + #Test base64 encoding of the "Registered" symbol, encountered in a key name in the M57-Patents corpus. + test_unicode_string = "\xae" + if sys.version_info.major == 2: + #The test string doesn't quite get defined right that way in Python 2 + test_unicode_string = unicode(test_unicode_string, encoding="latin-1") + test_unicode_string_escaped = test_unicode_string.encode("unicode_escape") + test_base64_bytes = base64.b64encode(test_unicode_string_escaped) + elif sys.version_info.major == 3: + test_unicode_string_escaped = test_unicode_string.encode("unicode_escape") + test_base64_bytes = base64.b64encode(test_unicode_string_escaped) + else: + #Just hard-code value, no examples yet for this language version. + test_base64_bytes = b'XHhhZQ==' + test_base64_string = test_base64_bytes.decode("ascii") + #test_base64_string is the kind of string data you'd expect to encounter in base64-encoded values processing RegXML. 
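+        #Both the bytes form and the str form of the base64 data should decode back to the original one-character string.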
+ assert test_unicode_string == safe_b64decode(test_base64_bytes) + assert test_unicode_string == safe_b64decode(test_base64_string) + print("Unicode value parsing good!") + print("Testing time string parsing") + test_rfc822tdatetime = rfc822Tdatetime("26 Jun 2012 22:34:58 -0700") + assert test_rfc822tdatetime.tzinfo is not None + print("Time string parsing good!") + print("Testing dftime values") + #check_equal("1900-01-02T02:03:04Z",-2208895016,True) #AJN time.mktime doesn't seem to support old times any more + a_pacific_dftime = dftime("26 Jun 2012 22:34:58 -0700") + assert 0.0 == dftime(a_pacific_dftime.iso8601()).timestamp() - a_pacific_dftime.timestamp() + check_equal("2000-01-02T02:03:04Z","2000-01-02T03:03:04-0100",False) + check_equal("2000-01-02T02:03:04-0100","2000-01-02T02:03:04-0100",True) + check_equal("2000-01-02T02:03:04-0100","2000-01-02T02:03:04-0200",False) + check_equal("2000-01-02T02:03:04-0100","2000-01-02T01:03:04-0200",True) + check_greater("2000-01-02T04:04:05-0100","2000-01-02T03:04:05-0100",True) + check_greater("2000-01-02T03:04:05-0200","2000-01-02T03:04:05-0100",True) + check_greater("2009-11-17T00:33:30.9375Z","2009-11-17T00:33:30Z",True) + check_equal("2009-11-17T00:33:30.9375Z","2009-11-17T00:33:30Z",False) + check_equal("2009-11-17T00:33:30.0000Z","2009-11-17T00:33:30Z",True) + check_equal("27 Jun 2012 06:02:00 -0000","27 Jun 2012 05:02:00 -0100",True) + check_equal("27 Jun 2012 06:02:00 -0000","2012-06-27T06:02:00Z",True) + check_equal("26 Jun 2012 22:34:58 -0700","2012-06-27T05:34:58Z", True) + print("dftime values passed.") + print("Testing byte_run overlap engine:") + db = extentdb() + a = byte_run(img_offset=0,len=5) + db.add(a) + b = byte_run(5,5) + db.add(b) + try: + assert db.intersects(byte_run(0,5))==byte_run(0,5) + except: + print(type(cmp)) + print(db.intersects(byte_run(0,5))) + print(byte_run(0,5)) + raise + assert db.intersects(byte_run(0,1)) + assert db.intersects(byte_run(2,3)) + assert db.intersects(byte_run(4,1)) + assert db.intersects(byte_run(5,1)) + assert db.intersects(byte_run(6,1)) + assert db.intersects(byte_run(9,1)) + assert db.intersects(byte_run(-1,5)) + assert db.intersects(byte_run(-1,10)) + assert db.intersects(byte_run(-1,11)) + assert db.intersects(byte_run(-1,1))==None + assert db.intersects(byte_run(10,1))==None + print("Overlap engine good!") + assert re.sub(rx_xmlns, "", """""") == "" + assert re.sub(rx_xmlns, "", """""") == "" + assert re.sub(rx_xmlns, "", """""") == """""" + assert re.sub(rx_xmlns, "", """""") == """""" + print("XML namespace regex good!") diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/Makefile tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/Makefile --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/Makefile 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/Makefile 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,35 @@ + +SHELL = /bin/bash + +PYTHON2 = python2.7 +PYTHON3 = python3.3 + +all: check + +check: + for PY in "$(PYTHON3)" "$(PYTHON2)"; do \ + echo Testing Python: $$PY >&2; \ + $$PY Objects.py && \ + $$PY test_diffing_CellObject.py && \ + $$PY test_diffing_FileObject.py && \ + $$PY test_diffing_HiveObject.py && \ + $$PY test_diffing_ByteRuns.py && \ + $$PY test_diffing_TimestampObject.py && \ + $$PY test_diffing_VolumeObject.py && \ + $$PY test_ByteRun.py && \ + $$PY test_ByteRuns.py && \ + $$PY test_FileObject_byte_run_facets.py && \ + $$PY test_FileObject_from_stat.py && \ + $$PY test_RegXMLObject.py && \ + $$PY 
test_VolumeObject_hash.py && \ + $$PY verify_differential_dfxml_01.py && \ + $$PY verify_differential_dfxml_23.py ; \ + done + ./test_cat_partitions.sh + ./test_differential_dfxml.sh + +clean: + rm -f differential_dfxml_test_[02]* + rm -f verify_differential_dfxml_{01,23}.py-test[12].xml + rm -f test_difference_counts.py-d* + rm -f test_cat_partitions.sh.dfxml diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/Objects.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/Objects.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/Objects.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/Objects.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,2638 @@ + +""" +This file re-creates the major DFXML classes with an emphasis on type safety, serializability, and de-serializability. + +With this module, reading disk images or DFXML files is done with the parse or iterparse functions. Writing DFXML files can be done with the DFXMLObject.print_dfxml function. +""" + +__version__ = "0.2.2" + +#Remaining roadmap to 1.0.0: +# * Documentation. +# * User testing. +# * Compatibility with the DFXML schema, version >1.1.0. + +import logging +import re +import copy +import xml.etree.ElementTree as ET +import subprocess +import dfxml +import os +import sys + +_logger = logging.getLogger(os.path.basename(__file__)) + +#Contains: (namespace, local name) qualified XML element name pairs +_warned_elements = set([]) +_warned_byterun_attribs = set([]) + +#Contains: Unexpected 'facet' values on byte_runs elements. +_warned_byterun_facets = set([]) + +#Issue some log statements only once per program invocation. +_nagged_alloc = False +_warned_byterun_badtypecomp = False + +def _ET_tostring(e): + """Between Python 2 and 3, there are some differences in the ElementTree library's tostring() behavior. One, the method balks at the "unicode" encoding in 2. Two, in 2, the XML prototype's output with every invocation. This method serves as a wrapper to deal with those issues.""" + if sys.version_info[0] < 3: + tmp = ET.tostring(e, encoding="UTF-8") + if tmp[0:2] == "\n")+3 : ] + else: + return tmp + else: + return ET.tostring(e, encoding="unicode") + +def _boolcast(val): + """Takes Boolean values, and 0 or 1 in string or integer form, and casts them all to Boolean. Preserves nulls. Balks at everything else.""" + if val is None: + return None + if val in [True, False]: + return val + + _val = val + if val in ["0", "1"]: + _val = int(val) + if _val in [0, 1]: + return _val == 1 + + _logger.debug("val = " + repr(val)) + raise ValueError("Received a not-straightforwardly-Boolean value. Expected some form of 0, 1, True, or False.") + +def _bytecast(val): + """Casts a value as a byte string. If a character string, assumes a UTF-8 encoding.""" + if val is None: + return None + if isinstance(val, bytes): + return val + return _strcast(val).encode("utf-8") + +def _intcast(val): + """Casts input integer or string to integer. Preserves nulls. Balks at everything else.""" + if val is None: + return None + if isinstance(val, int): + return val + + if isinstance(val, str): + if val[0] == "-": + if val[1:].isdigit(): + return int(val) + else: + if val.isdigit(): + return int(val) + + _logger.debug("val = " + repr(val)) + raise ValueError("Received a non-int-castable value. 
Expected an integer or an integer as a string.") + +def _read_differential_annotations(annodict, element, annoset): + """ + Uses the shorthand-to-attribute mappings of annodict to translate attributes of element into annoset. + """ + #_logger.debug("annoset, before: %r." % annoset) + #Start with inverting the dictionary + _d = { annodict[k].replace("delta:",""):k for k in annodict } + #_logger.debug("Inverted dictionary: _d = %r" % _d) + for attr in element.attrib: + #_logger.debug("Looking for differential annotations: %r" % element.attrib) + (ns, an) = _qsplit(attr) + if an in _d and ns == dfxml.XMLNS_DELTA: + #_logger.debug("Found; adding %r." % _d[an]) + annoset.add(_d[an]) + #_logger.debug("annoset, after: %r." % annoset) + +def _qsplit(tagname): + """Requires string input. Returns namespace and local tag name as a pair. I could've sworn this was a basic implementation gimme, but ET.QName ain't it.""" + _typecheck(tagname, str) + if tagname[0] == "{": + i = tagname.rfind("}") + return ( tagname[1:i], tagname[i+1:] ) + else: + return (None, tagname) + +def _strcast(val): + if val is None: + return None + return str(val) + +def _typecheck(obj, classinfo): + if not isinstance(obj, classinfo): + _logger.info("obj = " + repr(obj)) + if isinstance(classinfo, tuple): + raise TypeError("Expecting object to be one of the types %r." % (classinfo,)) + else: + raise TypeError("Expecting object to be of type %r." % classinfo) + +class DFXMLObject(object): + def __init__(self, *args, **kwargs): + self.command_line = kwargs.get("command_line") + self.version = kwargs.get("version") + self.sources = kwargs.get("sources", []) + self.dc = kwargs.get("dc", dict()) + + self._namespaces = dict() + self._volumes = [] + self._files = [] + + input_volumes = kwargs.get("volumes") or [] + input_files = kwargs.get("files") or [] + for v in input_volumes: + self.append(v) + for f in input_files: + self.append(f) + + #Add default namespaces + self.add_namespace("", dfxml.XMLNS_DFXML) + self.add_namespace("dc", dfxml.XMLNS_DC) + + def __iter__(self): + """Yields all VolumeObjects, recursively their FileObjects, and the FileObjects directly attached to this DFXMLObject, in that order.""" + for v in self._volumes: + yield v + for f in v: + yield f + for f in self._files: + yield f + + def add_namespace(self, prefix, url): + self._namespaces[prefix] = url + ET.register_namespace(prefix, url) + + def append(self, value): + if isinstance(value, VolumeObject): + self._volumes.append(value) + elif isinstance(value, FileObject): + self._files.append(value) + else: + _logger.debug("value = %r" % value) + raise TypeError("Expecting a VolumeObject or a FileObject. Got instead this type: %r." % type(value)) + + def iter_namespaces(self): + """Yields (prefix, url) pairs of each namespace registered in this DFXMLObject.""" + for prefix in self._namespaces: + yield (prefix, self._namespaces[prefix]) + + def populate_from_Element(self, e): + if "version" in e.attrib: + self.version = e.attrib["version"] + + for elem in e.findall(".//*"): + (ns, ln) = _qsplit(elem.tag) + if ln == "command_line": + self.command_line = elem.text + elif ln == "image_filename": + self.sources.append(elem.text) + + def print_dfxml(self, output_fh=sys.stdout): + """Memory-efficient DFXML document printer. 
However, it assumes the whole element tree is already constructed.""" + pe = self.to_partial_Element() + dfxml_wrapper = _ET_tostring(pe) + dfxml_foot = "" + #Check for an empty element + if dfxml_wrapper.strip()[-3:] == " />": + dfxml_head = dfxml_wrapper.strip()[:-3] + ">" + elif dfxml_wrapper.strip()[-2:] == "/>": + dfxml_head = dfxml_wrapper.strip()[:-2] + ">" + else: + dfxml_head = dfxml_wrapper.strip()[:-len(dfxml_foot)] + + output_fh.write("""\n""") + output_fh.write(dfxml_head) + output_fh.write("\n") + _logger.debug("Writing %d volume objects." % len(self._volumes)) + for v in self._volumes: + v.print_dfxml(output_fh) + output_fh.write("\n") + _logger.debug("Writing %d file objects." % len(self._files)) + for f in self._files: + e = f.to_Element() + output_fh.write(_ET_tostring(e)) + output_fh.write("\n") + output_fh.write(dfxml_foot) + output_fh.write("\n") + + def to_Element(self): + outel = self.to_partial_Element() + for v in self._volumes: + tmpel = v.to_Element() + outel.append(tmpel) + for f in self._files: + tmpel = f.to_Element() + outel.append(tmpel) + return outel + + def to_dfxml(self): + """Serializes the entire DFXML document tree into a string. Then returns that string. RAM-intensive. Most will want to use print_dfxml() instead""" + return _ET_tostring(self.to_Element()) + + def to_partial_Element(self): + outel = ET.Element("dfxml") + + tmpel0 = ET.Element("metadata") + for key in sorted(self.dc): + _typecheck(key, str) + if ":" in key: + raise ValueError("Dublin Core key-value entries should have keys without the colon character. If this causes an interesting namespace issue for you, please report it as a bug.") + tmpel1 = ET.Element("dc:" + key) + tmpel1.text = self.dc[key] + tmpel0.append(tmpel1) + outel.append(tmpel0) + + if self.command_line: + tmpel0 = ET.Element("creator") + tmpel1 = ET.Element("execution_environment") + tmpel2 = ET.Element("command_line") + tmpel2.text = self.command_line + tmpel1.append(tmpel2) + tmpel0.append(tmpel1) + outel.append(tmpel0) + + if len(self.sources) > 0: + tmpel0 = ET.Element("source") + for source in self.sources: + tmpel1 = ET.Element("image_filename") + tmpel1.text = source + tmpel0.append(tmpel1) + outel.append(tmpel0) + + if self.version: + outel.attrib["version"] = self.version + + #Apparently, namespace setting is only available with the write() function, which is memory-impractical for significant uses of DFXML. + #Ref: http://docs.python.org/3.3/library/xml.etree.elementtree.html#xml.etree.ElementTree.ElementTree.write + for prefix in self._namespaces: + attrib_name = "xmlns" + if prefix != "": + attrib_name += ":" + prefix + outel.attrib[attrib_name] = self._namespaces[prefix] + + return outel + + @property + def command_line(self): + return self._command_line + + @command_line.setter + def command_line(self, value): + self._command_line = _strcast(value) + + @property + def dc(self): + """The Dublin Core dictionary of key-value pairs for this document. Typically, "type" is "Hash List", or "Disk Image". Keys should be strings not containing colons, values should be strings. If this causes an issue for you, please report it as a bug.""" + return self._dc + + @dc.setter + def dc(self, value): + _typecheck(value, dict) + self._dc = value + + @property + def files(self): + """List of file objects directly attached to this DFXMLObject. 
No setter for now.""" + return self._files + + @property + def namespaces(self): + raise AttributeError("The namespaces dictionary should not be directly accessed; instead, use .iter_namespaces().") + + @property + def sources(self): + return self._sources + + @sources.setter + def sources(self, value): + if not value is None: + _typecheck(value, list) + self._sources = value + + @property + def version(self): + return self._version + + @version.setter + def version(self, value): + self._version = _strcast(value) + + @property + def volumes(self): + """List of volume objects directly attached to this DFXMLObject. No setter for now.""" + return self._volumes + + +class RegXMLObject(object): + def __init__(self, *args, **kwargs): + self.metadata = kwargs.get("metadata") + self.creator = kwargs.get("creator") + self.source = kwargs.get("source") + self.version = kwargs.get("version") + self._hives = [] + self._cells = [] + self._namespaces = dict() + input_hives = kwargs.get("hives") or [] # In case kwargs["hives"] = None. + input_cells = kwargs.get("cells") or [] + for hive in input_hives: + self.append(hive) + for cell in input_cells: + self.append(cells) + + def __iter__(self): + """Yields all HiveObjects, recursively their CellObjects, and the CellObjects directly attached to this RegXMLObject, in that order.""" + for h in self._hives: + yield h + for c in h: + yield c + for c in self._cells: + yield c + + def append(self, value): + if isinstance(value, HiveObject): + self._hives.append(value) + elif isinstance(value, CellObject): + self._cells.append(value) + else: + _logger.debug("value = %r" % value) + raise TypeError("Expecting a HiveObject or a CellObject. Got instead this type: %r." % type(value)) + + def print_regxml(self, output_fh=sys.stdout): + """Serializes and prints the entire object, without constructing the whole tree.""" + regxml_wrapper = _ET_tostring(self.to_partial_Element()) + #_logger.debug("regxml_wrapper = %r." % regxml_wrapper) + regxml_foot = "" + #Check for an empty element + if regxml_wrapper.strip()[-3:] == " />": + regxml_head = regxml_wrapper.strip()[:-3] + ">" + elif regxml_wrapper.strip()[-2:] == "/>": + regxml_head = regxml_wrapper.strip()[:-2] + ">" + else: + regxml_head = regxml_wrapper.strip()[:-len(regxml_foot)] + + output_fh.write(regxml_head) + output_fh.write("\n") + for hive in self._hives: + hive.print_regxml(output_fh) + output_fh.write(regxml_foot) + output_fh.write("\n") + + def to_Element(self): + outel = self.to_partial_Element() + + for hive in self._hives: + tmpel = hive.to_Element() + outel.append(tmpel) + + for cell in self._cells: + tmpel = cell.to_Element() + outel.append(tmpel) + + return outel + + def to_partial_Element(self): + """ + Creates the wrapping RegXML element. No hives, no cells. Saves on creating an entire Element tree in memory. + """ + outel = ET.Element("regxml") + + if self.version: + outel.attrib["version"] = self.version + + return outel + + def to_regxml(self): + """Serializes the entire RegXML document tree into a string. Returns that string. RAM-intensive. 
Most will want to use print_regxml() instead.""" + return _ET_tostring(self.to_Element()) + + +class VolumeObject(object): + + _all_properties = set([ + "annos", + "allocated_only", + "block_count", + "block_size", + "byte_runs", + "first_block", + "ftype", + "ftype_str", + "last_block", + "partition_offset", + "original_volume", + "sector_size" + ]) + + _diff_attr_names = { + "new":"delta:new_volume", + "deleted":"delta:deleted_volume", + "modified":"delta:modified_volume", + "matched":"delta:matched" + } + + #TODO There may be need in the future to compare the annotations as well. It complicates make_differential_dfxml too much for now. + _incomparable_properties = set([ + "annos" + ]) + + def __init__(self, *args, **kwargs): + self._files = [] + self._annos = set() + self._diffs = set() + + for prop in VolumeObject._all_properties: + if prop in ["annos", "files"]: + continue + setattr(self, prop, kwargs.get(prop)) + + def __iter__(self): + """Yields all FileObjects directly attached to this VolumeObject.""" + for f in self._files: + yield f + + def __repr__(self): + parts = [] + for prop in VolumeObject._all_properties: + #Skip outputting the files list. + if prop == "files": + continue + val = getattr(self, prop) + if not val is None: + parts.append("%s=%r" % (prop, val)) + return "VolumeObject(" + ", ".join(parts) + ")" + + def append(self, value): + _typecheck(value, FileObject) + self._files.append(value) + + def compare_to_original(self): + self._diffs = self.compare_to_other(self.original_volume, True) + + def compare_to_other(self, other, ignore_original=False): + """Returns a set of all the properties found to differ.""" + _typecheck(other, VolumeObject) + diffs = set() + for prop in VolumeObject._all_properties: + if prop in VolumeObject._incomparable_properties: + continue + if ignore_original and prop == "original_volume": + continue + + #_logger.debug("getattr(self, %r) = %r" % (prop, getattr(self, prop))) + #_logger.debug("getattr(other, %r) = %r" % (prop, getattr(other, prop))) + + #Allow file system type to be case-insensitive + if prop == "ftype_str": + o = getattr(other, prop) + if o: o = o.lower() + s = getattr(self, prop) + if s: s = s.lower() + if s != o: + diffs.add(prop) + else: + if getattr(self, prop) != getattr(other, prop): + diffs.add(prop) + return diffs + + def populate_from_Element(self, e): + global _warned_elements + _typecheck(e, (ET.Element, ET.ElementTree)) + #_logger.debug("e = %r" % e) + + #Read differential annotations + _read_differential_annotations(VolumeObject._diff_attr_names, e, self.annos) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn in ["volume", "original_volume"] + + #Look through direct-child elements to populate run array + for ce in e.findall("./*"): + #_logger.debug("ce = %r" % ce) + (cns, ctn) = _qsplit(ce.tag) + #_logger.debug("cns = %r" % cns) + #_logger.debug("ctn = %r" % ctn) + if ctn == "byte_runs": + self.byte_runs = ByteRuns() + self.byte_runs.populate_from_Element(ce) + elif ctn == "original_volume": + self.original_volume = VolumeObject() + self.original_volume.populate_from_Element(ce) + elif ctn in VolumeObject._all_properties: + #_logger.debug("ce.text = %r" % ce.text) + setattr(self, ctn, ce.text) + #_logger.debug("getattr(self, %r) = %r" % (ctn, getattr(self, ctn))) + else: + if (cns, ctn) not in _warned_elements: + _warned_elements.add((cns, ctn)) + _logger.warning("Unsure what to do with this element in a VolumeObject: %r" % ce) + + def print_dfxml(self, output_fh=sys.stdout): + pe = 
self.to_partial_Element() + dfxml_wrapper = _ET_tostring(pe) + + if len(pe) == 0 and len(self._files) == 0: + output_fh.write(dfxml_wrapper) + return + + dfxml_foot = "" + + #Deal with an empty element being printed as + if len(pe) == 0: + replaced_dfxml_wrapper = dfxml_wrapper.replace(" />", ">") + dfxml_head = replaced_dfxml_wrapper + else: + dfxml_head = dfxml_wrapper.strip()[:-len(dfxml_foot)] + + output_fh.write(dfxml_head) + output_fh.write("\n") + _logger.debug("Writing %d file objects for this volume." % len(self._files)) + for f in self._files: + e = f.to_Element() + output_fh.write(_ET_tostring(e)) + output_fh.write("\n") + output_fh.write(dfxml_foot) + output_fh.write("\n") + + def to_Element(self): + outel = self.to_partial_Element() + for f in self._files: + tmpel = f.to_Element() + outel.append(tmpel) + return outel + + def to_partial_Element(self): + """Returns the volume element with its properties, except for the child fileobjects. Properties are appended in DFXML schema order.""" + outel = ET.Element("volume") + + annos_whittle_set = copy.deepcopy(self.annos) + diffs_whittle_set = copy.deepcopy(self.diffs) + + #Add differential annotations + for annodiff in VolumeObject._diff_attr_names: + if annodiff in annos_whittle_set: + outel.attrib[VolumeObject._diff_attr_names[annodiff]] = "1" + annos_whittle_set.remove(annodiff) + if len(annos_whittle_set) > 0: + _logger.warning("Failed to export some differential annotations: %r." % annos_whittle_set) + + if self.byte_runs: + outel.append(self.byte_runs.to_Element()) + + def _append_el(prop, value): + tmpel = ET.Element(prop) + _keep = False + if not value is None: + tmpel.text = str(value) + _keep = True + if prop in self.diffs: + tmpel.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(prop) + _keep = True + if _keep: + outel.append(tmpel) + + def _append_str(prop): + value = getattr(self, prop) + _append_el(prop, value) + + def _append_bool(prop): + value = getattr(self, prop) + if not value is None: + value = "1" if value else "0" + _append_el(prop, value) + + for prop in [ + "partition_offset", + "sector_size", + "block_size", + "ftype", + "ftype_str", + "block_count", + "first_block", + "last_block" + ]: + _append_str(prop) + + #Output the one Boolean property + _append_bool("allocated_only") + + #Output the original volume's properties + if not self.original_volume is None or "original_volume" in diffs_whittle_set: + #Skip FileObject list, if any + if self.original_volume is None: + tmpel = ET.Element("delta:original_volume") + else: + tmpel = self.original_volume.to_partial_Element() + tmpel.tag = "delta:original_volume" + + if "original_volume" in diffs_whittle_set: + tmpel.attrib["delta:changed_property"] = "1" + + outel.append(tmpel) + + if len(diffs_whittle_set) > 0: + _logger.warning("Did not annotate all of the differing properties of this volume. Remaining properties: %r." % diffs_whittle_set) + + return outel + + @property + def allocated_only(self): + return self._allocated_only + + @allocated_only.setter + def allocated_only(self, val): + self._allocated_only = _boolcast(val) + + @property + def annos(self): + """Set of differential annotations. 
Expected members are the keys of this class's _diff_attr_names dictionary.""" + return self._annos + + @annos.setter + def annos(self, val): + _typecheck(val, set) + self._annos = val + + @property + def block_count(self): + return self._block_count + + @block_count.setter + def block_count(self, val): + self._block_count = _intcast(val) + + @property + def block_size(self): + return self._block_size + + @block_size.setter + def block_size(self, val): + self._block_size = _intcast(val) + + @property + def diffs(self): + return self._diffs + + @property + def first_block(self): + return self._first_block + + @first_block.setter + def first_block(self, val): + self._first_block = _intcast(val) + + @property + def ftype(self): + return self._ftype + + @ftype.setter + def ftype(self, val): + self._ftype = _intcast(val) + + @property + def ftype_str(self): + return self._ftype_str + + @ftype_str.setter + def ftype_str(self, val): + self._ftype_str = _strcast(val) + + @property + def last_block(self): + return self._last_block + + @last_block.setter + def last_block(self, val): + self._last_block = _intcast(val) + + @property + def original_volume(self): + return self._original_volume + + @original_volume.setter + def original_volume(self, val): + if not val is None: + _typecheck(val, VolumeObject) + self._original_volume= val + + @property + def partition_offset(self): + return self._partition_offset + + @partition_offset.setter + def partition_offset(self, val): + self._partition_offset = _intcast(val) + + @property + def sector_size(self): + return self._sector_size + + @sector_size.setter + def sector_size(self, val): + self._sector_size = _intcast(val) + +class HiveObject(object): + def __init__(self, *args, **kwargs): + self._cells = [] + + def __iter__(self): + """Yields all CellObjects directly attached to this VolumeObject.""" + for c in self._cells: + yield c + + def append(self, value): + _typecheck(value, CellObject) + self._cells.append(value) + + def print_regxml(self, output_fh=sys.stdout): + for cell in self._cells: + output_fh.write(cell.to_regxml()) + output_fh.write("\n") + + def to_Element(self): + outel = ET.Element("hive") + for cell in self._cells: + tmpel = cell.to_Element() + outel.append(tmpel) + return outel + +class ByteRun(object): + + _all_properties = set([ + "img_offset", + "fs_offset", + "file_offset", + "fill", + "len" + ]) + + def __init__(self, *args, **kwargs): + for prop in ByteRun._all_properties: + setattr(self, prop, kwargs.get(prop)) + + def __add__(self, other): + """ + Joins two ByteRun objects into a single run if possible. Returns a new object of the concatenation if successful, None if not. 
+ """ + _typecheck(other, ByteRun) + #Don't glom fills of different values + if self.fill != other.fill: + return None + + if None in [self.len, other.len]: + return None + + for prop in ["img_offset", "fs_offset", "file_offset"]: + if None in [getattr(self, prop), getattr(other, prop)]: + continue + if getattr(self, prop) + self.len == getattr(other, prop): + retval = copy.deepcopy(self) + retval.len += other.len + return retval + return None + + def __eq__(self, other): + #Check type + if other is None: + return False + if not isinstance(other, ByteRun): + if not _warned_byterun_badtypecomp: + _logger.warning("A ByteRun comparison was called against a non-ByteRun object: " + repr(other) + ".") + _warned_byterun_badtypecomp = True + return False + + #Check values + return \ + self.img_offset == other.img_offset and \ + self.fs_offset == other.fs_offset and \ + self.file_offset == other.file_offset and \ + self.fill == other.fill and \ + self.len == other.len + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + for prop in ByteRun._all_properties: + val = getattr(self, prop) + if not val is None: + parts.append("%s=%r" % (prop, val)) + return "ByteRun(" + ", ".join(parts) + ")" + + def populate_from_Element(self, e): + _typecheck(e, (ET.Element, ET.ElementTree)) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn == "byte_run" + + copied_attrib = copy.deepcopy(e.attrib) + + #Populate run properties from element attributes + for prop in ByteRun._all_properties: + if prop in copied_attrib: + val = copied_attrib.get(prop) + if not val is None: + setattr(self, prop, val) + del copied_attrib[prop] + #Note remaining properties + for prop in copied_attrib: + if prop not in _warned_byterun_attribs: + _warned_byterun_attribs.add(prop) + _logger.warning("No instructions present for processing this attribute found on a byte run: %r." % prop) + + def to_Element(self): + outel = ET.Element("byte_run") + for prop in ByteRun._all_properties: + val = getattr(self, prop) + if not val is None: + outel.attrib[prop] = str(val) + return outel + + @property + def file_offset(self): + return self._file_offset + + @file_offset.setter + def file_offset(self, val): + self._file_offset = _intcast(val) + + @property + def fill(self): + """There is an implicit assumption that the fill character is encoded as UTF-8.""" + return self._fill + + @fill.setter + def fill(self, val): + self._fill = _bytecast(val) + + @property + def fs_offset(self): + return self._fs_offset + + @fs_offset.setter + def fs_offset(self, val): + self._fs_offset = _intcast(val) + + @property + def img_offset(self): + return self._img_offset + + @img_offset.setter + def img_offset(self, val): + self._img_offset = _intcast(val) + + @property + def len(self): + return self._len + + @len.setter + def len(self, val): + self._len = _intcast(val) + +class ByteRuns(object): + """ + A list-like object for ByteRun objects. 
+ """ + #Must define these methods to adhere to the list protocol: + #__len__ + #__getitem__ + #__setitem__ + #__delitem__ + #__iter__ + #append + # + #Refs: + #http://www.rafekettler.com/magicmethods.html + #http://stackoverflow.com/a/8841520 + + _facet_values = [None, "data", "inode", "name"] + + def __init__(self, run_list=None, **kwargs): + self._facet = kwargs.get("facet") + self._listdata = [] + if isinstance(run_list, list): + for run in run_list: + self.append(run) + + def __delitem__(self, key): + del self._listdata[key] + + def __eq__(self, other): + """Compares the byte run lists and the facet (allowing a null facet to match "data").""" + #Check type + if other is None: + return False + _typecheck(other, ByteRuns) + + if self.facet != other.facet: + if set([self.facet, other.facet]) != set([None, "data"]): + return False + if len(self) != len(other): + #_logger.debug("len(self) = %d" % len(self)) + #_logger.debug("len(other) = %d" % len(other)) + return False + for (sbr_index, sbr) in enumerate(self): + obr = other[sbr_index] + #_logger.debug("sbr_index = %d" % sbr_index) + #_logger.debug("sbr = %r" % sbr) + #_logger.debug("obr = %r" % obr) + if sbr != obr: + return False + return True + + def __getitem__(self, key): + return self._listdata.__getitem__(key) + + def __iter__(self): + return iter(self._listdata) + + def __len__(self): + return self._listdata.__len__() + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + for run in self: + parts.append(repr(run)) + maybe_facet = "" + if self.facet: + maybe_facet = "facet=%r, " % self.facet + return "ByteRuns(" + maybe_facet + "run_list=[" + ", ".join(parts) + "])" + + def __setitem__(self, key, value): + _typecheck(value, ByteRun) + self._listdata[key] = value + + def append(self, value): + """ + Appends a ByteRun object to this container's list. + """ + _typecheck(value, ByteRun) + self._listdata.append(value) + + def glom(self, value): + """ + Appends a ByteRun object to this container's list, after attempting to join the run with the last run already stored. + """ + _typecheck(value, ByteRun) + if len(self._listdata) == 0: + self.append(value) + else: + last_run = self._listdata[-1] + maybe_new_run = last_run + value + if maybe_new_run is None: + self.append(value) + else: + self._listdata[-1] = maybe_new_run + + def iter_contents(self, raw_image, buffer_size=1048576, sector_size=512, errlog=None, statlog=None): + """ + Generator. Yields contents, as byte strings one block at a time, given a backing raw image path. Relies on The SleuthKit's img_cat, so contents can be extracted from any disk image type that TSK supports. + @param buffer_size The maximum size of the byte strings yielded. + @param sector_size The size of a disk sector in the raw image. Required by img_cat. + """ + if not isinstance(raw_image, str): + raise TypeError("iter_contents needs the string path to the image file. Received: %r." % raw_image) + + stderr_fh = None + if not errlog is None: + stderr_fh = open(errlog, "wb") + + status_fh = None + if not statlog is None: + status_fh = open(errlog, "wb") + + #The exit status of the last img_cat. 
+ last_status = None + + try: + for run in self: + if run.len is None: + raise AttributeError("Byte runs can't be extracted if a run length is undefined.") + + len_to_read = run.len + + #If we have a fill character, just pump out that character + if not run.fill is None and len(run.fill) > 0: + while len_to_read > 0: + #This multiplication and slice should handle multi-byte fill characters, in case that ever comes up. + yield (run.fill * buffer_size)[ : min(len_to_read, buffer_size)] + len_to_read -= buffer_size + #Next byte run + continue + + if run.img_offset is None: + raise AttributeError("Byte runs can't be extracted if missing a fill character and image offset.") + + cmd = ["img_cat"] + cmd.append("-b") + cmd.append(str(sector_size)) + cmd.append("-s") + cmd.append(str(run.img_offset//sector_size)) + cmd.append("-e") + cmd.append(str( (run.img_offset + run.len)//sector_size)) + cmd.append(raw_image) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=stderr_fh) + + #Do the buffered read + while len_to_read > 0: + buffer_data = p.stdout.read(buffer_size) + yield_data = buffer_data[ : min(len_to_read, buffer_size)] + if len(yield_data) > 0: + yield yield_data + else: + #Let the subprocess terminate so we can see the exit status + p.wait() + last_status = p.returncode + if last_status != 0: + raise subprocess.CalledProcessError(last_status, " ".join(cmd), "img_cat failed.") + len_to_read -= buffer_size + except Exception as e: + #Cleanup in an exception + if not stderr_fh is None: + stderr_fh.close() + + if not status_fh is None: + if isinstance(e, subprocess.CalledProcessError): + status_fh.write(e.returncode) + else: + status_fh.write("1") + status_fh.close() + raise e + + #Cleanup when all's gone well. + if not status_fh is None: + if not last_status is None: + status_fh.write(last_status) + status_fh.close() + if not stderr_fh is None: + stderr_fh.close() + + def populate_from_Element(self, e): + _typecheck(e, (ET.Element, ET.ElementTree)) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn == "byte_runs" + + if "facet" in e.attrib: + self.facet = e.attrib["facet"] + + #Look through direct-child elements to populate run array + for ce in e.findall("./*"): + (cns, ctn) = _qsplit(ce.tag) + if ctn == "byte_run": + nbr = ByteRun() + nbr.populate_from_Element(ce) + self.append(nbr) + + def to_Element(self): + outel = ET.Element("byte_runs") + for run in self: + tmpel = run.to_Element() + outel.append(tmpel) + if self.facet: + outel.attrib["facet"] = self.facet + return outel + + @property + def facet(self): + """Expected to be null, "data", "inode", or "name". See FileObject.data_brs, FileObject.inode_brs, and FileObject.name_brs.""" + return self._facet + + @facet.setter + def facet(self, val): + if not val is None: + _typecheck(val, str) + if val not in ByteRuns._facet_values: + raise ValueError("A ByteRuns facet must be one of these: %r. Received: %r." % (ByteRuns._facet_values, val)) + self._facet = val + +re_precision = re.compile(r"(?P\d+)(?P(|m|n)s|d)?") +class TimestampObject(object): + """ + Encodes the "dftime" type. Wraps around dfxml.dftime, closely enough that this might just get folded into that class. + + TimestampObjects implement a vs-null comparison workaround as in the SAS family of products: Null, for ordering purposes, is considered to be a value less than negative infinity. 
+ """ + + timestamp_name_list = ["mtime", "atime", "ctime", "crtime", "dtime", "bkup_time"] + + def __init__(self, *args, **kwargs): + self.name = kwargs.get("name") + self.prec = kwargs.get("prec") + #_logger.debug("type(args) = %r" % type(args)) + #_logger.debug("args = %r" % (args,)) + if len(args) == 0: + self.time = None + elif len(args) == 1: + self.time = args[0] + else: + raise ValueError("Unexpected arguments. Whole args tuple: %r." % (args,)) + + self._timestamp = None + + def __eq__(self, other): + #Check type + if other is None: + return False + _typecheck(other, TimestampObject) + + if self.name != other.name: + return False + if self.prec != other.prec: + return False + if self.time != other.time: + return False + return True + + def __ge__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return False + else: + self._comparison_sanity_check(other) + return self.time.__ge__(other.time) + + def __gt__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return False + else: + self._comparison_sanity_check(other) + return self.time.__gt__(other.time) + + def __le__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return True + else: + self._comparison_sanity_check(other) + return self.time.__le__(other.time) + + def __lt__(self, other): + """Note: The semantics here and in other ordering functions are that "Null" is a value less than negative infinity.""" + if other is None: + return True + else: + self._comparison_sanity_check(other) + return self.time.__lt__(other.time) + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + if self.name: + parts.append("name=%r" % self.name) + if self.prec: + parts.append("prec=%r" % (self.prec,)) + if self.time: + parts.append("%r" % self.time) + return "TimestampObject(" + ", ".join(parts) + ")" + + def __str__(self): + if self.time: + return str(self.time) + else: + return self.__repr__() + + def _comparison_sanity_check(self, other): + if None in (self.time, other.time): + raise ValueError("Can't compare TimestampObjects: %r, %r." % self, other) + + def populate_from_Element(self, e): + _typecheck(e, (ET.Element, ET.ElementTree)) + if "prec" in e.attrib: + self.prec = e.attrib["prec"] + self.time = e.text + (ns, tn) = _qsplit(e.tag) + self.name = tn + + def to_Element(self): + _typecheck(self.name, str) + outel = ET.Element(self.name) + if self.prec: + outel.attrib["prec"] = "%d%s" % self.prec + if self.time: + outel.text = str(self.time) + return outel + + @property + def name(self): + """The type of timestamp - modified (mtime), accessed (atime), etc.""" + return self._name + + @name.setter + def name(self, value): + if not value is None: + if not value in TimestampObject.timestamp_name_list: + raise ValueError("The timestamp name must be in this list: %r. Received: %r." % (TimestampObject.timestamp_name_list, value)) + self._name = value + + @property + def prec(self): + """ + A pair, (resolution, unit); unit is a second (s), millisecond, nanosecond, or day (d). The default unit is "s". Can be passed as a string or a duple. 
+ """ + return self._prec + + @prec.setter + def prec(self, value): + if value is None: + self._prec = None + return self._prec + elif isinstance(value, tuple) and \ + len(value) == 2 and \ + isinstance(value[0], int) and \ + isinstance(value[1], str): + self._prec = value + return self._prec + + m = re_precision.match(value) + md = m.groupdict() + tup = (int(md["num"]), md.get("unit") or "s") + #_logger.debug("tup = %r" % (tup,)) + self._prec = tup + + @property + def time(self): + """ + The actual timestamp. A DFXML.dftime object. This class might be superfluous and end up collapsing into that... + """ + return self._time + + @time.setter + def time(self, value): + if value is None: + self._time = None + else: + checked_value = dfxml.dftime(value) + #_logger.debug("checked_value.timestamp() = %r" % checked_value.timestamp()) + self._time = checked_value + #Propagate timestamp value to other formats + self._timestamp = self._time.timestamp() + + @property + def timestamp(self): + """A Unix floating-point timestamp, as time.mktime returns. Currently, there is no setter for this property.""" + return self._timestamp + + +class FileObject(object): + """ + This class provides property accesses, an XML serializer (ElementTree-based), and a deserializer. + The properties interface is NOT function calls, but simple accesses. That is, the old _fileobject_ style: + + assert isinstance(fi, dfxml.fileobject) + fi.mtime() + + is now replaced with: + + assert isinstance(fi, Objects.FileObject) + fi.mtime + """ + + _all_properties = set([ + "alloc", + "alloc_inode", + "alloc_name", + "annos", + "atime", + "bkup_time", + "byte_runs", + "compressed", + "crtime", + "ctime", + "data_brs", + "dtime", + "error", + "filename", + "filesize", + "gid", + "id", + "inode", + "inode_brs", + "link_target", + "libmagic", + "md5", + "meta_type", + "mode", + "mtime", + "name_brs", + "name_type", + "nlink", + "original_fileobject", + "orphan", + "parent_object", + "partition", + "seq", + "sha1", + "uid", + "unalloc", + "unused", + "used" + ]) + + _br_facet_to_property = { + "data":"data_brs", + "inode":"inode_brs", + "name":"name_brs" + } + + #TODO There may be need in the future to compare the annotations as well. It complicates make_differential_dfxml too much for now. + _incomparable_properties = set([ + "annos", + "byte_runs", + "id", + "unalloc", + "unused" + ]) + + _diff_attr_names = { + "new":"delta:new_file", + "deleted":"delta:deleted_file", + "renamed":"delta:renamed_file", + "changed":"delta:changed_file", + "modified":"delta:modified_file", + "matched":"delta:matched" + } + + def __init__(self, *args, **kwargs): + #Prime all the properties + for prop in FileObject._all_properties: + if prop == "annos": + continue + setattr(self, prop, kwargs.get(prop)) + self._annos = set() + self._diffs = set() + + def __eq__(self, other): + if other is None: + return False + _typecheck(other, FileObject) + for prop in FileObject._all_properties: + if prop in FileObject._incomparable_properties: + continue + if getattr(self, prop) != getattr(other, prop): + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + + for prop in sorted(FileObject._all_properties): + #Save data byte runs for the end, as theirs lists can get really long. 
+ if prop not in ["byte_runs", "data_brs"]: + value = getattr(self, prop) + if not value is None: + parts.append("%s=%r" % (prop, value)) + + if self.data_brs: + parts.append("data_brs=%r" % self.byte_runs) + + return "FileObject(" + ", ".join(parts) + ")" + + def compare_to_original(self): + self._diffs = self.compare_to_other(self.original_fileobject, True) + + def compare_to_other(self, other, ignore_original=False): + _typecheck(other, FileObject) + + diffs = set() + + for propname in FileObject._all_properties: + if propname in FileObject._incomparable_properties: + continue + if ignore_original and propname == "original_fileobject": + continue + oval = getattr(other, propname) + sval = getattr(self, propname) + if oval is None and sval is None: + continue + if oval != sval: + #_logger.debug("propname, oval, sval: %r, %r, %r" % (propname, oval, sval)) + diffs.add(propname) + + return diffs + + def extract_facet(self, facet, image_path=None, buffer_size=1048576, partition_offset=None, sector_size=512, errlog=None, statlog=None, icat_threshold = 268435456): + """ + Generator. Extracts the facet with a SleuthKit tool, yielding chunks of the data. + + @param buffer_size The facet data is yielded in chunks of at most this parameter's size. Default 1MiB. + @param partition_offset The offset of the file's containing partition, in bytes. Needed for icat. If not given, the FileObject's VolumeObject will be used. If that's also absent, icat can't be used, and img_cat will instead be tried as a fallback (which means byte runs must be in the DFXML). + @param icat_threshold icat incurs extensive, non-sequential IO overhead to walk the filesystem to reach the facet's byte runs. img_cat can be called on each byte run reported in the DFXML file, but on fragmented files this incurs overhead in process spawning. Facets larger than this threshold are extracted with icat. Default 256MiB. Force icat by setting this to -1; force img_cat with infinity (float("inf")). + """ + + _image_path = image_path + if _image_path is None: + raise ValueError("The backing image path must be supplied.") + + _partition_offset = partition_offset + if _partition_offset is None: + if self.volume_object: + _partition_offset = self.volume_object.partition_offset + + #Try using icat; needs inode number and volume offset. We're additionally requiring the filesize be known. + #TODO The icat needs a little more experimentation. + if False and facet == "content" and \ + not self.filesize is None and \ + self.filesize >= icat_threshold and \ + not self.inode is None and \ + not _partition_offset is None: + _logger.debug("Extracting with icat: %r." 
% self) + + #Set up logging if desired + stderr_fh = sys.stderr + if not errlog is None: + stderr_fh = open(errlog, "wb") + + status_fh = None + if not statlog is None: + status_fh = open(errlog, "w") + + #Set up icat process + cmd = ["icat"] + cmd.append("-b") + cmd.append(str(sector_size)) + cmd.append("-o") + cmd.append(str(self.volume_object.partition_offset//sector_size)) + if not self.volume_object.ftype_str is None: + cmd.append("-f") + cmd.append(self.volume_object.ftype_str) + cmd.append(image_path) + cmd.append(str(self.inode)) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=stderr_fh) + + #Do a buffered read + len_to_read = self.filesize + while len_to_read > 0: + buffer_data = p.stdout.read(buffer_size) + yield_data = buffer_data[ : min(len_to_read, buffer_size)] + if len(yield_data) > 0: + yield yield_data + else: + #Let the subprocess terminate so we can see the exit status + p.wait() + last_status = p.returncode + + #Log the status if requested + if not status_fh is None: + status_fh.write(last_status) + + #Act on a bad status + if last_status != 0: + raise subprocess.CalledProcessError(last_status, " ".join(cmd), "icat failed.") + len_to_read -= buffer_size + + #Clean up file handles + if status_fh: status_fh.close() + if stderr_fh: stderr_fh.close() + + elif not self.byte_runs is None: + for chunk in self.byte_runs.iter_contents(_image_path, buffer_size, sector_size, errlog, statlog): + yield chunk + + def populate_from_Element(self, e): + """Populates this FileObject's properties from an ElementTree Element. The Element need not be retained.""" + global _warned_elements + _typecheck(e, (ET.Element, ET.ElementTree)) + + #_logger.debug("FileObject.populate_from_Element(%r)" % e) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn in ["fileobject", "original_fileobject", "parent_object"] + + #Map "delta:" attributes of s into the self.annos set + #_logger.debug("self.annos, before: %r." % self.annos) + _read_differential_annotations(FileObject._diff_attr_names, e, self.annos) + #_logger.debug("self.annos, after: %r." % self.annos) + + #Look through direct-child elements for other properties + for ce in e.findall("./*"): + (cns, ctn) = _qsplit(ce.tag) + #_logger.debug("Populating from child element: %r." % ce.tag) + + #Inherit any marked changes + for attr in ce.attrib: + #_logger.debug("Inspecting attr for diff. annos: %r." % attr) + (ns, an) = _qsplit(attr) + if an == "changed_property" and ns == dfxml.XMLNS_DELTA: + #_logger.debug("Identified changed property: %r." % ctn) + #TODO There may be a more elegant way of handling the hashes and any other attribute-dependent element-to-property mapping. Probably involving XPath. + if ctn == "hashdigest": + if "type" not in ce.attrib: + raise AttributeError("Attribute 'type' not found. Every hashdigest element should have a 'type' attribute to identify the hash type.") + self.diffs.add(ce.attrib["type"].lower()) + elif ctn == "byte_runs": + facet = ce.attrib.get("facet") + prop = FileObject._br_facet_to_property.get(facet, "data_brs") + self.diffs.add(prop) + else: + self.diffs.add(ctn) + + if ctn == "byte_runs": + #byte_runs might be for file contents, the inode/MFT entry, or the directory entry naming the file. Use the facet attribute to determine which. If facet is absent, assume they're data byte runs. 
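# A small sketch of the facet-to-property mapping described in the comment above --
# not part of the patch; it mirrors test_FileObject_byte_run_facets.py, which this
# diff adds later on.
import Objects

fo = Objects.FileObject()
name_runs = Objects.ByteRuns(facet="name")
name_runs.append(Objects.ByteRun(img_offset=4096, len=32))
fo.name_brs = name_runs                  # the "name" facet lands on .name_brs

assert fo.name_brs[0].img_offset == 4096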
+ if "facet" in ce.attrib: + if ce.attrib["facet"] not in FileObject._br_facet_to_property: + if not ce.attrib["facet"] in _warned_byterun_facets: + _warned_byterun_facets.add(ce.attrib["facet"]) + _logger.warning("byte_runs facet %r was unexpected. Will not interpret this element.") + else: + brs = ByteRuns() + brs.populate_from_Element(ce) + brs.facet = ce.attrib["facet"] + setattr(self, FileObject._br_facet_to_property[brs.facet], brs) + else: + self.byte_runs = ByteRuns() + self.byte_runs.populate_from_Element(ce) + elif ctn == "hashdigest": + if ce.attrib["type"].lower() == "md5": + self.md5 = ce.text + elif ce.attrib["type"].lower() == "sha1": + self.sha1 = ce.text + elif ctn == "original_fileobject": + self.original_fileobject = FileObject() + self.original_fileobject.populate_from_Element(ce) + elif ctn == "parent_object": + self.parent_object = FileObject() + self.parent_object.populate_from_Element(ce) + elif ctn in ["atime", "bkup_time", "crtime", "ctime", "dtime", "mtime"]: + setattr(self, ctn, TimestampObject()) + getattr(self, ctn).populate_from_Element(ce) + elif ctn in FileObject._all_properties: + setattr(self, ctn, ce.text) + else: + if (cns, ctn) not in _warned_elements: + _warned_elements.add((cns, ctn)) + _logger.warning("Uncertain what to do with this element: %r" % ce) + + def populate_from_stat(self, s): + """Populates FileObject fields from a stat() call.""" + import os + _typecheck(s, os.stat_result) + + self.mode = s.st_mode + self.inode = s.st_ino + self.nlink = s.st_nlink + self.uid = s.st_uid + self.gid = s.st_gid + self.filesize = s.st_size + #s.st_dev is ignored for now. + + if "st_mtime" in dir(s): + self.mtime = s.st_mtime + + if "st_atime" in dir(s): + self.atime = s.st_atime + + if "st_ctime" in dir(s): + self.ctime = s.st_ctime + + if "st_birthtime" in dir(s): + self.crtime = s.st_birthtime + + def to_Element(self): + """Creates an ElementTree Element with elements in DFXML schema order.""" + outel = ET.Element("fileobject") + + annos_whittle_set = copy.deepcopy(self.annos) + diffs_whittle_set = copy.deepcopy(self.diffs) + + for annodiff in FileObject._diff_attr_names: + if annodiff in annos_whittle_set: + outel.attrib[FileObject._diff_attr_names[annodiff]] = "1" + annos_whittle_set.remove(annodiff) + if len(annos_whittle_set) > 0: + _logger.warning("Failed to export some differential annotations: %r." % annos_whittle_set) + + def _anno_change(el): + if el.tag in self.diffs: + el.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(el.tag) + + def _anno_hash(el): + if el.attrib["type"] in self.diffs: + el.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(el.attrib["type"]) + + def _anno_byte_runs(el): + if "facet" in el.attrib: + prop = FileObject._br_facet_to_property[el.attrib["facet"]] + else: + prop = "data_brs" + if prop in self.diffs: + el.attrib["delta:changed_property"] = "1" + #_logger.debug("diffs_whittle_set = %r." 
% diffs_whittle_set) + diffs_whittle_set.remove(prop) + + #Recall that Element text must be a string + def _append_str(name, value): + """Note that empty elements should be created if the element was removed.""" + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element(name) + if not value is None: + tmpel.text = str(value) + _anno_change(tmpel) + outel.append(tmpel) + + def _append_time(name, value): + """Note that empty elements should be created if the element was removed.""" + if not value is None or name in diffs_whittle_set: + if not value is None and value.time: + tmpel = value.to_Element() + else: + tmpel = ET.Element(name) + _anno_change(tmpel) + outel.append(tmpel) + + def _append_bool(name, value): + """Note that empty elements should be created if the element was removed.""" + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element(name) + if not value is None: + tmpel.text = str(1 if value else 0) + _anno_change(tmpel) + outel.append(tmpel) + + _using_facets = False + def _append_byte_runs(name, value): + """The complicated part here is setting the "data" facet on the byte runs, because we assume that no facet definitions means that for this file, there's only the one byte_runs list for data.""" + #_logger.debug("_append_byte_runs(%r, %r)" % (name, value)) + if value or name in diffs_whittle_set: + if value: + tmpel = value.to_Element() + if "facet" in tmpel.attrib: + _using_facets = True + else: + tmpel = ET.Element("byte_runs") + propname_to_facet = { + "data_brs": "data", + "inode_brs": "inode", + "name_brs": "name" + } + if name in propname_to_facet: + _using_facets = True + tmpel.attrib["facet"] = propname_to_facet[name] + elif _using_facets: + tmpel.attrib["facet"] = propname_to_facet["data_brs"] + _anno_byte_runs(tmpel) + outel.append(tmpel) + + def _append_object(name, value, namespace_prefix=None): + """name must be the name of a property that has a to_Element() method. namespace_prefix will be prepended as-is to the element tag.""" + obj = value + if obj or name in diffs_whittle_set: + if obj: + tmpel = obj.to_Element() + else: + tmpel = ET.Element(name) + #Set the tag name here for properties like parent_object, a FileObject without being wholly a FileObject. + if namespace_prefix: + tmpel.tag = namespace_prefix + name + else: + tmpel.tag = name + _anno_change(tmpel) + outel.append(tmpel) + + def _append_hash(name, value): + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element("hashdigest") + tmpel.attrib["type"] = name + if not value is None: + tmpel.text = value + _anno_hash(tmpel) + outel.append(tmpel) + + #The parent object is a one-off. Duplicating the whole parent is wasteful, so create a shadow object that just outputs the important bits. + if not self.parent_object is None: + parent_object_shadow = FileObject() + parent_object_shadow.inode = self.parent_object.inode + _append_object("parent_object", parent_object_shadow) + + _append_str("filename", self.filename) + _append_str("error", self.error) + _append_str("partition", self.partition) + _append_str("id", self.id) + _append_str("name_type", self.name_type) + _append_str("filesize", self.filesize) + #TODO Define a better flag for if we're going to output elements. 
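# A short serialization sketch for the to_Element()/to_dfxml() path being assembled
# here -- not part of the patch; the filename and timestamp are arbitrary examples.
import Objects

fo = Objects.FileObject(filename="readme.txt", filesize=10)
fo.mtime = "2015-08-26T03:34:50Z"        # coerced into a TimestampObject
print(fo.to_dfxml())                     # emits a <fileobject> element in schema order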
+ if self.alloc_name is None and self.alloc_inode is None: + _append_bool("alloc", self.alloc) + else: + _append_bool("alloc_inode", self.alloc_inode) + _append_bool("alloc_name", self.alloc_name) + _append_bool("used", self.used) + _append_bool("orphan", self.orphan) + _append_bool("compressed", self.compressed) + _append_str("inode", self.inode) + _append_str("meta_type", self.meta_type) + _append_str("mode", self.mode) + _append_str("nlink", self.nlink) + _append_str("uid", self.uid) + _append_str("gid", self.gid) + _append_time("mtime", self.mtime) + _append_time("ctime", self.ctime) + _append_time("atime", self.atime) + _append_time("crtime", self.crtime) + _append_str("seq", self.seq) + _append_time("dtime", self.dtime) + _append_time("bkup_time", self.bkup_time) + _append_str("link_target", self.link_target) + _append_str("libmagic", self.libmagic) + _append_byte_runs("inode_brs", self.inode_brs) + _append_byte_runs("name_brs", self.name_brs) + _append_byte_runs("data_brs", self.data_brs) + _append_hash("md5", self.md5) + _append_hash("sha1", self.sha1) + _append_object("original_fileobject", self.original_fileobject, "delta:") + + if len(diffs_whittle_set) > 0: + _logger.warning("Did not annotate all of the differing properties of this file. Remaining properties: %r." % diffs_whittle_set) + + return outel + + def to_dfxml(self): + return _ET_tostring(self.to_Element()) + + @property + def alloc(self): + """Note that setting .alloc will affect the value of .unalloc, and vice versa. The last one to set wins.""" + global _nagged_alloc + if not _nagged_alloc: + _logger.warning("The FileObject.alloc property is deprecated. Use .alloc_inode and/or .alloc_name instead. .alloc is proxied as True if alloc_inode and alloc_name are both True.") + _nagged_alloc = True + if self.alloc_inode and self.alloc_name: + return True + else: + return self._alloc + + @alloc.setter + def alloc(self, val): + self._alloc = _boolcast(val) + if not self._alloc is None: + self._unalloc = not self._alloc + + @property + def alloc_inode(self): + return self._alloc_inode + + @alloc_inode.setter + def alloc_inode(self, val): + self._alloc_inode = _boolcast(val) + + @property + def alloc_name(self): + return self._alloc_name + + @alloc_name.setter + def alloc_name(self, val): + self._alloc_name = _boolcast(val) + + @property + def annos(self): + """Set of differential annotations. 
Expected members are the keys of this class's _diff_attr_names dictionary.""" + return self._annos + + @annos.setter + def annos(self, val): + _typecheck(val, set) + self._annos = val + + @property + def atime(self): + return self._atime + + @atime.setter + def atime(self, val): + if val is None: + self._atime = None + elif isinstance(val, TimestampObject): + self._atime = val + else: + checked_val = TimestampObject(val, name="atime") + self._atime = checked_val + + @property + def bkup_time(self): + return self._bkup_time + + @bkup_time.setter + def bkup_time(self, val): + if val is None: + self._bkup_time = None + elif isinstance(val, TimestampObject): + self._bkup_time = val + else: + checked_val = TimestampObject(val, name="bkup_time") + self._bkup_time = checked_val + + @property + def byte_runs(self): + """This property is now a synonym for the data byte runs (.data_brs).""" + return self.data_brs + + @byte_runs.setter + def byte_runs(self, val): + self.data_brs = val + + @property + def compressed(self): + return self._compressed + + @compressed.setter + def compressed(self, val): + self._compressed = _boolcast(val) + + @property + def ctime(self): + return self._ctime + + @ctime.setter + def ctime(self, val): + if val is None: + self._ctime = None + elif isinstance(val, TimestampObject): + self._ctime = val + else: + checked_val = TimestampObject(val, name="ctime") + self._ctime = checked_val + + @property + def crtime(self): + return self._crtime + + @crtime.setter + def crtime(self, val): + if val is None: + self._crtime = None + elif isinstance(val, TimestampObject): + self._crtime = val + else: + checked_val = TimestampObject(val, name="crtime") + self._crtime = checked_val + + @property + def data_brs(self): + """The byte runs that store the file's content.""" + return self._data_brs + + @data_brs.setter + def data_brs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._data_brs = val + + @property + def diffs(self): + """This property intentionally has no setter. To populate, call compare_to_original() after assigning an original_fileobject.""" + return self._diffs + + @property + def dtime(self): + return self._dtime + + @dtime.setter + def dtime(self, val): + if val is None: + self._dtime = None + elif isinstance(val, TimestampObject): + self._dtime = val + else: + checked_val = TimestampObject(val, name="dtime") + self._dtime = checked_val + + @property + def error(self): + return self._error + + @error.setter + def error(self, val): + self._error = _strcast(val) + + @property + def filesize(self): + return self._filesize + + @filesize.setter + def filesize(self, val): + self._filesize = _intcast(val) + + @property + def gid(self): + return self._gid + + @gid.setter + def gid(self, val): + self._gid = _strcast(val) + + @property + def id(self): + return self._id + + @id.setter + def id(self, val): + self._id = _intcast(val) + + @property + def inode(self): + return self._inode + + @inode.setter + def inode(self, val): + self._inode = _intcast(val) + + @property + def libmagic(self): + return self._libmagic + + @libmagic.setter + def libmagic(self, val): + self._libmagic = _strcast(val) + + @property + def inode_brs(self): + """The byte run(s) that represents the file's metadata object (the inode or the MFT entry). In file systems that do not distinguish between inode and directory entry, e.g. 
FAT, .inode_brs should be equivalent to .name_brs, if both fields are present.""" + return self._inode_brs + + @inode_brs.setter + def inode_brs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._inode_brs = val + + @property + def meta_type(self): + return self._meta_type + + @meta_type.setter + def meta_type(self, val): + self._meta_type = _intcast(val) + + @property + def mode(self): + """The security mode is represented in the FileObject as a base-10 integer. It is also serialized as a decimal integer.""" + return self._mode + + @mode.setter + def mode(self, val): + self._mode = _intcast(val) + + @property + def mtime(self): + return self._mtime + + @mtime.setter + def mtime(self, val): + if val is None: + self._mtime = None + elif isinstance(val, TimestampObject): + self._mtime = val + else: + checked_val = TimestampObject(val, name="mtime") + self._mtime = checked_val + + @property + def name_brs(self): + """The byte run(s) that represents the file's name object (the directory entry). In file systems that do not distinguish between inode and directory entry, e.g. FAT, .inode_brs should be equivalent to .name_brs, if both fields are present.""" + return self._name_brs + + @name_brs.setter + def name_brs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._name_brs = val + + @property + def name_type(self): + return self._name_type + + @name_type.setter + def name_type(self, val): + if val is None: + self._name_type = val + else: + cast_val = _strcast(val) + if cast_val not in ["-", "p", "c", "d", "b", "r", "l", "s", "h", "w", "v"]: + raise ValueError("Unexpected name_type received: %r (casted to %r)." % (val, cast_val)) + self._name_type = cast_val + + @property + def nlink(self): + return self._nlink + + @nlink.setter + def nlink(self, val): + self._nlink = _intcast(val) + + @property + def orphan(self): + return self._orphan + + @orphan.setter + def orphan(self, val): + self._orphan = _boolcast(val) + + @property + def original_fileobject(self): + return self._original_fileobject + + @original_fileobject.setter + def original_fileobject(self, val): + if not val is None: + _typecheck(val, FileObject) + self._original_fileobject = val + + @property + def partition(self): + return self._partition + + @partition.setter + def partition(self, val): + self._partition = _intcast(val) + + @property + def parent_object(self): + """This object is an extremely sparse FileObject, containing just identifying information. Alternately, it can be an entire object reference to the parent Object, though uniqueness should be checked.""" + return self._parent_object + + @parent_object.setter + def parent_object(self, val): + if not val is None: + _typecheck(val, FileObject) + self._parent_object = val + + @property + def seq(self): + return self._seq + + @seq.setter + def seq(self, val): + self._seq = _intcast(val) + + @property + def uid(self): + return self._uid + + @uid.setter + def uid(self, val): + self._uid = _strcast(val) + + @property + def unalloc(self): + """Note that setting .unalloc will affect the value of .alloc, and vice versa. 
The last one to set wins.""" + return self._unalloc + + @unalloc.setter + def unalloc(self, val): + self._unalloc = _boolcast(val) + if not self._unalloc is None: + self._alloc = not self._unalloc + + @property + def unused(self): + return self._used + + @unused.setter + def unused(self, val): + self._unused = _intcast(val) + if not self._unused is None: + self._used = not self._unused + + @property + def used(self): + return self._used + + @used.setter + def used(self, val): + self._used = _intcast(val) + if not self._used is None: + self._unused = not self._used + + @property + def volume_object(self): + """Reference to the containing volume object. Not meant to be propagated with __repr__ or to_Element().""" + return self._volume_object + + @volume_object.setter + def volume_object(self, val): + if not val is None: + _typecheck(val, VolumeObject) + self._volume_object = val + + +class CellObject(object): + + _all_properties = set([ + "alloc", + "annos", + "byte_runs", + "cellpath", + "mtime", + "name", + "name_type", + "original_cellobject", + "parent_object", + "root" + ]) + + _diff_attr_names = { + "new":"delta:new_cell", + "deleted":"delta:deleted_cell", + "changed":"delta:changed_cell", + "modified":"delta:modified_cell", + "matched":"delta:matched" + } + + #TODO There may be need in the future to compare the annotations as well. + _incomparable_properties = set([ + "annos" + ]) + + def __init__(self, *args, **kwargs): + #These properties must be assigned first for sanity check dependencies + self.name_type = kwargs.get("name_type") + + for prop in CellObject._all_properties: + if prop == "annos": + setattr(self, prop, kwargs.get(prop, set())) + else: + setattr(self, prop, kwargs.get(prop)) + + self._diffs = set() + + def __eq__(self, other): + if other is None: + return False + _typecheck(other, CellObject) + for prop in CellObject._all_properties: + if prop in CellObject._incomparable_properties: + continue + if getattr(self, prop) != getattr(other, prop): + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + parts = [] + + for prop in sorted(list(CellObject._all_properties)): + if not getattr(self, prop) is None: + parts.append("%s=%r" % (prop, getattr(self, prop))) + + return "CellObject(" + ", ".join(parts) + ")" + + def compare_to_original(self): + self._diffs = self.compare_to_other(self.original_cellobject, True) + + def compare_to_other(self, other, ignore_original=False): + _typecheck(other, CellObject) + + diffs = set() + + for propname in CellObject._all_properties: + if propname in CellObject._incomparable_properties: + continue + if ignore_original and propname == "original_cellobject": + continue + oval = getattr(other, propname) + sval = getattr(self, propname) + if oval is None and sval is None: + continue + if oval != sval: + #_logger.debug("propname, oval, sval: %r, %r, %r" % (propname, oval, sval)) + diffs.add(propname) + + return diffs + + def populate_from_Element(self, e): + """Populates this CellObject's properties from an ElementTree Element. 
The Element need not be retained.""" + global _warned_elements + _typecheck(e, (ET.Element, ET.ElementTree)) + + _read_differential_annotations(CellObject._diff_attr_names, e, self.annos) + + #Split into namespace and tagname + (ns, tn) = _qsplit(e.tag) + assert tn in ["cellobject", "original_cellobject", "parent_object"] + + if e.attrib.get("root"): + self.root = e.attrib["root"] + + #Look through direct-child elements for other properties + for ce in e.findall("./*"): + (cns, ctn) = _qsplit(ce.tag) + if ctn == "alloc": + self.alloc = ce.text + elif ctn == "byte_runs": + self.byte_runs = ByteRuns() + self.byte_runs.populate_from_Element(ce) + elif ctn == "cellpath": + self.cellpath = ce.text + elif ctn == "mtime": + self.mtime = TimestampObject() + self.mtime.populate_from_Element(ce) + elif ctn == "name": + self.name = ce.text + elif ctn == "name_type": + self.name_type = ce.text + elif ctn == "original_cellobject": + self.original_cellobject = CellObject() + self.original_cellobject.populate_from_Element(ce) + elif ctn == "parent_object": + self.parent_object = CellObject() + self.parent_object.populate_from_Element(ce) + else: + if (cns, ctn) not in _warned_elements: + _warned_elements.add((cns, ctn)) + _logger.warning("Uncertain what to do with this element: %r" % ce) + + self.sanity_check() + + def sanity_check(self): + if self.name_type and self.name_type != "k": + if self.mtime: + _logger.info("Error occurred sanity-checking this CellObject: %r." % self) + raise ValueError("A Registry Key (node) is the only kind of CellObject that can have a timestamp.") + if self.root: + _logger.info("Error occurred sanity-checking this CellObject: %r." % self) + raise ValueError("A Registry Key (node) is the only kind of CellObject that can have the 'root' attribute.") + + def to_Element(self): + self.sanity_check() + + outel = ET.Element("cellobject") + + annos_whittle_set = copy.deepcopy(self.annos) + diffs_whittle_set = copy.deepcopy(self.diffs) + + for annodiff in CellObject._diff_attr_names: + if annodiff in annos_whittle_set: + outel.attrib[CellObject._diff_attr_names[annodiff]] = "1" + annos_whittle_set.remove(annodiff) + if len(annos_whittle_set) > 0: + _logger.warning("Failed to export some differential annotations: %r." % annos_whittle_set) + + def _anno_change(el): + if el.tag in self.diffs: + el.attrib["delta:changed_property"] = "1" + diffs_whittle_set.remove(el.tag) + + #Recall that Element text must be a string + def _append_str(name, value): + if not value is None or name in diffs_whittle_set: + tmpel = ET.Element(name) + if not value is None: + tmpel.text = str(value) + _anno_change(tmpel) + outel.append(tmpel) + + def _append_object(name, value): + if not value is None or name in diffs_whittle_set: + if value is None: + tmpel = ET.Element(name) + else: + tmpel = value.to_Element() + _anno_change(tmpel) + outel.append(tmpel) + + #TODO root should be an element too. Revise schema. + if self.root: + outel.attrib["root"] = str(self.root) + + _append_str("cellpath", self.cellpath) + _append_str("name", self.name) + _append_str("name_type", self.name_type) + _append_str("alloc", self.alloc) + _append_object("mtime", self.mtime) + _append_object("byte_runs", self.byte_runs) + _append_object("original_cellobject", self.original_cellobject) + + if len(diffs_whittle_set) > 0: + _logger.warning("Did not annotate all of the differing properties of this file. Remaining properties: %r." 
% diffs_whittle_set) + + return outel + + def to_regxml(self): + return _ET_tostring(self.to_Element()) + + @property + def alloc(self): + return self._alloc + + @alloc.setter + def alloc(self, val): + self._alloc = _boolcast(val) + + @property + def annos(self): + """Set of differential annotations. Expected members are the keys of this class's _diff_attr_names dictionary.""" + return self._annos + + @annos.setter + def annos(self, val): + _typecheck(val, set) + self._annos = val + + @property + def byte_runs(self): + return self._byte_runs + + @byte_runs.setter + def byte_runs(self, val): + if not val is None: + _typecheck(val, ByteRuns) + self._byte_runs = val + + @property + def cellpath(self): + return self._cellpath + + @cellpath.setter + def cellpath(self, val): + if not val is None: + _typecheck(val, str) + self._cellpath = val + + @property + def diffs(self): + return self._diffs + + @diffs.setter + def diffs(self, value): + _typecheck(value, set) + self._diffs = value + + @property + def mtime(self): + return self._mtime + + @mtime.setter + def mtime(self, val): + if val is None: + self._mtime = None + elif isinstance(val, TimestampObject): + self._mtime = val + else: + self._mtime = TimestampObject(val, name="mtime") + self.sanity_check() + + @property + def name(self): + return self._name + + @name.setter + def name(self, val): + if not val is None: + _typecheck(val, str) + self._name = val + + @property + def name_type(self): + return self._name_type + + @name_type.setter + def name_type(self, val): + if not val is None: + assert val in ["k", "v"] + self._name_type = val + + @property + def original_cellobject(self): + return self._original_cellobject + + @original_cellobject.setter + def original_cellobject(self, val): + if not val is None: + _typecheck(val, CellObject) + self._original_cellobject = val + + @property + def parent_object(self): + """This object is an extremely sparse CellObject, containing just identifying information. Alternately, it can be an entire object reference to the parent Object, though uniqueness should be checked.""" + return self._parent_object + + @parent_object.setter + def parent_object(self, val): + if not val is None: + _typecheck(val, CellObject) + self._parent_object = val + + @property + def root(self): + return self._root + + @root.setter + def root(self, val): + self._root = _boolcast(val) + + +def iterparse(filename, events=("start","end"), dfxmlobject=None): + """ + Generator. Yields a stream of populated DFXMLObjects, VolumeObjects and FileObjects, paired with an event type ("start" or "end"). The DFXMLObject and VolumeObjects do NOT have their child lists populated with this method - that is left to the calling program. + + The event type interface is meant to match the interface of ElementTree's iterparse; this is simply for familiarity's sake. DFXMLObjects and VolumeObjects are yielded with "start" when the stream of VolumeObject or FileObjects begins - that is, they are yielded after being fully constructed up to the potentially-lengthy child object stream. FileObjects are yielded only with "end". + + @param filename: A string + @param events: Events. Optional. A tuple of strings, containing "start" and/or "end". + @param dfxmlobject: A DFXMLObject document. Optional. A DFXMLObject is created and yielded in the object stream if this argument is not supplied. + """ + + #The DFXML stream file handle. 
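# A usage sketch for the iterparse() generator documented above -- not part of
# the patch; "sample_dfxml.xml" is a hypothetical, pre-generated DFXML file (an
# argument not ending in "xml" would instead be handed to fiwalk, per the code below).
import Objects

for (event, obj) in Objects.iterparse("sample_dfxml.xml"):
    if event == "end" and isinstance(obj, Objects.FileObject):
        print(obj.filename, obj.filesize)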
+ fh = None + subp = None + subp_command = ["fiwalk", "-x", filename] + if filename.endswith("xml"): + fh = open(filename, "rb") + else: + subp = subprocess.Popen(subp_command, stdout=subprocess.PIPE) + fh = subp.stdout + + _events = set() + for e in events: + if not e in ("start","end"): + raise ValueError("Unexpected event type: %r. Expecting 'start', 'end'." % e) + _events.add(e) + + dobj = dfxmlobject or DFXMLObject() + + #The only way to efficiently populate VolumeObjects is to populate the object when the stream has hit its first FileObject. + vobj = None + + #It doesn't seem ElementTree allows fetching parents of Elements that are incomplete (just hit the "start" event). So, build a volume Element when we've hit "", glomming all elements until the first fileobject is hit. + #Likewise with the Element for the DFXMLObject. + dfxml_proxy = None + volume_proxy = None + + #State machine, used to track when the first fileobject of a volume is encountered. + READING_START = 0 + READING_PRESTREAM = 1 #DFXML metadata, pre-Object stream + READING_VOLUMES = 2 + READING_FILES = 3 + READING_POSTSTREAM = 4 #DFXML metadata, post-Object stream (typically the element) + _state = READING_START + + for (ETevent, elem) in ET.iterparse(fh, events=("start-ns", "start", "end")): + #View the object event stream in debug mode + #_logger.debug("(event, elem) = (%r, %r)" % (ETevent, elem)) + #if ETevent in ("start", "end"): + # _logger.debug("_ET_tostring(elem) = %r" % _ET_tostring(elem)) + + #Track namespaces + if ETevent == "start-ns": + dobj.add_namespace(*elem) + continue + + #Split tag name into namespace and local name + (ns, ln) = _qsplit(elem.tag) + + if ETevent == "start": + if ln == "dfxml": + if _state != READING_START: + raise ValueError("Encountered a element, but the parser isn't in its start state. Recursive declarations aren't supported at this time.") + dfxml_proxy = ET.Element(elem.tag) + for k in elem.attrib: + #Note that xmlns declarations don't appear in elem.attrib. + dfxml_proxy.attrib[k] = elem.attrib[k] + _state = READING_PRESTREAM + elif ln == "volume": + if _state == READING_PRESTREAM: + #Cut; yield DFXMLObject now. + dobj.populate_from_Element(dfxml_proxy) + if "start" in _events: + yield ("start", dobj) + #Start populating a new Volume proxy. + volume_proxy = ET.Element(elem.tag) + for k in elem.attrib: + volume_proxy.attrib[k] = elem.attrib[k] + _state = READING_VOLUMES + elif ln == "fileobject": + if _state == READING_PRESTREAM: + #Cut; yield DFXMLObject now. + dobj.populate_from_Element(dfxml_proxy) + if "start" in _events: + yield ("start", dobj) + elif _state == READING_VOLUMES: + #_logger.debug("Encountered a fileobject while reading volume properties. Yielding volume now.") + #Cut; yield VolumeObject now. + if volume_proxy is not None: + vobj = VolumeObject() + vobj.populate_from_Element(volume_proxy) + if "start" in _events: + yield ("start", vobj) + #Reset + volume_proxy.clear() + volume_proxy = None + _state = READING_FILES + elif ETevent == "end": + if ln == "fileobject": + if _state in (READING_PRESTREAM, READING_POSTSTREAM): + #This particular branch can be reached if there are trailing fileobject elements after the volume element. This would happen if a tool needed to represent files (likely reassembled fragments) found outside all the partitions. + #More frequently, we hit this point when there are no volume groupings. 
+ vobj = None + fi = FileObject() + fi.populate_from_Element(elem) + fi.volume_object = vobj + #_logger.debug("fi = %r" % fi) + if "end" in _events: + yield ("end", fi) + #Reset + elem.clear() + elif elem.tag == "dfxml": + if "end" in _events: + yield ("end", dobj) + elif elem.tag == "volume": + if "end" in _events: + yield ("end", vobj) + _state = READING_POSTSTREAM + elif _state == READING_VOLUMES: + #This is a volume property; glom onto the proxy. + if volume_proxy is not None: + volume_proxy.append(elem) + elif _state == READING_PRESTREAM: + if ln in ["metadata", "creator", "source"]: + #This is a direct child of the DFXML document property; glom onto the proxy. + if dfxml_proxy is not None: + dfxml_proxy.append(elem) + + #If we called Fiwalk, double-check that it exited successfully. + if not subp is None: + _logger.debug("Calling wait() to let the Fiwalk subprocess terminate...") #Just reading from subp.stdout doesn't let the process terminate; it only finishes working. + subp.wait() + if subp.returncode != 0: + e = subprocess.CalledProcessError("There was an error running Fiwalk.") + e.returncode = subp.returncode + e.cmd = subp_command + raise e + _logger.debug("...Done.") + +def parse(filename): + """Returns a DFXMLObject populated from the contents of the (string) filename argument.""" + retval = None + appender = None + for (event, obj) in iterparse(filename): + if event == "start": + if isinstance(obj, DFXMLObject): + retval = obj + appender = obj + elif isinstance(obj, VolumeObject): + retval.append(obj) + appender = obj + elif event == "end": + if isinstance(obj, VolumeObject): + appender = retval + elif isinstance(obj, FileObject): + appender.append(obj) + return retval + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + + logging.basicConfig(level=logging.DEBUG) + #Run unit tests + + assert _intcast(-1) == -1 + assert _intcast("-1") == -1 + assert _qsplit("{http://www.w3.org/2001/XMLSchema}all") == ("http://www.w3.org/2001/XMLSchema","all") + assert _qsplit("http://www.w3.org/2001/XMLSchema}all") == (None, "http://www.w3.org/2001/XMLSchema}all") + + + fi = FileObject() + + #Check property setting + fi.mtime = "1999-12-31T23:59:59Z" + _logger.debug("fi = %r" % fi) + + #Check bad property setting + failed = None + try: + fi.mtime = "Not a timestamp" + failed = False + except: + failed = True + _logger.debug("fi = %r" % fi) + _logger.debug("failed = %r" % failed) + assert failed + + t0 = TimestampObject(prec="100ns", name="mtime") + _logger.debug("t0 = %r" % t0) + assert t0.prec[0] == 100 + assert t0.prec[1] == "ns" + t1 = TimestampObject("2009-01-23T01:23:45Z", prec="2", name="atime") + _logger.debug("t1 = %r" % t1) + assert t1.prec[0] == 2 + assert t1.prec[1] == "s" + + print("Unit tests passed.") diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_ByteRun.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_ByteRun.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_ByteRun.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_ByteRun.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,15 @@ + +import Objects +import copy + +br1 = Objects.ByteRun() +br1.img_offset = 512 +br1.len = 20 + +br2 = copy.deepcopy(br1) + +assert br1 + br2 is None + +br2.img_offset += br1.len + +assert (br1 + br2).len == 40 diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_ByteRuns.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_ByteRuns.py --- 
tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_ByteRuns.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_ByteRuns.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,49 @@ + +import Objects + +br0 = Objects.ByteRun() +br0.img_offset = 0 +br0.len = 20 + +br1 = Objects.ByteRun() +br1.img_offset = 20 +br1.len = 30 + +br2 = Objects.ByteRun() +br2.img_offset = 50 +br2.len = 20 + + +brs_contiguous = Objects.ByteRuns() +brs_contiguous.append(br0) +brs_contiguous.append(br1) +brs_contiguous.append(br2) + +brs_glommed = Objects.ByteRuns() +brs_glommed.glom(br0) +brs_glommed.glom(br1) +brs_glommed.glom(br2) + +brs_discontig = Objects.ByteRuns() +brs_discontig.glom(br0) +brs_discontig.glom(br2) + +brs_backward = Objects.ByteRuns() +brs_backward.glom(br1) +brs_backward.glom(br0) + +assert len(brs_contiguous) == 3 +assert len(brs_glommed) == 1 +assert len(brs_discontig) == 2 +assert len(brs_backward) == 2 + +assert brs_glommed[0].len == 70 +assert brs_backward[0].len == 30 +assert brs_backward[1].len == 20 + +br_facet_data = Objects.ByteRuns(facet="data") +br_facet_name = Objects.ByteRuns(facet="name") +br_facet_default = Objects.ByteRuns() +assert br_facet_data == br_facet_default +assert br_facet_name != br_facet_data +assert br_facet_name != br_facet_default diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_cat_partitions.sh tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_cat_partitions.sh --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_cat_partitions.sh 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_cat_partitions.sh 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,8 @@ +#!/bin/bash + +. ../_pick_pythons.sh + +"$PYTHON3" ../cat_partitions.py \ + 12345678:../../samples/difference_test_0.xml \ + 87654321:../../samples/difference_test_1.xml \ + | xmllint --format - >$0.dfxml diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_differential_dfxml.sh tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_differential_dfxml.sh --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_differential_dfxml.sh 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_differential_dfxml.sh 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,11 @@ +#!/bin/bash + +source ../_pick_pythons.sh + +"$PYTHON3" ../make_differential_dfxml.py -d ../../samples/difference_test_{0,1}.xml | xmllint --format - > differential_dfxml_test_01.xml + +"$PYTHON3" ../summarize_differential_dfxml.py -d ../differential_dfxml_test_01.xml > differential_dfxml_test_01.txt + +"$PYTHON3" ../make_differential_dfxml.py -d ../../samples/difference_test_{2,3}.xml | xmllint --format - > differential_dfxml_test_23.xml + +"$PYTHON3" ../summarize_differential_dfxml.py -d differential_dfxml_test_23.xml > differential_dfxml_test_23.txt diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_ByteRuns.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_ByteRuns.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_ByteRuns.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_ByteRuns.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,23 @@ +import Objects +import copy +import logging +import os + +logging.basicConfig(level=logging.DEBUG) +_logger = logging.getLogger(os.path.basename(__file__)) + +br = Objects.ByteRun() +br.file_offset = 4128 +br.len = 
133 +brs = Objects.ByteRuns() +brs.append(br) + +cbrs = copy.deepcopy(brs) + +_logger.debug("brs = %r." % brs) +_logger.debug("cbrs = %r." % cbrs) +assert cbrs == brs + +cbrs[0].file_offset += 133 +_logger.debug("cbrs = %r." % cbrs) +assert cbrs != brs diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_CellObject.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_CellObject.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_CellObject.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_CellObject.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,45 @@ + +import Objects +import logging +import os + +import test_diffing_ByteRuns + +logging.basicConfig(level=logging.DEBUG) +_logger = logging.getLogger(os.path.basename(__file__)) + +co = Objects.CellObject() +_logger.debug("co = %r" % co) +_logger.debug("co.to_regxml() = %r" % co.to_regxml()) + +co.root = 1 +co.cellpath = "\\Deleted_root" +co.name = "Deleted_root" +co.name_type = "k" +co.alloc = 1 +co.mtime = "2009-01-23T01:23:45Z" +co.mtime.prec = "100ns" +co.byte_runs = test_diffing_ByteRuns.brs +_logger.debug("co = %r" % co) +_logger.debug("co.to_regxml() = %r" % co.to_regxml()) + +#Make an Element +coe = co.to_Element() + +#Clone +nco = Objects.CellObject() +nco.populate_from_Element(coe) +_logger.debug("nco.to_regxml() = %r" % nco.to_regxml()) +diffs = co.compare_to_other(nco) +_logger.debug("diffs = %r" % diffs) +assert co == nco + +#Modify +nco.name = "(Doubled)" +nco.root = False +nco.original_cellobject = co +nco.compare_to_original() +_logger.debug("nco.to_regxml() = %r" % nco.to_regxml()) + +_logger.debug("nco.diffs = %r" % nco.diffs) +assert nco.diffs == set(["name", "root"]) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_FileObject.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_FileObject.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_FileObject.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_FileObject.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,80 @@ + +import Objects +import logging +import os + +logging.basicConfig(level=logging.DEBUG) +_logger = logging.getLogger(os.path.basename(__file__)) + +f0 = Objects.FileObject() + +fo = Objects.FileObject() +pfo = Objects.FileObject() +pfo.inode = 234 +f0.parent_object = pfo +f0.filename = "test file" +f0.error = "Neither a real file, nor real error" +f0.partition = 2 +f0.id = 235 +f0.name_type = "r" +f0.filesize = 1234 +f0.unalloc = 0 +f0.unused = 0 +f0.orphan = 0 +f0.compressed = 1 +f0.inode = 6543 +f0.libmagic = "data" +f0.meta_type = 8 +f0.mode = 755 +f0.nlink = 1 +f0.uid = "S-1-234-etc" +f0.gid = "S-2-234-etc" +f0.mtime = "1999-12-31T12:34:56Z" +f0.ctime = "1998-12-31T12:34:56Z" +f0.atime = "1997-12-31T12:34:56Z" +f0.crtime = "1996-12-31T12:34:56Z" +f0.seq = 3 +f0.dtime = "1995-12-31T12:34:56Z" +f0.bkup_time = "1994-12-31T12:34:56Z" +f0.link_target = "Nonexistent file" +f0.libmagic = "Some kind of compressed" +f0.sha1 = "7d97e98f8af710c7e7fe703abc8f639e0ee507c4" +f0.md5 = "2b00042f7481c7b056c4b410d28f33cf" +#fo.brs = brs #TODO +_logger.debug("f0 = %r" % f0) +_logger.debug("f0.to_dfxml() = %r" % f0.to_dfxml()) + +e0 = f0.to_Element() +_logger.debug("e0 = %r" % e0) + +#f1 = eval(repr(f0)) #TODO The recursive evals cause namespace confusion (Objects.foo); replace the next two lines when that's settled. 
+f1 = Objects.FileObject() +f1.populate_from_Element(e0) + +f2 = Objects.FileObject() +f2.populate_from_Element(e0) + +#The id property should not be included in the comparisons +f1.id = 111 +f1.alloc = False + +f2.mtime = "2999-12-31T12:34:56Z" +f2.sha1 = "447d306060631570b7713ea48e74103c68eab0a3" +f2.md5 = "b9eb9d6228842aeb05d64f30d56b361e" + +_logger.debug("f1 = %r" % f1) +d01 = f0.compare_to_other(f1) +_logger.debug("d01 = %r" % d01) +assert d01 == set(["alloc"]) or d01 == set(["alloc", "unalloc"]) + +d02 = f0.compare_to_other(f2) + +_logger.debug("d02 = %r" % d02) +assert d02 == set(["mtime", "md5", "sha1"]) + +f2.original_fileobject = f0 +f2.compare_to_original() +_logger.debug("f2.diffs = %r" % f2.diffs) +assert f2.diffs == d02 + +#TODO include byte_runs diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_HiveObject.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_HiveObject.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_HiveObject.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_HiveObject.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,5 @@ + +import Objects + +ho = Objects.HiveObject() +#TODO Not enough properties are encoded for hive diffing to be tested diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_TimestampObject.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_TimestampObject.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_TimestampObject.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_TimestampObject.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,26 @@ + +import Objects +import logging +import os +import copy + +logging.basicConfig(level=logging.DEBUG) +_logger = logging.getLogger(os.path.basename(__file__)) + +t0 = Objects.TimestampObject() +t0.name = "mtime" +t0.prec = "2s" + +t1 = copy.deepcopy(t0) + +assert t0 == t1 + +t0e = t0.to_Element() +t2 = Objects.TimestampObject() +t2.populate_from_Element(t0e) + +assert t0 == t2 + +t2.prec = "100" + +assert t0 != t2 diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_VolumeObject.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_VolumeObject.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_diffing_VolumeObject.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_diffing_VolumeObject.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,41 @@ + +import Objects +import logging +import os + +logging.basicConfig(level=logging.DEBUG) +_logger = logging.getLogger(os.path.basename(__file__)) + +v0 = Objects.VolumeObject() + +v0.sector_size = 512 +v0.block_size = 4096 +v0.partition_offset = 32256 +v0.ftype = -1 +assert v0.ftype == -1 +v0.ftype_str = 1 +v0.block_count = 100000 +v0.allocated_only = False +v0.first_block = 0 +v0.last_block = v0.block_count + +_logger.debug(repr(v0)) +v1 = eval("Objects." 
+ repr(v0)) + +e0 = v0.to_Element() +_logger.debug("e0 = %r" % e0) + +v2 = Objects.VolumeObject() +v2.populate_from_Element(e0) + +v1.block_size = 512 +v2.partition_offset = v0.partition_offset + v0.block_count*v0.block_size + +d01 = v0.compare_to_other(v1) +d02 = v0.compare_to_other(v2) + +_logger.debug("d01 = %r" % d01) +assert d01 == set(["block_size"]) + +_logger.debug("d02 = %r" % d02) +assert d02 == set(["partition_offset"]) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_FileObject_byte_run_facets.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_FileObject_byte_run_facets.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_FileObject_byte_run_facets.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_FileObject_byte_run_facets.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,46 @@ + +import Objects +import logging +import os +import xml.etree.ElementTree as ET + +_logger = logging.getLogger(os.path.basename(__file__)) +logging.basicConfig(level=logging.DEBUG) + +br1 = Objects.ByteRun(img_offset=1, len=1) +br2 = Objects.ByteRun(img_offset=2, len=2) +br3 = Objects.ByteRun(img_offset=4, len=3) + +dbr = Objects.ByteRuns() +ibr = Objects.ByteRuns() +nbr = Objects.ByteRuns() + +dbr.append(br1) +ibr.append(br2) +nbr.append(br3) + +dbr.facet = "data" +ibr.facet = "inode" +nbr.facet = "name" + +f1 = Objects.FileObject() +f1.data_brs = dbr +f1.inode_brs = ibr +f1.name_brs = nbr + +assert f1.data_brs[0].img_offset == 1 +assert f1.inode_brs[0].img_offset == 2 +assert f1.name_brs[0].img_offset == 4 + +e1 = f1.to_Element() +#_logger.debug(f1) +#_logger.debug(ET.tostring(e1)) + +f2 = Objects.FileObject() + +f2.populate_from_Element(e1) +#_logger.debug(f2) + +assert f2.data_brs[0].img_offset == 1 +assert f2.inode_brs[0].img_offset == 2 +assert f2.name_brs[0].img_offset == 4 diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_FileObject_from_stat.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_FileObject_from_stat.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_FileObject_from_stat.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_FileObject_from_stat.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,11 @@ + +import Objects +import logging +import os + +logging.basicConfig(level=logging.DEBUG) +_logger = logging.getLogger(os.path.basename(__file__)) + +f0 = Objects.FileObject() +f0.populate_from_stat(os.stat(__file__)) +_logger.debug("f0.to_dfxml() = %r" % f0.to_dfxml()) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_RegXMLObject.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_RegXMLObject.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_RegXMLObject.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_RegXMLObject.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,11 @@ + +import Objects +import test_diffing_CellObject +import test_diffing_HiveObject + +ro = Objects.RegXMLObject(version="2.0") +ho = Objects.HiveObject() +ho.append(test_diffing_CellObject.co) +ho.append(test_diffing_CellObject.nco) +ro.append(test_diffing_HiveObject.ho) +ro.print_regxml() diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_VolumeObject_hash.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_VolumeObject_hash.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/test_VolumeObject_hash.py 1970-01-01 00:00:00.000000000 
+0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/test_VolumeObject_hash.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import Objects +import os +import logging + +logging.basicConfig(level=logging.DEBUG) +_logger = logging.getLogger(os.path.basename(__file__)) + +s0 = set() + +v0 = Objects.VolumeObject() +v1 = Objects.VolumeObject() + +s0.add(v0) +s0.add(v1) + +_logger.debug("len(s0) = %r" % len(s0)) +assert len(s0) == 2 + +f0 = Objects.FileObject() +f1 = Objects.FileObject() +f0.volume_object = v0 +f1.volume_object = v0 + +s1 = set() +s1.add(f0.volume_object) +s1.add(f1.volume_object) +_logger.debug("len(s1) = %r" % len(s1)) +assert len(s1) == 1 diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_01.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_01.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_01.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_01.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +import sys +import os +import logging +import argparse + +sys.path.append("..") +import Objects +import make_differential_dfxml + +_logger = logging.getLogger(os.path.basename(__file__)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--debug", action="store_true") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + thisdir = os.path.dirname(__file__) + tempxml1_path = __file__ + "-test1.xml" + tempxml2_path = __file__ + "-test2.xml" + + _logger.info("Building iteration: 0.") + d_in_memory = make_differential_dfxml.make_differential_dfxml( + os.path.join(thisdir, "../../samples/difference_test_0.xml"), + os.path.join(thisdir, "../../samples/difference_test_1.xml") + ) + + #Write and read the DFXML stream a couple times to ensure consistent serialization and deserialization + with open(tempxml1_path, "w") as fh: + d_in_memory.print_dfxml(output_fh=fh) + _logger.info("Building iteration: 1.") + d_from_disk = Objects.parse(tempxml1_path) + with open(tempxml2_path, "w") as fh: + d_from_disk.print_dfxml(output_fh=fh) + _logger.info("Building iteration: 2.") + d_from_disk_again = Objects.parse(tempxml2_path) + + for (iteration, d) in enumerate((d_in_memory, d_from_disk, d_from_disk_again)): + _logger.info("Checking iteration: %d." % iteration) + for o in d: + #_logger.debug(repr(o)) + if isinstance(o, Objects.FileObject): + if "deleted" in o.annos: + _name = o.original_fileobject.filename + else: + _name = o.filename + expected_fileobject_diffs = { + ("i_am_new.txt"): set([]), + ("i_will_be_deleted.txt"): set([]), + ("i_will_be_modified.txt"): set(["filesize","mtime","ctime","atime","data_brs","md5","sha1"]), + ("i_will_be_accessed.txt"): set(["atime", "data_brs"]) + } + if o.diffs != expected_fileobject_diffs[_name]: + _logger.info("FAILED: %r." % _name) + _logger.info("Expected diffs: %r;" % expected_fileobject_diffs[_name]) + _logger.info("Received diffs: %r." % o.diffs) + assert False + _logger.info("PASSED: %r." % _name) + + #TODO Once the old idifference.py is retired, remove the Python3-only bit. + if sys.version_info >= (3,1): + import summarize_differential_dfxml + for sortby in "times", "path": + _logger.info("Summarizing, sorting by %s." 
% sortby) + summarize_differential_dfxml.report(d_from_disk_again, sort_by=sortby) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_23.py tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_23.py --- tcpflow-1.4.4+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_23.py 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/python/test_Objects/verify_differential_dfxml_23.py 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import sys +import os +import logging + +sys.path.append("..") +import Objects +import make_differential_dfxml + +_logger = logging.getLogger(os.path.basename(__file__)) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + thisdir = os.path.dirname(__file__) + tempxml1_path = __file__ + "-test1.xml" + tempxml2_path = __file__ + "-test2.xml" + d_in_memory = make_differential_dfxml.make_differential_dfxml( + os.path.join(thisdir, "../../samples/difference_test_2.xml"), + os.path.join(thisdir, "../../samples/difference_test_3.xml") + ) + + #Write and read the DFXML stream a couple times to ensure consistent serialization and deserialization + with open(tempxml1_path, "w") as fh: + d_in_memory.print_dfxml(output_fh=fh) + d_from_disk = Objects.parse(tempxml1_path) + with open(tempxml2_path, "w") as fh: + d_from_disk.print_dfxml(output_fh=fh) + d_from_disk_again = Objects.parse(tempxml2_path) + + for d in (d_in_memory, d_from_disk, d_from_disk_again): + for o in d: + _logger.debug(repr(o)) + if isinstance(o, Objects.VolumeObject): + expected_partition_annos = { + (1048576,"FAT16"): set(["deleted"]), + (1073741824,"FAT32"): set([]), + (2147483648,"FAT32"): set(["deleted"]), + (2147483648,"NTFS"): set(["new"]), + (4294967296,"FAT32"): set(["new"]) + } + if o.annos != expected_partition_annos[(o.partition_offset, o.ftype_str)]: + _logger.info("Partition offset: %r;" % o.partition_offset) + _logger.info("Partition ftype_str: %r;" % o.ftype_str) + _logger.info("Expected: %r;" % expected_partition_annos[o.partition_offset]) + _logger.info("Received: %r." % o.annos) + _logger.info("Diffs: %r." 
% o.diffs) + assert False + else: + #FileObjects + pass #TODO diff -Nru tcpflow-1.4.4+repack1/src/dfxml/samples/difference_test_2.xml tcpflow-1.4.5+repack1/src/dfxml/samples/difference_test_2.xml --- tcpflow-1.4.4+repack1/src/dfxml/samples/difference_test_2.xml 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/samples/difference_test_2.xml 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,171 @@ + + + + vi + 7.3 + + GCC 4.6 + + null + vi diffee_time0.xml + + 512 + + 1048576 + 4096 + FAT16 + + CHANGE___move_from_P1M_to_P3G + 1 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 3 + 0000000000000000000000000000000000000000 + + + CHANGE___move_from_P1M_to_P3G___change_name + 1 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 4 + 1000000000000000000000000000000000000000 + + + CHANGE___move_from_P1M_to_P3G___change_content___change_mtime + 1 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 5 + 2000000000000000000000000000000000000000 + + + + 1073741824 + 4096 + FAT32 + + NO_CHANGE + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 3 + 3000000000000000000000000000000000000000 + + + CHANGE___erased + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 4 + 4000000000000000000000000000000000000000 + + + CHANGE___move_from_P1G_to_P2G + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 5 + 5000000000000000000000000000000000000000 + + + CHANGE___timestamp_changes_format_only + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 6 + 6000000000000000000000000000000000000000 + + + CHANGE___erased___replaced_by_sibling + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 7 + A000000000000000000000000000000000000000 + + + CHANGE___renamed_to_erased_sibling___change_checksum_and_mtime + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 8 + B000000000000000000000000000000000000000 + + + CHANGE___erased___replaced_by_other_partition_file + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 9 + C000000000000000000000000000000000000000 + + + CHANGE___renamed + 2 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 9 + E000000000000000000000000000000000000000 + + + + 2147483648 + 4096 + FAT32 + + CHANGE___content_and_mtime + 3 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 3 + 7000000000000000000000000000000000000000 + + + CHANGE___unallocated + 3 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 4 + 8000000000000000000000000000000000000000 + + + CHANGE___moved_to_erased_P1G_file + 3 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 5 + D000000000000000000000000000000000000000 + + + diff -Nru tcpflow-1.4.4+repack1/src/dfxml/samples/difference_test_3.xml tcpflow-1.4.5+repack1/src/dfxml/samples/difference_test_3.xml --- tcpflow-1.4.4+repack1/src/dfxml/samples/difference_test_3.xml 1970-01-01 00:00:00.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/samples/difference_test_3.xml 2015-08-26 03:35:59.000000000 +0000 @@ -0,0 +1,162 @@ + + + + vi + 7.3 + + GCC 4.6 + + null + vi diffee_time1.xml + + 512 + + + 1073741824 + 4096 + FAT32 + + NO_CHANGE + 1 + 4097 + 1 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 3 + 3000000000000000000000000000000000000000 + + + 
CHANGE___timestamp_changes_format_only + 1 + 4097 + 1 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T05:34:56-0700 + 6 + 6000000000000000000000000000000000000000 + + + CHANGE___erased___replaced_by_sibling + 1 + 4098 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 8 + B100000000000000000000000000000000000000 + + + CHANGE___erased___replaced_by_other_partition_file + 1 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:35:00Z + 10 + D000000000000000000000000000000000000000 + + + _CHANGE___renamed + 1 + 4097 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 9 + E000000000000000000000000000000000000000 + + + + 2147483648 + 4096 + NTFS + + CHANGE___move_from_P1G_to_P2G + 2 + 4097 + 1 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 5 + 5000000000000000000000000000000000000000 + + + CHANGE___content_and_mtime + 2 + 4097 + 1 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:58Z + 3 + 7100000000000000000000000000000000000000 + + + CHANGE___unallocated + 2 + 4097 + 0 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 4 + 8000000000000000000000000000000000000000 + + + CHANGE___new_file + 2 + 4097 + 1 + 2007-08-09T12:34:59Z + 2007-08-09T12:34:59Z + 2007-08-09T12:34:59Z + 6 + 9100000000000000000000000000000000000000 + + + + + 4294967296 + 4096 + FAT32 + + CHANGE___move_from_P1M_to_P3G + 3 + 4097 + 1 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 3 + 0000000000000000000000000000000000000000 + + + CHANGE___move_from_P1M_to_P3G___change_content___change_mtime + 3 + 4097 + 1 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:57Z + 4 + 2100000000000000000000000000000000000000 + + + _CHANGE___move_from_P1M_to_P3G___change_name + 3 + 4097 + 1 + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 2007-08-09T12:34:56Z + 5 + 1000000000000000000000000000000000000000 + + + diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/bootstrap.sh tcpflow-1.4.5+repack1/src/dfxml/src/bootstrap.sh --- tcpflow-1.4.4+repack1/src/dfxml/src/bootstrap.sh 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/bootstrap.sh 2015-08-26 03:35:59.000000000 +0000 @@ -1,5 +1,8 @@ #!/bin/sh # have automake do an initial population iff necessary +# +# this file is public domain +# if [ ! -e config.guess -o ! -e config.sub -o ! -e install-sh -o ! -e missing ]; then autoheader -f touch NEWS README AUTHORS ChangeLog diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/configure.ac tcpflow-1.4.5+repack1/src/dfxml/src/configure.ac --- tcpflow-1.4.4+repack1/src/dfxml/src/configure.ac 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/configure.ac 2015-08-26 03:35:59.000000000 +0000 @@ -4,6 +4,10 @@ # See http://autotoolset.sourceforge.net/tutorial.html # and http://www.openismus.com/documents/linux/automake/automake.shtml +# +# This file is public domain +# + AC_PREREQ(2.57) AC_INIT(DFXML, 0.0.1, bugs@afflib.org) AC_CONFIG_MACRO_DIR(m4) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/COPYING tcpflow-1.4.5+repack1/src/dfxml/src/COPYING --- tcpflow-1.4.4+repack1/src/dfxml/src/COPYING 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/COPYING 2015-08-26 03:35:59.000000000 +0000 @@ -1,4 +1,9 @@ -This software is a work of the Naval Postgraduate School (NPS) and the National Institute of Standards and Technology (NIST) . As a work of the US Government this work is not subject to copyright law. 
Neither NPS nor NIST assumes any responsibility whatsoever for its use by other parties, and makes no guarantees, expressed or implied, about its quality, reliability, or any other characteristic. +This software is a work of the Naval Postgraduate School (NPS) and the +National Institute of Standards and Technology (NIST) . As a work of +the US Government this work is not subject to copyright law. Neither +NPS nor NIST assumes any responsibility whatsoever for its use by +other parties, and makes no guarantees, expressed or implied, about +its quality, reliability, or any other characteristic. Contributions by non-US government entities herein are covered under the LGPL, which is included below. diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/cppmutex.h tcpflow-1.4.5+repack1/src/dfxml/src/cppmutex.h --- tcpflow-1.4.4+repack1/src/dfxml/src/cppmutex.h 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/cppmutex.h 2015-08-26 03:35:59.000000000 +0000 @@ -6,6 +6,11 @@ * Create a cppmutex::lock(M) object to get a lock; delete the object to free it. * * BE SURE THAT HAVE_PTHREAD IS DEFINED BEFORE INCLUDING THIS FILE + * + * Revision History: + * 2012 - Simson L. Garfinkel - Created for bulk_extractor. + * + * This file is public domain */ diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/cpu_info.c tcpflow-1.4.5+repack1/src/dfxml/src/cpu_info.c --- tcpflow-1.4.4+repack1/src/dfxml/src/cpu_info.c 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/cpu_info.c 2015-08-26 03:35:59.000000000 +0000 @@ -1,7 +1,15 @@ +/* + * Revision History: + * 2012 - Simson L. Garfinkel - Created for bulk_extractor. + * + * Test program for cpuid program. + */ + #include #include #include + #define cpuid(id) __asm__( "cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(id), "b"(0), "c"(0), "d"(0)) #define b(val, base, end) ((val << (__WORDSIZE-end-1)) >> (__WORDSIZE-end+base-1)) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/cpu_stat.cpp tcpflow-1.4.5+repack1/src/dfxml/src/cpu_stat.cpp --- tcpflow-1.4.4+repack1/src/dfxml/src/cpu_stat.cpp 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/cpu_stat.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -1,3 +1,8 @@ +/* + * Test program for cpustat instruction. + */ + + /** cpustat.h -- Header for cpustat.cpp. * Copyright (c) 2004 Brad Fish (brad.fish@gmail.com). */ diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_configure.m4 tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_configure.m4 --- tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_configure.m4 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_configure.m4 2015-08-26 03:35:59.000000000 +0000 @@ -2,9 +2,14 @@ # mix-ins for dfxml # Support for hash_t as well. 
# +# This file is public domain +# Revision History: +# 2012 - Simson Garfinkel - Created for bulk_extractor +# AC_MSG_NOTICE([Including dfxml_configure.m4 from dfxml]) -AC_CHECK_HEADERS([afflib/afflib.h err.h expat.h libewf.h pwd.h sys/cdefs.h sys/mman.h sys/resource.h sys/utsname.h unistd.h ]) +AC_MSG_NOTICE([Note: checks for afflib/afflib.h and libewf.h should be in the caller, so they can be disabled]) +AC_CHECK_HEADERS([err.h expat.h pwd.h sys/cdefs.h sys/mman.h sys/resource.h sys/utsname.h unistd.h winsock2.h ]) AC_CHECK_FUNCS([fork localtime_r getuid gethostname getwpuid getrusage mkstemp vasprintf regcomp ]) AC_LANG_PUSH(C++) @@ -39,6 +44,12 @@ AC_DEFINE([DFXML_GNUC_HAS_DIAGNOSTIC_PRAGMA],[1],[GCC supports #pragma GCC diagnostic]), ) +################################################################ +## on Win32, crypto requires zlib +case $host in + *mingw32*) + AC_CHECK_LIB([z], [gzdopen],[LIBS="-lz $LIBS"], [AC_MSG_ERROR([Could not find zlib library])]) +esac ################################################################ ## OpenSSL Support is now required (for hash_t) diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_demo.cpp tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_demo.cpp --- tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_demo.cpp 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_demo.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -11,6 +11,15 @@ #include "config.h" #include "dfxml_reader.h" +/* + * DFXML demo program. + * + * Simson L. Garfinkel + * Created for bulk_extractor. + * This file is public domain. + */ + + void process(dfxml::file_object &fi) { std::cout << "fi.filename: " << fi.filename() << "\n"; diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_reader.cpp tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_reader.cpp --- tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_reader.cpp 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_reader.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -1,3 +1,12 @@ +/* + * DFXML reader in C++ using SAX. + * Revision History: + * 2012 - SImson L. Garfinkel - Developed as test program. + * + * This file is public domain. + */ + + #include #include #include @@ -7,11 +16,15 @@ #include #include #include + + + /* We need netinet/in.h or windowsx.h */ #ifdef HAVE_NETINET_IN_H # include #endif -#ifdef WIN32 + +#ifdef HAVE_WINSOCK2_H # include # include # include diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_reader.h tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_reader.h --- tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_reader.h 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_reader.h 2015-08-26 03:35:59.000000000 +0000 @@ -11,6 +11,12 @@ ** and submit your patch as a pull request on github. **/ +/* + * Revision History: + * 2012 - Simson L. Garfinkel - Developed as test program. + * + * This file is public domain. 
+ */ diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_writer.cpp tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_writer.cpp --- tcpflow-1.4.4+repack1/src/dfxml/src/dfxml_writer.cpp 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/dfxml_writer.cpp 2015-08-26 03:35:59.000000000 +0000 @@ -20,7 +20,7 @@ #include "config.h" -#ifdef WIN32 +#ifdef HAVE_WINSOCK2_H #include #endif @@ -335,7 +335,7 @@ int size = vsnprintf(buf,sizeof(buf),fmt,ap); if(size<0) return size; /* Now allocate the memory */ - *ret = (char *)strdup(buf); + *ret = strcpy((char *) malloc(strlen(buf)+1), buf); return size; } } @@ -652,12 +652,8 @@ #include #endif -#ifdef HAVE_ZMQ_H -#include -#endif - #ifdef HAVE_AFFLIB_AFFLIB_H -#pragma GCC diagnostic ignored "-Wreserved-user-defined-literal" // required for C11 +//#pragma GCC diagnostic ignored "-Wreserved-user-defined-literal" // required for C11 #include #endif @@ -723,15 +719,6 @@ #ifdef SQLITE_VERSION xmlout("library", "", "name=\"sqlite\" version=\"" SQLITE_VERSION "\" source_id=\"" SQLITE_SOURCE_ID "\"",false); #endif -#ifdef HAVE_ZMQ_VERSION - { - int zmq_major, zmq_minor, zmq_patch; - zmq_version (&zmq_major, &zmq_minor, &zmq_patch); - stringstream zmq_ss; - zmq_ss << zmq_major << "." << zmq_minor << "." << zmq_patch; - xmlout("library", "", std::string("name=\"zmq\" version=\"") + zmq_ss.str() + "\"",false); - } -#endif #ifdef HAVE_GNUEXIF // gnuexif does not have a programmatically obtainable version. xmlout("library","","name=\"gnuexif\" version=\"?\"",false); diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/hash_t.h tcpflow-1.4.5+repack1/src/dfxml/src/hash_t.h --- tcpflow-1.4.4+repack1/src/dfxml/src/hash_t.h 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/hash_t.h 2015-08-26 03:35:59.000000000 +0000 @@ -6,7 +6,7 @@ * * Generating a hash: * sha1_t val = sha1_generator::hash_buf(buf,bufsize) - * sha1_1 generator hasher; + * sha1_t generator hasher; * hasher.update(buf,bufsize) * hasher.update(buf,bufsize) * hasher.update(buf,bufsize) @@ -20,9 +20,14 @@ * * This can be updated in the future for Mac so that the hash__ class * is then subclassed by a hash__openssl or a hash__commonCrypto class. + * + * + * Revision History: + * 2012 - Simson L. Garfinkel - Created for bulk_extractor. 
+ * + * This file is public domain */ - #ifndef HASH_T_H #define HASH_T_H @@ -161,6 +166,27 @@ typedef hash__ sha512_t; #endif +template +inline std::string digest_name(); +template<> +inline std::string digest_name() { + return "MD5"; +} +template<> +inline std::string digest_name() { + return "SHA1"; +} +template<> +inline std::string digest_name() { + return "SHA256"; +} +#ifdef HAVE_EVP_SHA512 +template<> +inline std::string digest_name() { + return "SHA512"; +} +#endif + template class hash_generator__ { /* generates the hash */ EVP_MD_CTX mdctx; /* the context for computing the value */ diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/Makefile.am tcpflow-1.4.5+repack1/src/dfxml/src/Makefile.am --- tcpflow-1.4.4+repack1/src/dfxml/src/Makefile.am 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/Makefile.am 2015-08-26 03:35:59.000000000 +0000 @@ -1,3 +1,5 @@ +# This file is in the public domain + bin_PROGRAMS = dfxml_demo iblkfind dfxml_demo_SOURCES = dfxml_demo.cpp dfxml_reader.cpp dfxml_reader.h iblkfind_SOURCES = iblkfind.cpp dfxml_reader.cpp dfxml_reader.h diff -Nru tcpflow-1.4.4+repack1/src/dfxml/src/README tcpflow-1.4.5+repack1/src/dfxml/src/README --- tcpflow-1.4.4+repack1/src/dfxml/src/README 2014-01-10 05:19:24.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/dfxml/src/README 2015-08-26 03:35:59.000000000 +0000 @@ -1 +1,9 @@ -$ git checkout -b tmp ; git checkout master ; git merge tmp ; git branch -d tmp ; git push git@github.com:simsong/dfxml.git master +This file is public domain. + +Sometimes people make changes to this file when it is a sub-module. If +you do so, the following incantation will create a branch called +'tmp', checkout the master, merge the tmp into the master, delete the +tmp branch, and push the results back to the origin: + +git checkout -b tmp ; git checkout master ; git merge tmp ; +git branch -d tmp ; git push git@github.com:simsong/dfxml.git master diff -Nru tcpflow-1.4.4+repack1/src/Makefile.am tcpflow-1.4.5+repack1/src/Makefile.am --- tcpflow-1.4.4+repack1/src/Makefile.am 2014-01-10 05:19:15.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/Makefile.am 2015-08-26 03:34:50.000000000 +0000 @@ -1,7 +1,15 @@ # Programs that we compile: bin_PROGRAMS = tcpflow -AM_CPPFLAGS = -I${top_srcdir}/src/be13_api -I${top_srcdir}/src/wifipcap +if WIFI_ENABLED +WIFI_INCS = -I${top_srcdir}/src/wifipcap +else +WIFI_INCS = +endif + +AM_CPPFLAGS = -I${top_srcdir}/src/be13_api $(WIFI_INCS) + +CONFIG_CLEAN_FILES = config.h # old location of config.h include dfxml/src/Makefile.defs include be13_api/Makefile.defs @@ -35,6 +43,7 @@ WIFI = datalink_wifi.cpp \ datalink_wifi.h \ + scan_wifiviz.cpp \ wifipcap/TimeVal.cpp \ wifipcap/TimeVal.h \ wifipcap/arp.h \ @@ -58,11 +67,16 @@ wifipcap/udp.h \ wifipcap/util.h \ wifipcap/wifipcap.cpp \ - wifipcap/wifipcap.h + wifipcap/wifipcap.h +if WIFI_ENABLED +WIFI_FILES = $(WIFI) +else +WIFI_FILES = +endif tcpflow_SOURCES = \ - $(DFXML_WRITER) $(NETVIZ) $(BE13_API) $(WIFI) \ + $(DFXML_WRITER) $(NETVIZ) $(BE13_API) $(WIFI_FILES) \ datalink.cpp flow.cpp \ tcpflow.cpp \ tcpip.h tcpip.cpp \ @@ -72,7 +86,6 @@ scan_http.cpp \ scan_tcpdemux.cpp \ scan_netviz.cpp \ - scan_wifiviz.cpp \ pcap_writer.h \ iptree.h \ http-parser/http_parser.c \ diff -Nru tcpflow-1.4.4+repack1/src/netviz/one_page_report.cpp tcpflow-1.4.5+repack1/src/netviz/one_page_report.cpp --- tcpflow-1.4.4+repack1/src/netviz/one_page_report.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/netviz/one_page_report.cpp 2015-08-26 03:34:50.000000000 
+0000 @@ -351,10 +351,12 @@ //// date range time_t tstart = report.earliest.tv_sec; struct tm start; + memset(&start,0,sizeof(start)); localtime_r(&tstart,&start); time_t tstop = report.latest.tv_sec; struct tm stop; + memset(&stop,0,sizeof(stop)); localtime_r(&tstop,&stop); formatted = ssprintf("Date range: %04d-%02d-%02d %02d:%02d:%02d -- %04d-%02d-%02d %02d:%02d:%02d", 1900 + start.tm_year, 1 + start.tm_mon, start.tm_mday, diff -Nru tcpflow-1.4.4+repack1/src/netviz/plot_view.h tcpflow-1.4.5+repack1/src/netviz/plot_view.h --- tcpflow-1.4.4+repack1/src/netviz/plot_view.h 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/netviz/plot_view.h 2015-08-26 03:34:50.000000000 +0000 @@ -27,6 +27,11 @@ #include #include +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + + class plot_view { public: plot_view() : diff -Nru tcpflow-1.4.4+repack1/src/scan_netviz.cpp tcpflow-1.4.5+repack1/src/scan_netviz.cpp --- tcpflow-1.4.4+repack1/src/scan_netviz.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/scan_netviz.cpp 2015-08-26 03:34:50.000000000 +0000 @@ -32,7 +32,9 @@ #endif +#ifdef HAVE_LIBCAIRO static int histogram_dump = 0; +#endif extern "C" void scan_netviz(const class scanner_params &sp,const recursion_control_block &rcb) @@ -46,7 +48,7 @@ if(sp.phase==scanner_params::PHASE_STARTUP){ sp.info->name = "netviz"; - sp.info->flags = scanner_info::SCANNER_DISABLED; + sp.info->flags = scanner_info::SCANNER_DISABLED; // disabled by default sp.info->author= "Mike Shick"; sp.info->packet_user = 0; #ifdef HAVE_LIBCAIRO diff -Nru tcpflow-1.4.4+repack1/src/tcpdemux.cpp tcpflow-1.4.5+repack1/src/tcpdemux.cpp --- tcpflow-1.4.4+repack1/src/tcpdemux.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/tcpdemux.cpp 2015-08-26 03:34:50.000000000 +0000 @@ -125,7 +125,8 @@ int tcpdemux::retrying_open(const std::string &filename,int oflag,int mask) { while(true){ - if(open_flows.size() >= max_fds) close_oldest_fd(); + //Packet index file reduces max_fds by 1/2 as the index files also take a fd + if(open_flows.size() >= (opt.output_packet_index ? max_fds/2 : max_fds)) close_oldest_fd(); int fd = ::open(filename.c_str(),oflag,mask); DEBUG(2)("retrying_open ::open(fn=%s,oflag=x%x,mask:x%x)=%d",filename.c_str(),oflag,mask,fd); if(fd>=0){ @@ -193,6 +194,10 @@ * Flows are post-processed when a FIN is received and all bytes are received. * If a FIN is received and bytes are outstanding, they are post-processed when the last byte is received. * When the program shut down, all open flows are post-processed. + * + * Amended to trigger the packet/data location index sort as part of the post-processing. This sorts + * the (potentially out of order) index to make it simple for external applications. No processing is + * done if the (-I) index generation feature is turned off. --GDD */ void tcpdemux::post_process(tcpip *tcp) @@ -377,6 +382,7 @@ bool syn_set = FLAG_SET(tcp_header->th_flags, TH_SYN); bool ack_set = FLAG_SET(tcp_header->th_flags, TH_ACK); bool fin_set = FLAG_SET(tcp_header->th_flags, TH_FIN); + bool rst_set = FLAG_SET(tcp_header->th_flags, TH_RST); /* calculate the total length of the TCP header including options */ u_int tcp_header_len = tcp_header->th_off * 4; @@ -393,8 +399,8 @@ int32_t delta = 0; // from current position in tcp connection; must be SIGNED 32 bit! 
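The halved limit in the retrying_open() change above reflects that, with packet indexing enabled, every open flow holds two descriptors: the flow file and its .findx index. A rough sketch of that bookkeeping with hypothetical numbers (the names here are illustrative, not the tcpflow API):

    # Hypothetical illustration of the descriptor budget described above:
    # with -I, every open flow also keeps a .findx file open, so the
    # close-the-oldest threshold is cut in half.
    max_fds = 1024
    output_packet_index = True   # -I given on the command line

    budget = max_fds // 2 if output_packet_index else max_fds

    def must_close_oldest(open_flow_count):
        return open_flow_count >= budget

    assert must_close_oldest(512)
    assert not must_close_oldest(511)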
tcpip *tcp = find_tcpip(this_flow); - DEBUG(60)("%s%s%s tcp_header_len=%d tcp_datalen=%d seq=%u tcp=%p", - (syn_set?"SYN ":""),(ack_set?"ACK ":""),(fin_set?"FIN ":""),(int)tcp_header_len,(int)tcp_datalen,(int)seq,tcp); + DEBUG(60)("%s%s%s%s tcp_header_len=%d tcp_datalen=%d seq=%u tcp=%p", + (syn_set?"SYN ":""),(ack_set?"ACK ":""),(fin_set?"FIN ":""),(rst_set?"RST ":""),(int)tcp_header_len,(int)tcp_datalen,(int)seq,tcp); /* If this_flow is not in the database and the start_new_connections flag is false, just return */ if(tcp==0 && start_new_connections==false) return 0; @@ -407,6 +413,7 @@ if(tcp==0){ if(tcp_datalen==0){ // zero length packet if(fin_set) return 0; // FIN on a connection that's unknown; safe to ignore + if(rst_set) return 0; // RST on a connection that's unknown; safe to ignore if(syn_set==false && ack_set==false) return 0; // neither a SYN nor ACK; return } else { /* Data present on a flow that is not actively being demultiplexed. @@ -527,11 +534,16 @@ tcp->print_packet(tcp_data, tcp_datalen); } else { if (opt.store_output){ - tcp->store_packet(tcp_data, tcp_datalen, delta); + tcp->store_packet(tcp_data, tcp_datalen, delta,pi.ts); } } } + if (rst_set){ + remove_flow(this_flow); // take it out of the map + return 0; + } + /* Count the FINs. * If this is a fin, determine the size of the stream */ diff -Nru tcpflow-1.4.4+repack1/src/tcpdemux.h tcpflow-1.4.5+repack1/src/tcpdemux.h --- tcpflow-1.4.4+repack1/src/tcpdemux.h 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/tcpdemux.h 2015-08-26 03:35:47.000000000 +0000 @@ -88,13 +88,16 @@ class options { public:; enum { MAX_SEEK=1024*1024*16 }; - options():console_output(false),store_output(true),opt_md5(false), + options():console_output(false),console_output_nonewline(false), + store_output(true),opt_md5(false), post_processing(false),gzip_decompress(true), max_bytes_per_flow(), max_flows(0),suppress_header(0), - output_strip_nonprint(true),output_hex(false),use_color(0),max_seek(MAX_SEEK){ + output_strip_nonprint(true),output_hex(false),use_color(0), + output_packet_index(false),max_seek(MAX_SEEK) { } bool console_output; + bool console_output_nonewline; bool store_output; // do we output? bool opt_md5; // do we calculate MD5 on DFXML output? bool post_processing; // decode headers after tcp connection closes @@ -105,6 +108,8 @@ bool output_strip_nonprint; bool output_hex; bool use_color; + bool output_packet_index; // Generate a packet index file giving the timestamp and location + // bytes written to the flow file. 
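Each record that the new packet-index code appends to a flow's .findx file (see the store_packet() changes in tcpip.cpp below) is one pipe-separated line: the byte offset within the flow file, the packet timestamp as seconds.microseconds with the microseconds zero-padded to six digits, and the number of bytes written; the file is re-sorted by offset when the flow is closed. A small reader sketch, assuming that layout (the file name and helper names are illustrative):

    from decimal import Decimal

    def read_findx(path):
        """Parse 'offset|sec.usec|length' records from a .findx index file."""
        records = []
        with open(path) as fh:
            for line in fh:
                offset, ts, length = line.rstrip("\n").split("|")
                records.append((int(offset), Decimal(ts), int(length)))
        return records

    def time_of_byte(records, byte_offset):
        """Capture time of the packet that wrote the byte at byte_offset."""
        for offset, ts, length in records:
            if offset <= byte_offset < offset + length:
                return ts
        return None

    # records = read_findx("010.000.000.001.39409-208.111.153.175.00080.findx")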
int32_t max_seek; // signed becuase we compare with abs() }; diff -Nru tcpflow-1.4.4+repack1/src/tcpflow.cpp tcpflow-1.4.5+repack1/src/tcpflow.cpp --- tcpflow-1.4.4+repack1/src/tcpflow.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/tcpflow.cpp 2015-08-26 03:35:28.000000000 +0000 @@ -25,9 +25,6 @@ #include #include - - - /* bring in inet_ntop if it is not present */ #define ETH_ALEN 6 #ifndef HAVE_INET_NTOP @@ -72,7 +69,9 @@ scan_http, scan_netviz, scan_tcpdemux, +#ifdef USE_WIFI scan_wifiviz, +#endif 0}; bool opt_no_promisc = false; // true if we should not use promiscious mode @@ -81,84 +80,80 @@ *** USAGE ****************************************************************/ -static int usage_count = 0; -static void usage() +static void usage(int level) { - switch(++usage_count){ - case 1: - std::cout << PACKAGE_NAME << " version " << PACKAGE_VERSION << "\n\n"; - std::cout << "usage: " << progname << " [-aBcCDhJpsvVZ] [-b max_bytes] [-d debug_level] \n"; - std::cout << " [-[eE] scanner] [-f max_fds] [-F[ctTXMkmg]] [-i iface] [-L semlock]\n"; - std::cout << " [-m min_bytes] [-o outdir] [-r file] [-R file] \n"; - std::cout << " [-S name=value] [-T template] [-w file] [-x scanner] [-X xmlfile]\n"; - std::cout << " [expression]\n\n"; - std::cout << " -a: do ALL post-processing.\n"; - std::cout << " -b max_bytes: max number of bytes per flow to save\n"; - std::cout << " -d debug_level: debug level; default is " << DEFAULT_DEBUG_LEVEL << "\n"; - std::cout << " -f: maximum number of file descriptors to use\n"; - std::cout << " -h: print this help message (-hh for more help)\n"; - std::cout << " -H: print detailed information about each scanner\n"; - std::cout << " -i: network interface on which to listen\n"; - std::cout << " -J: output each flow in alternating colors (note change!)\n"; - std::cout << " -l: treat non-flag arguments as input files rather than a pcap expression\n"; - std::cout << " -L semlock - specifies that writes are locked using a named semaphore\n"; - std::cout << " -p: don't use promiscuous mode\n"; - std::cout << " -q: quiet mode - do not print warnings\n"; - std::cout << " -r file: read packets from tcpdump pcap file (may be repeated)\n"; - std::cout << " -R file: read packets from tcpdump pcap file TO FINISH CONNECTIONS\n"; - std::cout << " -v: verbose operation equivalent to -d 10\n"; - std::cout << " -V: print version number and exit\n"; - std::cout << " -w file: write packets not processed to file\n"; - std::cout << " -o outdir : specify output directory (default '.')\n"; - std::cout << " -X filename : DFXML output to filename\n"; - std::cout << " -m bytes : specifies skip that starts a new stream (default " - << (unsigned)tcpdemux::options::MAX_SEEK << ").\n"; - std::cout << " -F{p} : filename prefix/suffix (-hh for options)\n"; - std::cout << " -T{t} : filename template (-hh for options; default " - << flow::filename_template << ")\n"; - std::cout << " -Z: do not decompress gzip-compressed HTTP transactions\n"; - - std::cout << "\nControl of Scanners:\n"; - std::cout << " -E scanner - turn off all scanners except scanner\n"; - std::cout << " -S name=value Set a configuration parameter (-hh for info)\n"; - be13::plugin::info_scanners(false,true,scanners_builtin,'e','x'); - - std::cout << "Console output options:\n"; - std::cout << " -B: binary output, even with -c or -C (normally -c or -C turn it off)\n"; - std::cout << " -c: console print only (don't create files)\n"; - std::cout << " -C: console print only, but without the display of source/dest 
header\n"; - std::cout << " -s: strip non-printable characters (change to '.')\n"; - std::cout << " -D: output in hex (useful to combine with -c or -C)\n"; - std::cout << "\n"; + std::cout << PACKAGE_NAME << " version " << PACKAGE_VERSION << "\n\n"; + std::cout << "usage: " << progname << " [-aBcCDhJpsvVZ] [-b max_bytes] [-d debug_level] \n"; + std::cout << " [-[eE] scanner] [-f max_fds] [-F[ctTXMkmg]] [-i iface] [-L semlock]\n"; + std::cout << " [-m min_bytes] [-o outdir] [-r file] [-R file] \n"; + std::cout << " [-S name=value] [-T template] [-w file] [-x scanner] [-X xmlfile]\n"; + std::cout << " [expression]\n\n"; + std::cout << " -a: do ALL post-processing.\n"; + std::cout << " -b max_bytes: max number of bytes per flow to save\n"; + std::cout << " -d debug_level: debug level; default is " << DEFAULT_DEBUG_LEVEL << "\n"; + std::cout << " -f: maximum number of file descriptors to use\n"; + std::cout << " -h: print this help message (-hh for more help)\n"; + std::cout << " -H: print detailed information about each scanner\n"; + std::cout << " -i: network interface on which to listen\n"; + std::cout << " -I: generate temporal packet-> byte index files for each flow (.findex)\n"; + std::cout << " -g: output each flow in alternating colors (note change!)\n"; + std::cout << " -l: treat non-flag arguments as input files rather than a pcap expression\n"; + std::cout << " -L semlock - specifies that writes are locked using a named semaphore\n"; + std::cout << " -p: don't use promiscuous mode\n"; + std::cout << " -q: quiet mode - do not print warnings\n"; + std::cout << " -r file: read packets from tcpdump pcap file (may be repeated)\n"; + std::cout << " -R file: read packets from tcpdump pcap file TO FINISH CONNECTIONS\n"; + std::cout << " -v: verbose operation equivalent to -d 10\n"; + std::cout << " -V: print version number and exit\n"; + std::cout << " -w file: write packets not processed to file\n"; + std::cout << " -o outdir : specify output directory (default '.')\n"; + std::cout << " -X filename : DFXML output to filename\n"; + std::cout << " -m bytes : specifies skip that starts a new stream (default " + << (unsigned)tcpdemux::options::MAX_SEEK << ").\n"; + std::cout << " -F{p} : filename prefix/suffix (-hh for options)\n"; + std::cout << " -T{t} : filename template (-hh for options; default " + << flow::filename_template << ")\n"; + std::cout << " -Z: do not decompress gzip-compressed HTTP transactions\n"; + + std::cout << "\nControl of Scanners:\n"; + std::cout << " -E scanner - turn off all scanners except scanner\n"; + std::cout << " -S name=value Set a configuration parameter (-hh for info)\n"; + be13::plugin::info_scanners(false,true,scanners_builtin,'e','x'); + + std::cout << "Console output options:\n"; + std::cout << " -B: binary output, even with -c or -C (normally -c or -C turn it off)\n"; + std::cout << " -c: console print only (don't create files)\n"; + std::cout << " -C: console print only, but without the display of source/dest header\n"; + std::cout << " -0: don't print newlines after packets when printing to console"; + std::cout << " -s: strip non-printable characters (change to '.')\n"; + std::cout << " -D: output in hex (useful to combine with -c or -C)\n"; + std::cout << "\n"; #ifndef HAVE_LIBCAIRO - std::cout << "Rendering not available because Cairo was not installed.\n\n"; + std::cout << "Rendering not available because Cairo was not installed.\n\n"; #endif - std::cout << "expression: tcpdump-like filtering expression\n"; - std::cout << "\nSee the man page for 
additional information.\n\n"; - break; - case 2: - std::cout << "Filename Prefixes:\n"; - std::cout << " -Fc : append the connection counter to ALL filenames\n"; - std::cout << " -Ft : prepend the time_t timestamp to ALL filenames\n"; - std::cout << " -FT : prepend the ISO8601 timestamp to ALL filenames\n"; - std::cout << " -FX : Do not output any files (other than report files)\n"; - std::cout << " -FM : Calculate the MD5 for every flow (stores in DFXML)\n"; - std::cout << " -Fk : Bin output in 1K directories\n"; - std::cout << " -Fm : Bin output in 1M directories (2 levels)\n"; - std::cout << " -Fg : Bin output in 1G directories (3 levels)\n"; - flow::usage(); - std::cout << "-S name=value options:\n"; - for(int i=0;defaults[i].name;i++){ - std::stringstream ss; - ss << defaults[i].name << "=" << defaults[i].dvalue; - printf(" %-20s %s\n",ss.str().c_str(),defaults[i].help); - } - std::cout << "\n"; - std::cout << "DEBUG Levels (specify with -dNN):\n"; - std::cout << "get_max_fds() = " << tcpdemux::getInstance()->get_max_fds() << "\n"; - std::cout << "NUM_RESERVED_FDS = " << NUM_RESERVED_FDS << "\n"; - break; - } + std::cout << "expression: tcpdump-like filtering expression\n"; + std::cout << "\nSee the man page for additional information.\n\n"; + if(level<2) return; + std::cout << "Filename Prefixes:\n"; + std::cout << " -Fc : append the connection counter to ALL filenames\n"; + std::cout << " -Ft : prepend the time_t timestamp to ALL filenames\n"; + std::cout << " -FT : prepend the ISO8601 timestamp to ALL filenames\n"; + std::cout << " -FX : Do not output any files (other than report files)\n"; + std::cout << " -FM : Calculate the MD5 for every flow (stores in DFXML)\n"; + std::cout << " -Fk : Bin output in 1K directories\n"; + std::cout << " -Fm : Bin output in 1M directories (2 levels)\n"; + std::cout << " -Fg : Bin output in 1G directories (3 levels)\n"; + flow::usage(); + std::cout << "-S name=value options:\n"; + for(int i=0;defaults[i].name;i++){ + std::stringstream ss; + ss << defaults[i].name << "=" << defaults[i].dvalue; + printf(" %-20s %s\n",ss.str().c_str(),defaults[i].help); + } + std::cout << "\n"; + std::cout << "DEBUG Levels (specify with -dNN):\n"; + std::cout << "get_max_fds() = " << tcpdemux::getInstance()->get_max_fds() << "\n"; + std::cout << "NUM_RESERVED_FDS = " << NUM_RESERVED_FDS << "\n"; } /** @@ -283,7 +278,9 @@ * May be repeated. * If start is false, do not initiate new connections */ +#ifdef HAVE_INFLATER static inflaters_t *inflaters = 0; +#endif static void process_infile(const std::string &expression,const char *device,const std::string &infile) { char error[PCAP_ERRBUF_SIZE]; @@ -291,7 +288,9 @@ int dlt=0; pcap_handler handler; +#ifdef HAVE_INFLATER if(inflaters==0) inflaters = build_inflaters(); +#endif if (infile!=""){ std::string file_path = infile; @@ -373,11 +372,32 @@ } +/* be_hash. Currently this just returns the MD5 of the sbuf, + * but eventually it will allow the use of different hashes. 
+ */ +static std::string be_hash_name("md5"); +static std::string be_hash_func(const uint8_t *buf,size_t bufsize) +{ + if(be_hash_name=="md5" || be_hash_name=="MD5"){ + return md5_generator::hash_buf(buf,bufsize).hexdigest(); + } + if(be_hash_name=="sha1" || be_hash_name=="SHA1" || be_hash_name=="sha-1" || be_hash_name=="SHA-1"){ + return sha1_generator::hash_buf(buf,bufsize).hexdigest(); + } + if(be_hash_name=="sha256" || be_hash_name=="SHA256" || be_hash_name=="sha-256" || be_hash_name=="SHA-256"){ + return sha256_generator::hash_buf(buf,bufsize).hexdigest(); + } + std::cerr << "Invalid hash name: " << be_hash_name << "\n"; + std::cerr << "This version of bulk_extractor only supports MD5, SHA1, and SHA256\n"; + exit(1); +} +static feature_recorder_set::hash_def be_hash(be_hash_name,be_hash_func); int main(int argc, char *argv[]) { - bool didhelp = false; + int opt_help = 0; + int opt_Help = 0; feature_recorder::set_main_threadid(); sbuf_t::set_map_file_delimiter(""); // no delimiter on carving #ifdef BROKEN @@ -387,8 +407,9 @@ std::cerr << "\n"; #endif + bool opt_enable_report = true; bool force_binary_output = false; - const char *device = ""; + const char *device = 0; // default device const char *lockname = 0; int need_usage = 0; std::string reportfilename; @@ -414,7 +435,7 @@ bool trailing_input_list = false; int arg; - while ((arg = getopt(argc, argv, "aA:Bb:cCd:DE:e:E:F:f:Hhi:JlL:m:o:pqR:r:S:sT:Vvw:x:X:Z")) != EOF) { + while ((arg = getopt(argc, argv, "aA:Bb:cCd:DE:e:E:F:f:gHhIi:lL:m:o:pqR:r:S:sT:Vvw:x:X:Z:0")) != EOF) { switch (arg) { case 'a': demux.opt.post_processing = true; @@ -444,6 +465,9 @@ case 'c': demux.opt.console_output = true; DEBUG(10) ("printing packets to console only"); break; + case '0': + demux.opt.console_output_nonewline = true; + break; case 'd': if ((debug = atoi(optarg)) < 0) { debug = DEFAULT_DEBUG_LEVEL; @@ -487,7 +511,11 @@ break; } case 'i': device = optarg; break; - case 'J': + case 'I': + DEBUG(10) ("creating packet index files"); + demux.opt.output_packet_index = true; + break; + case 'g': demux.opt.use_color = 1; DEBUG(10) ("using colors"); break; @@ -530,33 +558,44 @@ case 'x': be13::plugin::scanners_disable(optarg);break; case 'X': reportfilename = optarg;break; case 'Z': demux.opt.gzip_decompress = 0; break; - case 'H': - be13::plugin::info_scanners(true,true,scanners_builtin,'e','x'); - didhelp = true; - break; - case 'h': case '?': - usage(); - didhelp = true; - break; + case 'H': opt_Help += 1; break; + case 'h': opt_help += 1; break; default: DEBUG(1) ("error: unrecognized switch '%c'", arg); - need_usage = 1; + opt_help += 1; break; } } - if(didhelp) exit(0); + + argc -= optind; + argv += optind; + + + /* Load all the scanners and enable the ones we care about */ + scanner_info si; + si.config = &be_config; + + si.get_config("enable_report",&opt_enable_report,"Enable report.xml"); + be13::plugin::load_scanners(scanners_builtin,be_config); + + if(opt_Help){ + be13::plugin::info_scanners(true,true,scanners_builtin,'e','x'); + exit(0); + } + + if(opt_help) { + usage(opt_help); + exit(0); + } + + if(demux.opt.post_processing && !demux.opt.store_output){ std::cerr << "ERROR: post_processing currently requires storing output.\n"; exit(1); } - argc -= optind; - argv += optind; - - /* Load all the scanners and enable the ones we care about */ if(demux.opt.opt_md5) be13::plugin::scanners_enable("md5"); - be13::plugin::load_scanners(scanners_builtin,be_config); be13::plugin::scanners_process_enable_disable_commands(); /* If there is no report 
filename, call it report.xml in the output directory */ @@ -564,12 +603,6 @@ reportfilename = demux.outdir + "/" + DEFAULT_REPORT_FILENAME; } - /* print help and exit if there was an error in the arguments */ - if (need_usage) { - usage(); - exit(1); - } - /* remaining arguments are either an input list (-l flag) or a pcap expression (default) */ std::string expression = ""; if(trailing_input_list) { @@ -621,7 +654,7 @@ } /* report file specified? */ - if(reportfilename.size()>0){ + if(reportfilename.size()>0 && opt_enable_report){ xreport = new dfxml_writer(reportfilename,false); dfxml_create(*xreport,command_line); demux.xreport = xreport; @@ -635,8 +668,6 @@ demux.save_unk_packets(opt_unk_packets,input_fname); } - scanner_info si; - si.config = &be_config; /* Debug prefix set? */ std::string debug_prefix=progname; @@ -645,11 +676,14 @@ DEBUG(10) ("%s version %s ", PACKAGE_NAME, PACKAGE_VERSION); + const char *name = device; + if(input_fname.size()>0) name=input_fname.c_str(); + if(name==0) name=""; + feature_file_names_t feature_file_names; be13::plugin::get_scanner_feature_file_names(feature_file_names); - feature_recorder_set fs(0); - - fs.init(feature_file_names,input_fname.size()>0 ? input_fname : device,demux.outdir); + feature_recorder_set fs(feature_recorder_set::NO_ALERT,be_hash,name,demux.outdir); + fs.init(feature_file_names); the_fs = &fs; demux.fs = &fs; diff -Nru tcpflow-1.4.4+repack1/src/tcpflow.h tcpflow-1.4.5+repack1/src/tcpflow.h --- tcpflow-1.4.4+repack1/src/tcpflow.h 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/tcpflow.h 2015-08-26 03:34:50.000000000 +0000 @@ -34,7 +34,7 @@ *** If we are compiling for Windows, including the Windows-specific *** include files first and disable pthread support. ***/ -#ifdef WIN32 +#if (defined(WIN32) || defined(__MINGW32__)) # undef HAVE_PTHREAD_H # undef HAVE_SEMAPHORE_H # undef HAVE_PTHREAD @@ -233,6 +233,10 @@ #undef s6_addr32 #define s6_addr32 __u6_addr.__u6_addr32 +#ifdef __MINGW32__ +typedef uint16_t in_port_t; +typedef unsigned char u_int8_t; +#endif /**************************** Constants ***********************************/ @@ -259,6 +263,11 @@ const char *inet_ntop(int af, const void *src,char *dst, socklen_t size); #endif +#if defined(__MINGW32__) +// has this prototype for ws2_32 dll, but has type-conflicts with winsock2.h +WINSOCK_API_LINKAGE LPCWSTR WSAAPI inet_ntop(INT Family, PVOID pAddr, LPWSTR pStringBuf, size_t StringBufSIze); +#endif + #ifdef HAVE_PTHREAD #include extern sem_t *semlock; diff -Nru tcpflow-1.4.4+repack1/src/tcpip.cpp tcpflow-1.4.5+repack1/src/tcpip.cpp --- tcpflow-1.4.4+repack1/src/tcpip.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/tcpip.cpp 2015-08-26 03:35:47.000000000 +0000 @@ -2,6 +2,11 @@ * This file is part of tcpflow by Simson Garfinkel, * originally by Jeremy Elson * + * Modified by Greg Drew to add support for creating a packet time / data index + * which allows mapping bytes in the flow back to their relative arrival time. + * This is very useful in reassembling inherently bidirectional conversations + * such as chat or telnet sessions. --GDD + * * This source code is under the GNU Public License (GPL). See * LICENSE for details. 
* @@ -13,6 +18,8 @@ #include #include +#include +#include #pragma GCC diagnostic ignored "-Weffc++" #pragma GCC diagnostic ignored "-Wshadow" @@ -29,6 +36,7 @@ demux(demux_),myflow(flow_),dir(unknown),isn(isn_),nsn(0), syn_count(0),fin_count(0),fin_size(0),pos(0), flow_pathname(),fd(-1),file_created(false), + flow_index_pathname(),idx_file(), seen(new recon_set()), last_byte(), last_packet_number(),out_of_order_count(0),violations(0) @@ -144,6 +152,11 @@ fd = -1; } demux.open_flows.erase(this); // we are no longer open + // Also close the flow_index file, if flow indexing is in use --GDD + if(demux.opt.output_packet_index && idx_file.is_open()){ + idx_file.close(); + } + //std::cerr << "close_file1 " << *this << "\n"; } /* @@ -154,12 +167,14 @@ int tcpip::open_file() { + int create_idx_needed = false; if(fd<0){ //std::cerr << "open_file0 " << ct << " " << *this << "\n"; /* If we don't have a filename, create the flow */ if(flow_pathname.size()==0) { - flow_pathname = myflow.new_filename(&fd,O_RDWR|O_BINARY|O_CREAT,0666); + flow_pathname = myflow.new_filename(&fd,O_RDWR|O_BINARY|O_CREAT|O_EXCL,0666); file_created = true; // remember we made it + create_idx_needed = true; // We created a new stream, so we need to create a new flow file. --GDD DEBUG(5) ("%s: created new file",flow_pathname.c_str()); } else { /* open an existing flow */ @@ -181,6 +196,30 @@ if(demux.open_flows.size() > demux.max_open_flows) demux.max_open_flows = demux.open_flows.size(); //std::cerr << "open_file1 " << *this << "\n"; } + if(demux.opt.output_packet_index){ + //Open the file for the flow index. We don't do this if the flow file could not be + // opened. The file must be opened for append, in case this is a reopen. The filename + // standard is the flow name followed by ".findx", which google currently says does not + // conflict with anything major. + flow_index_pathname = flow_pathname + ".findx"; + DEBUG(10)("opening index file: %s",flow_index_pathname.c_str()); + if(create_idx_needed){ + //New flow file, even if there was an old one laying around --GDD + idx_file.open(flow_index_pathname.c_str(),std::ios::trunc|std::ios::in|std::ios::out); + }else{ + //Use existing flow file --GDD + idx_file.open(flow_index_pathname.c_str(),std::ios::ate|std::ios::in|std::ios::out); + } + if(idx_file.bad()){ + perror(flow_index_pathname.c_str()); + // Be nice and be sure the flow has been closed in the demultiplexer. + // demux.close_tcpip_fd(this); Need to fix this. Also, when called, it will + // have to differentiate the fact that the open fd cound only needs to be + // decremented by one and not by 2.--GDD + return -1; + } + + } return 0; } @@ -230,8 +269,10 @@ /* Print the offset */ char b[64]; - int count = snprintf(b,sizeof(b),"%04x: ",(int)i); - fwrite(b,1,count,stdout); + size_t count = snprintf(b,sizeof(b),"%04x: ",(int)i); + if(fwrite(b,1,count,stdout)!=count){ + perror("fwrite"); + } spaces += count; /* Print the hext bytes */ @@ -262,20 +303,27 @@ else if(demux.opt.output_strip_nonprint){ for(const u_char *cc = data;cc=0) lseek(fd,(off_t)delta,SEEK_CUR); if(delta<0) out_of_order_count++; // only increment for backwards seeks DEBUG(25)("%s: lseek(%d,%d,SEEK_CUR) offset=%" PRId64 " pos=%" PRId64 " out_of_order_count=%" PRId64, @@ -435,6 +489,17 @@ DEBUG(1) ("write to %s failed: ", flow_pathname.c_str()); if (debug >= 1) perror(""); } + // Write to the index file if needed. 
Note, index file is sorted before close, so no need to jump around --GDD + if (demux.opt.output_packet_index && idx_file.is_open()) { + idx_file << offset << "|" << ts.tv_sec << "." << std::setw(6) << std::setfill('0') << ts.tv_usec << "|" + << wlength << "\n"; + if (idx_file.bad()){ + DEBUG(1)("write to index file %s failed: ",flow_index_pathname.c_str()); + if(debug >= 1){ + perror(""); + } + } + } if(wlength != length){ off_t p = lseek(fd,length-wlength,SEEK_CUR); // seek out the space we didn't write DEBUG(100)(" lseek(%" PRId64 ",SEEK_CUR)=%" PRId64,(int64_t)(length-wlength),(int64_t)p); @@ -462,6 +527,65 @@ #endif } +/* + * Compare two index strings and return the result. Called by + * the vector::sort in sort_index. + * --GDD + */ +bool tcpip::compare(std::string a, std::string b){ + std::stringstream ss_a(a),ss_b(b); + long a_l,b_l; + + ss_a >> a_l; + ss_b >> b_l; + return a_l < b_l; +} + +/* + * Sort an index file (presumably from this object) if file indexing is + * turned on and the file exists. Index files may be out of order due + * to the arrival of out of order packets. It is cheaper to reorder them + * one time at the end of processing than it is to continually keep them + * in order. + * --GDD + */ +void tcpip::sort_index(std::fstream *ix_file) { + + std::vector<std::string> idx; + std::string line; + + if (demux.opt.output_packet_index) { + if (!(idx_file.good() && idx_file.is_open())) { + DEBUG(5)("Skipping index file sort. Unusual behavior.\n"); + return; //Nothing to do + } + //Make sure we are at the beginning. + ix_file->clear(); + ix_file->seekg(0); + do { + *ix_file >> line; + if (!ix_file->eof()) { + idx.push_back(line); + } + } while (ix_file->good()); + std::sort(idx.begin(), idx.end(), &tcpip::compare); + ix_file->clear(); + ix_file->seekg(0); + for (std::vector<std::string>::iterator s = idx.begin(); s != idx.end(); + s++) { + *ix_file << *s << "\n"; + } + } +} + +/* + * Convenience function to cause the local index file to be sorted. + * --GDD + */ +void tcpip::sort_index(){ + tcpip::sort_index(&(this->idx_file)); +} + #pragma GCC diagnostic ignored "-Weffc++" #pragma GCC diagnostic ignored "-Wshadow" diff -Nru tcpflow-1.4.4+repack1/src/tcpip.h tcpflow-1.4.5+repack1/src/tcpip.h --- tcpflow-1.4.4+repack1/src/tcpip.h 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/tcpip.h 2015-08-26 03:34:50.000000000 +0000 @@ -1,6 +1,8 @@ #ifndef TCPIP_H #define TCPIP_H +#include <fstream> + /** On windows, there is no in_addr_t; this is from * /usr/include/netinet/in.h */ @@ -51,12 +53,10 @@ inline bool operator >=(const ipaddr &b) const { return memcmp(this->addr,b.addr,sizeof(addr))>=0; }; inline bool operator < (const ipaddr &b) const { return memcmp(this->addr,b.addr,sizeof(this->addr))<0; } -#pragma GCC diagnostic ignored "-Wcast-align" - inline bool isv4() const { // is this an IPv6 address? - uint32_t *i = (uint32_t *)((uint8_t *)&addr); - return i[1]==0 && i[2]==0 && i[3]==0; + // We represent IPv4 addresses as 4 octets of address followed by 12 octets of 0.
+ inline bool isv4() const { + return quad(4)==0 && quad(8)==0 && quad(12)==0; } -#pragma GCC diagnostic warning "-Wcast-align" }; inline std::ostream & operator <<(std::ostream &os,const ipaddr &b) { @@ -93,7 +93,6 @@ uint16_t dport; // Destination port number sa_family_t family; // AF_INET or AF_INET6 */ -#pragma GCC diagnostic ignored "-Wcast-align" uint64_t hash() const { if(family==AF_INET){ return ((uint64_t)(src.quad(0))<<32 | dst.quad(0)) @@ -101,12 +100,11 @@ ^ (sport<<16 | dport); } else { return (src.dquad(0)<<32 ^ dst.dquad(0)) - ^ (dst.dquad(0)<<32 ^ src.dquad(0)) - ^ (src.dquad(1) ^ dst.dquad(1)) + ^ (dst.dquad(0)<<32 ^ src.dquad(0)) + ^ (src.dquad(1) ^ dst.dquad(1)) ^ (sport<<16 | dport); } } -#pragma GCC diagnostic warning "-Wcast-align" inline bool operator ==(const flow_addr &b) const { return this->src==b.src && @@ -294,6 +292,10 @@ int fd; // file descriptor for file storing this flow's data bool file_created; // true if file was created + /* Flow Index information - only used if flow packet/data indexing is requested --GDD */ + std::string flow_index_pathname; // Path for the flow index file + std::fstream idx_file; // File descriptor for storing the flow index data + /* Stats */ recon_set *seen; // what we've seen; it must be * due to boost lossage uint64_t last_byte; // last byte in flow processed @@ -305,11 +307,14 @@ void close_file(); // close fd int open_file(); // opens save file; return -1 if failure, 0 if success void print_packet(const u_char *data, uint32_t length); - void store_packet(const u_char *data, uint32_t length, int32_t delta); + void store_packet(const u_char *data, uint32_t length, int32_t delta,struct timeval ts); void process_packet(const struct timeval &ts,const int32_t delta,const u_char *data,const uint32_t length); uint32_t seen_bytes(); void dump_seen(); void dump_xml(class dfxml_writer *xmlreport,const std::string &xmladd); + static bool compare(std::string a, std::string b); + void sort_index(std::fstream *idx_file); + void sort_index(); }; /* print a tcpip data structure. Largely for debugging */ diff -Nru tcpflow-1.4.4+repack1/src/util.cpp tcpflow-1.4.5+repack1/src/util.cpp --- tcpflow-1.4.4+repack1/src/util.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/util.cpp 2015-08-26 03:34:50.000000000 +0000 @@ -87,6 +87,7 @@ /****************************************************************/ /* C++ string splitting code from http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c */ +#if 0 static std::vector &split(const std::string &s, char delim, std::vector &elems) { std::stringstream ss(s); @@ -102,7 +103,7 @@ std::vector elems; return split(s, delim, elems); } - +#endif /* mkdir all of the containing directories in path. 
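The tcpip.cpp and tcpip.h hunks above introduce the optional per-flow packet index: store_packet() appends one line per write to a ".findx" file beside the flow file, in the form offset|sec.usec|length, and sort_index() reorders those lines numerically by their leading offset before the flow is closed. Below is a minimal standalone C++ sketch of how such an index could be read back and ordered the same way; it is not part of the patch, and the file name "flow.findx" and the helper name offset_less are invented for illustration.

#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Order two index lines ("offset|sec.usec|length") by their leading offset,
// the same comparison tcpip::compare() performs in the patch above.
static bool offset_less(const std::string &a, const std::string &b)
{
    std::istringstream sa(a), sb(b);
    long ao = 0, bo = 0;
    sa >> ao;   // operator>> stops at the first non-digit, i.e. the '|'
    sb >> bo;
    return ao < bo;
}

int main()
{
    std::ifstream in("flow.findx");        // hypothetical index file name
    std::vector<std::string> lines;
    std::string line;
    while (std::getline(in, line)) {
        if (!line.empty()) lines.push_back(line);
    }
    std::sort(lines.begin(), lines.end(), offset_less);
    for (size_t i = 0; i < lines.size(); i++) {
        std::istringstream fields(lines[i]);
        std::string offset, stamp, length;
        std::getline(fields, offset, '|'); // byte offset within the flow file
        std::getline(fields, stamp,  '|'); // packet arrival time, sec.usec
        std::getline(fields, length, '|'); // number of bytes written
        std::cout << "offset " << offset << " time " << stamp
                  << " length " << length << "\n";
    }
    return 0;
}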
diff -Nru tcpflow-1.4.4+repack1/src/wifipcap/ethertype.h tcpflow-1.4.5+repack1/src/wifipcap/ethertype.h --- tcpflow-1.4.4+repack1/src/wifipcap/ethertype.h 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/wifipcap/ethertype.h 2015-08-26 03:34:50.000000000 +0000 @@ -143,5 +143,4 @@ #define ETHERTYPE_ISO 0xfefe /* nonstandard - used in Cisco HDLC encapsulation */ #endif -//extern const struct tok ethertype_values[]; #endif diff -Nru tcpflow-1.4.4+repack1/src/wifipcap/TimeVal.h tcpflow-1.4.5+repack1/src/wifipcap/TimeVal.h --- tcpflow-1.4.4+repack1/src/wifipcap/TimeVal.h 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/wifipcap/TimeVal.h 2015-08-26 03:34:50.000000000 +0000 @@ -44,7 +44,7 @@ #include #include #else -#include +#include #endif #include "types.h" diff -Nru tcpflow-1.4.4+repack1/src/wifipcap/wifipcap.cpp tcpflow-1.4.5+repack1/src/wifipcap/wifipcap.cpp --- tcpflow-1.4.4+repack1/src/wifipcap/wifipcap.cpp 2014-01-10 05:17:29.000000000 +0000 +++ tcpflow-1.4.5+repack1/src/wifipcap/wifipcap.cpp 2015-08-26 03:34:50.000000000 +0000 @@ -78,6 +78,7 @@ const char *s; /* string */ }; +#if 0 static const struct tok ethertype_values[] = { { ETHERTYPE_IP, "IPv4" }, { ETHERTYPE_MPLS, "MPLS unicast" }, @@ -115,6 +116,7 @@ { ETHERTYPE_GRE_ISO, "GRE-OSI" }, { 0, NULL} }; +#endif /*max length of an IEEE 802.11 packet*/ #ifndef MAX_LEN_80211 @@ -1735,7 +1737,12 @@ const char *Wifipcap::SetFilter(const char *filter) { struct bpf_program fp; +#ifdef PCAP_NETMASK_UNKNOWN bpf_u_int32 netp=PCAP_NETMASK_UNKNOWN; +#else + bpf_u_int32 netp=0; +#endif + if(pcap_compile(descr,&fp,(char *)filter,0,netp) == -1) { return "Error calling pcap_compile"; Binary files /tmp/uVvMFr2Tmv/tcpflow-1.4.4+repack1/tests/bug5.pcap and /tmp/ioQinB_t6t/tcpflow-1.4.5+repack1/tests/bug5.pcap differ diff -Nru tcpflow-1.4.4+repack1/tests/Makefile.am tcpflow-1.4.5+repack1/tests/Makefile.am --- tcpflow-1.4.4+repack1/tests/Makefile.am 2014-01-10 05:19:15.000000000 +0000 +++ tcpflow-1.4.5+repack1/tests/Makefile.am 2015-08-26 03:34:50.000000000 +0000 @@ -8,10 +8,12 @@ # About the test files: # -EXTRA_DIST = test1.sh test1.pcap test2.pcap test3.pcap test4.pcap \ - test-pdfs.sh test-multifile.sh test-iptree.sh +SH_TESTS = test1.sh test-pdfs.sh test-multifile.sh test-iptree.sh + +EXTRA_DIST = $(SH_TESTS) test-subs.sh test1.pcap test2.pcap test3.pcap test4.pcap + +TESTS = $(SH_TESTS) -TESTS = test1.sh test-pdfs.sh test-multifile.sh test-iptree.sh CLEANFILES = \ out/010.000.000.001.09999-010.000.000.002.36559--42 \ out/010.000.000.002.36559-010.000.000.001.09999--42 \ diff -Nru tcpflow-1.4.4+repack1/tests/test1.sh tcpflow-1.4.5+repack1/tests/test1.sh --- tcpflow-1.4.4+repack1/tests/test1.sh 2014-01-10 05:19:15.000000000 +0000 +++ tcpflow-1.4.5+repack1/tests/test1.sh 2015-08-26 03:34:50.000000000 +0000 @@ -3,53 +3,7 @@ # test to make sure that we can process the packets normally # -case x"$srcdir" in - x) - echo No srcdir specified. Assuming $0 is run locally - DMPDIR=. - TCPFLOW=../src/tcpflow - ;; - x.) - echo srcdir is . Assuming $0 is run locally from make check - DMPDIR=. - TCPFLOW=../src/tcpflow - ;; - *) - echo srcdir is $srcdir Assuming $0 is run from make distcheck - DMPDIR=../../tests/ - TCPFLOW=../../_build/src/tcpflow - ;; -esac - -echo DMPDIR=$DMPDIR -echo TCPFLOW=$TCPFLOW - -# check the results -checkmd5() -{ - if [ ! 
-r $1 ] ; - then - echo file $1 was not created - ls -l - exit 1 - fi - - md5val=`openssl md5 $1 | awk '{print $2;}'` - if [ x$2 != x$md5val ]; - then - echo failure: $1 - echo expected md5: $2 "(got '$md5val')" - echo expected length: $3 - ls -l $1 - exit 1 - fi -} - -cmd() -{ - echo $1 - if ! $1 ; then echo failed; exit 1; fi -} +. $srcdir/test-subs.sh for t in 1 2 3 do diff -Nru tcpflow-1.4.4+repack1/tests/test-iptree.sh tcpflow-1.4.5+repack1/tests/test-iptree.sh --- tcpflow-1.4.4+repack1/tests/test-iptree.sh 2014-01-10 05:19:15.000000000 +0000 +++ tcpflow-1.4.5+repack1/tests/test-iptree.sh 2015-08-26 03:34:50.000000000 +0000 @@ -1,20 +1,4 @@ -case x"$srcdir" in - x) - echo No srcdir specified. Assuming $0 is run locally - DMPDIR=. - TCPFLOW=../src/tcpflow - ;; - x.) - echo srcdir is . Assuming $0 is run locally from make check - DMPDIR=. - TCPFLOW=../src/tcpflow - ;; - *) - echo srcdir is $srcdir Assuming $0 is run from make distcheck - DMPDIR=../../tests/ - TCPFLOW=../../_build/src/tcpflow - ;; -esac +. $srcdir/test-subs.sh echo DMPDIR=$DMPDIR echo TCPFLOW=$TCPFLOW diff -Nru tcpflow-1.4.4+repack1/tests/test-multifile.sh tcpflow-1.4.5+repack1/tests/test-multifile.sh --- tcpflow-1.4.4+repack1/tests/test-multifile.sh 2014-01-10 05:19:15.000000000 +0000 +++ tcpflow-1.4.5+repack1/tests/test-multifile.sh 2015-08-26 03:34:50.000000000 +0000 @@ -1,60 +1,8 @@ #!/bin/sh # test the multifile -case x"$srcdir" in - x) - echo No srcdir specified. Assuming $0 is run locally - DMPDIR=. - TCPFLOW=../src/tcpflow - ;; - x.) - echo srcdir is . Assuming $0 is run locally from make check - DMPDIR=. - TCPFLOW=../src/tcpflow - ;; - *) - echo srcdir is $srcdir Assuming $0 is run from make distcheck - DMPDIR=../../tests/ - TCPFLOW=../../_build/src/tcpflow - ;; -esac +. $srcdir/test-subs.sh -echo DMPDIR=$DMPDIR -echo TCPFLOW=$TCPFLOW - -# check the results -checkmd5() -{ - if [ ! -r $1 ] ; - then - echo file $1 was not created - ls -l - exit 1 - fi - - md5val=`openssl md5 $1 | awk '{print $2;}'` - if [ x$2 != x$md5val ]; - then - echo failure: $1 - echo expected md5: $2 "(got '$md5val')" - echo expected length: $3 - ls -l $1 - exit 1 - fi -} - -testmd5() -{ - md5val=`openssl md5 $1 | awk '{print $2;}'` - len=`stat -r $1 | awk '{print $8;}'` - echo checkmd5 \"$1\" \"$md5val\" \"$len\" -} - -cmd() -{ - echo $1 - if ! $1 ; then echo failed; exit 1; fi -} # this test requires MULTIFILE MULTIFILE=/corp/nps/packets/2013-httpxfer/multifile_25_21.pcap diff -Nru tcpflow-1.4.4+repack1/tests/test-pdfs.sh tcpflow-1.4.5+repack1/tests/test-pdfs.sh --- tcpflow-1.4.4+repack1/tests/test-pdfs.sh 2014-01-10 05:19:15.000000000 +0000 +++ tcpflow-1.4.5+repack1/tests/test-pdfs.sh 2015-08-26 03:34:50.000000000 +0000 @@ -1,32 +1,14 @@ #!/bin/sh # -case x"$srcdir" in - x) - echo No srcdir specified. Assuming $0 is run locally - TCPFLOW=../src/tcpflow - ;; - x.) - echo srcdir is . Assuming $0 is run locally from make check - TCPFLOW=../src/tcpflow - ;; - *) - echo srcdir is $srcdir Assuming $0 is run from make distcheck - TCPFLOW=../../_build/src/tcpflow - ;; -esac - -cmd() { - echo $1; - $1; -} +. 
$srcdir/test-subs.sh # create PDFs for all of the pcap files -for i in *.pcap +for i in $DMPDIR/*.pcap do echo $i cmd "$TCPFLOW -Fg -e netviz -o tmp$$ -r $i" cmd "mv tmp$$/report.pdf `basename $i .pcap`.pdf" echo "" - /bin/rm -rf tmp$$ + /bin/rm -rf tmp$$ test?.pdf done diff -Nru tcpflow-1.4.4+repack1/tests/test-subs.sh tcpflow-1.4.5+repack1/tests/test-subs.sh --- tcpflow-1.4.4+repack1/tests/test-subs.sh 2014-01-10 05:19:15.000000000 +0000 +++ tcpflow-1.4.5+repack1/tests/test-subs.sh 2015-08-26 03:34:50.000000000 +0000 @@ -51,4 +51,5 @@ { echo $1 if ! $1 ; then echo failed; exit 1; fi -} \ No newline at end of file +} +
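One portability detail worth calling out from the wifipcap.cpp hunk above: older libpcap headers do not define PCAP_NETMASK_UNKNOWN, so Wifipcap::SetFilter() now falls back to a netmask of 0 when the macro is missing. The following short standalone sketch shows the same idiom; it is not taken from the patch, the function name set_bpf_filter is invented, and the handle is assumed to be already open.

#include <pcap.h>
#include <cstdio>

// Compile and install a BPF filter on an already-open pcap handle,
// tolerating headers that predate PCAP_NETMASK_UNKNOWN.
static int set_bpf_filter(pcap_t *descr, const char *filter)
{
    struct bpf_program fp;
#ifdef PCAP_NETMASK_UNKNOWN
    bpf_u_int32 netp = PCAP_NETMASK_UNKNOWN;  // newer libpcap
#else
    bpf_u_int32 netp = 0;                     // older headers: use a zero netmask
#endif
    // The (char *) cast matches older pcap_compile() prototypes, as in the patch.
    if (pcap_compile(descr, &fp, (char *)filter, 0, netp) == -1) {
        std::fprintf(stderr, "pcap_compile: %s\n", pcap_geterr(descr));
        return -1;
    }
    int rc = pcap_setfilter(descr, &fp);
    pcap_freecode(&fp);
    return rc;
}

pcap_setfilter() and pcap_freecode() are standard libpcap calls; the SetFilter() hunk in the patch only shows the pcap_compile() step, so the install and cleanup steps here are included merely to keep the sketch self-contained.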