From f25a80acb6cd75ca3ebf12b5348007410c46cfbb Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Wed, 2 Mar 2016 12:25:36 -0600 Subject: [PATCH 01/11] Add CSV parsing Added input option for parsing CSV files --- iocp.py | 46 ++++++++++++++++++++++++++++++++++++++++++++-- output.py | 26 +++++++++++++++----------- 2 files changed, 59 insertions(+), 13 deletions(-) diff --git a/iocp.py b/iocp.py index 9968cc2..fd9b6a6 100755 --- a/iocp.py +++ b/iocp.py @@ -40,6 +40,7 @@ import fnmatch import argparse import re +import csv from StringIO import StringIO try: import configparser as ConfigParser @@ -144,7 +145,18 @@ def is_whitelisted(self, ind_match, ind_type): pass return False - def parse_page(self, fpath, data, page_num): + def parse_page(self, fpath, data, page_num, flag=0): + """ Added flag and sheet_name variables for new inputs to help properly + print output + + @param fpath: the file path, directory, URL or email account + @param data: the data to be parse_pdf + @param page_num: the page number of a pdf, line number of csv, xls or xlsx + @param flag: + 0 = default (pdf/txt/html) + 2 = csv + @param sheet_name: to be used only with Excel spreadsheets + """ for ind_type, ind_regex in self.patterns.items(): matches = ind_regex.findall(data) @@ -164,7 +176,8 @@ def parse_page(self, fpath, data, page_num): self.dedup_store.add((ind_type, ind_match)) - self.handler.print_match(fpath, page_num, ind_type, ind_match) + # Added flag to determine which type of output to display + self.handler.print_match(fpath, page_num, ind_type, ind_match, flag) def parse_pdf_pypdf2(self, f, fpath): try: @@ -266,6 +279,35 @@ def parse_html(self, f, fpath): except Exception as e: self.handler.print_error(fpath, e) + def parse_csv(self, f, fpath): + """ This method is used to parse a csv file. The flag + used for this method to send to output.py is 2. 
+ + @author Robb Krasnow + """ + + try: + if self.dedup: + self.dedup_store = set() + + self.handler.print_header(fpath) + + with open(fpath, 'rb') as csvfile: + csv_data = csv.reader(csvfile, delimiter=',', quotechar='|') + + for row in csv_data: + line = ', '.join(row).rstrip() + unicode_output = unicode(line, 'ascii', errors='ignore') + + self.parse_page(fpath, unicode_output, csv_data.line_num, 2) + + self.handler.print_footer(fpath) + except (KeyboardInterrupt, SystemExit): + raise + except Exception as e: + self.handler.print_error(fpath, e) + + def parse(self, path): try: if path.startswith('http://') or path.startswith('https://'): diff --git a/output.py b/output.py index d71c92c..802f7af 100644 --- a/output.py +++ b/output.py @@ -33,21 +33,23 @@ class OutputHandler_csv(OutputHandler): def __init__(self): self.csv_writer = csv.writer(sys.stdout, delimiter = '\t') - def print_match(self, fpath, page, name, match): + # Added flag which are unused but needed to make CSV output work + def print_match(self, fpath, page, name, match, flag): self.csv_writer.writerow((fpath, page, name, match)) def print_error(self, fpath, exception): self.csv_writer.writerow((fpath, '0', 'error', exception)) class OutputHandler_json(OutputHandler): - def print_match(self, fpath, page, name, match): - data = { - 'path' : fpath, - 'file' : os.path.basename(fpath), - 'page' : page, - 'type' : name, - 'match': match - } + def print_match(self, fpath, page, name, match, flag): + if flag == 0 or flag == 2: + data = { + 'path' : fpath, + 'file' : os.path.basename(fpath), + 'page' : page, + 'type' : name, + 'match': match + } print(json.dumps(data)) @@ -65,7 +67,8 @@ class OutputHandler_yara(OutputHandler): def __init__(self): self.rule_enc = ''.join(chr(c) if chr(c).isupper() or chr(c).islower() or chr(c).isdigit() else '_' for c in range(256)) - def print_match(self, fpath, page, name, match): + # Added flag which are unused but needed to make YARA output work + def print_match(self, 
fpath, page, name, match, flag): if name in self.cnt: self.cnt[name] += 1 else: @@ -97,7 +100,8 @@ class OutputHandler_netflow(OutputHandler): def __init__(self): print "host 255.255.255.255" - def print_match(self, fpath, page, name, match): + # Added flag which are unused but needed to make Netflow output work + def print_match(self, fpath, page, name, match, flag): data = { 'type' : name, 'match': match From 54703da40d6987bbc58c71286f42fba64bf860a8 Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Wed, 2 Mar 2016 14:48:10 -0600 Subject: [PATCH 02/11] Add XLS/XLSX parsing Added input options for parsing Excel XLS and XLSX files --- iocp.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++---- output.py | 32 +++++++++++++++++++++------- 2 files changed, 82 insertions(+), 12 deletions(-) diff --git a/iocp.py b/iocp.py index fd9b6a6..b6c21fd 100755 --- a/iocp.py +++ b/iocp.py @@ -49,6 +49,12 @@ # Import optional third-party libraries IMPORTS = [] +try: + import xlrd + IMPORTS.append('xlrd') +except ImportError: + pass + try: from PyPDF2 import PdfFileReader IMPORTS.append('pypdf2') @@ -112,6 +118,10 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library=' if 'beautifulsoup' not in IMPORTS: e = 'HTML parser library not found: BeautifulSoup' raise ImportError(e) + elif input_format == 'xlsx': + if 'xlrd' not in IMPORTS: + e = 'XLRD Library not found. 
Please visit: https://github.com/python-excel/xlrd' + raise ImportError(e) def load_patterns(self, fpath): config = ConfigParser.ConfigParser() @@ -145,7 +155,7 @@ def is_whitelisted(self, ind_match, ind_type): pass return False - def parse_page(self, fpath, data, page_num, flag=0): + def parse_page(self, fpath, data, page_num, flag=0, sheet_name=''): """ Added flag and sheet_name variables for new inputs to help properly print output @@ -155,6 +165,7 @@ def parse_page(self, fpath, data, page_num, flag=0): @param flag: 0 = default (pdf/txt/html) 2 = csv + 3 = xls and xlsx @param sheet_name: to be used only with Excel spreadsheets """ for ind_type, ind_regex in self.patterns.items(): @@ -177,7 +188,7 @@ def parse_page(self, fpath, data, page_num, flag=0): self.dedup_store.add((ind_type, ind_match)) # Added flag to determine which type of output to display - self.handler.print_match(fpath, page_num, ind_type, ind_match, flag) + self.handler.print_match(fpath, page_num, ind_type, ind_match, flag, sheet_name) def parse_pdf_pypdf2(self, f, fpath): try: @@ -285,7 +296,6 @@ def parse_csv(self, f, fpath): @author Robb Krasnow """ - try: if self.dedup: self.dedup_store = set() @@ -308,6 +318,50 @@ def parse_csv(self, f, fpath): self.handler.print_error(fpath, e) + def parse_xls(self, f, fpath): + """ Created this function just to allow a user to use 'xls' as an input + option without any errors. + + @author Robb Krasnow + """ + self.parse_xlsx(f, fpath) + + + def parse_xlsx(self, f, fpath): + """ This method is used to parse Microsoft Excel files + with either .xls or .xlsx extentions. The flag + used for this method to send to output.py is 3. Because + Excel spreadsheets may have multiple tabs, the sheet's + name is passed through the parse_page method in turn showing + that in the output. 
+ + @author Robb Krasnow + """ + try: + if self.dedup: + self.dedup_store = set() + + self.handler.print_header(fpath) + workbook = xlrd.open_workbook(fpath) + sheets = workbook.sheets() + + for sheet in sheets: + sheet_name = sheet.name + + for row in range(sheet.nrows): + for col in range(sheet.ncols): + if sheet.cell_value(row, col) is not xlrd.empty_cell.value: + val = repr(sheet.cell_value(row, col)) + + self.parse_page(fpath, val, row+1, 3, sheet_name) + + self.handler.print_footer(fpath) + except (KeyboardInterrupt, SystemExit): + raise + except Exception as e: + self.handler.print_error(fpath, e) + + def parse(self, path): try: if path.startswith('http://') or path.startswith('https://'): @@ -343,7 +397,7 @@ def parse(self, path): argparser = argparse.ArgumentParser() argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)') argparser.add_argument('-p', dest='INI', default=None, help='Pattern file') - argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)') + argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html/csv/xls/xlsx)') argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)') argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches') argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)') diff --git a/output.py b/output.py index 802f7af..e3491e3 100644 --- a/output.py +++ b/output.py @@ -33,15 +33,22 @@ class OutputHandler_csv(OutputHandler): def __init__(self): self.csv_writer = csv.writer(sys.stdout, delimiter = '\t') - # Added flag which are unused but needed to make CSV output work - def print_match(self, fpath, page, name, match, flag): - self.csv_writer.writerow((fpath, page, name, match)) + # Added flag and sheet which are unused but needed to make CSV output work + def 
print_match(self, fpath, page, name, match, flag, sheet=''): + self.csv_writer.writerow((fpath, page, name, match, sheet)) def print_error(self, fpath, exception): self.csv_writer.writerow((fpath, '0', 'error', exception)) class OutputHandler_json(OutputHandler): - def print_match(self, fpath, page, name, match, flag): + """ + @param flag: + 0 = default (pdf/txt/html) + 2 = csv + 3 = xls and xlsx + @param sheet The sheet being parsed if Excel spreadsheet (single or multi-sheet) + """ + def print_match(self, fpath, page, name, match, flag, sheet=''): if flag == 0 or flag == 2: data = { 'path' : fpath, @@ -50,6 +57,15 @@ def print_match(self, fpath, page, name, match, flag): 'type' : name, 'match': match } + elif flag == 3: + data = { + 'path' : fpath, + 'file' : os.path.basename(fpath), + 'sheet' : sheet, + 'line' : page, + 'type' : name, + 'match': match, + } print(json.dumps(data)) @@ -67,8 +83,8 @@ class OutputHandler_yara(OutputHandler): def __init__(self): self.rule_enc = ''.join(chr(c) if chr(c).isupper() or chr(c).islower() or chr(c).isdigit() else '_' for c in range(256)) - # Added flag which are unused but needed to make YARA output work - def print_match(self, fpath, page, name, match, flag): + # Added flag and sheet which are unused but needed to make YARA output work + def print_match(self, fpath, page, name, match, flag, sheet=''): if name in self.cnt: self.cnt[name] += 1 else: @@ -100,8 +116,8 @@ class OutputHandler_netflow(OutputHandler): def __init__(self): print "host 255.255.255.255" - # Added flag which are unused but needed to make Netflow output work - def print_match(self, fpath, page, name, match, flag): + # Added flag and sheet which are unused but needed to make Netflow output work + def print_match(self, fpath, page, name, match, flag, sheet=''): data = { 'type' : name, 'match': match From 996cb3627af1060b1e5623fe110e6c0555364a80 Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Wed, 2 Mar 2016 14:57:35 -0600 Subject: [PATCH 03/11] Added 
new inputs/instructions to README Added csv/xls/xlsx to input format. Updated output formats to include netflow. Added link and install instructions to download xlrd for xls/xlsx parsing. --- README.md | 9 ++++++--- iocp.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index eb37c17..2d40c6f 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ IOC Parser is a tool to extract indicators of compromise from security reports i **iocp.py [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] FILE** * *FILE* File/directory path to report(s) * *-p INI* Pattern file -* *-i FORMAT* Input format (pdf/txt/html) -* *-o FORMAT* Output format (csv/json/yara) +* *-i FORMAT* Input format (pdf/txt/html/csv/xls/xlsx) +* *-o FORMAT* Output format (csv/json/yara/netflow) * *-d* Deduplicate matches * *-l LIB* Parsing library @@ -19,4 +19,7 @@ For HTML parsing support: * [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/) - *pip install beautifulsoup4* For HTTP(S) support: -* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests* \ No newline at end of file +* [requests](http://docs.python-requests.org/en/latest/) - *pip install requests* + +For XLS/XLSX support: +* [xlrd](https://github.com/python-excel/xlrd) - *pip install xlrd* \ No newline at end of file diff --git a/iocp.py b/iocp.py index b6c21fd..d9cc1b6 100755 --- a/iocp.py +++ b/iocp.py @@ -160,7 +160,7 @@ def parse_page(self, fpath, data, page_num, flag=0, sheet_name=''): print output @param fpath: the file path, directory, URL or email account - @param data: the data to be parse_pdf + @param data: the data to be parsed @param page_num: the page number of a pdf, line number of csv, xls or xlsx @param flag: 0 = default (pdf/txt/html) From 79b531855c98fbfee9e8e4bd689276469d843216 Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Wed, 2 Mar 2016 19:50:03 -0600 Subject: [PATCH 04/11] Added Gmail parsing Added parsing capability for a valid Gmail 
account. Updated README. --- README.md | 9 ++++--- iocp.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- output.py | 23 +++++++++++------- 3 files changed, 89 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 2d40c6f..97bb12b 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ IOC Parser is a tool to extract indicators of compromise from security reports i ## Usage **iocp.py [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] FILE** -* *FILE* File/directory path to report(s) +* *FILE* File/directory path to report(s)/Gmail account in double quotes ("username@gmail.com password") * *-p INI* Pattern file -* *-i FORMAT* Input format (pdf/txt/html/csv/xls/xlsx) +* *-i FORMAT* Input format (pdf/txt/html/csv/xls/xlsx/gmail) * *-o FORMAT* Output format (csv/json/yara/netflow) * *-d* Deduplicate matches * *-l LIB* Parsing library @@ -22,4 +22,7 @@ For HTTP(S) support: * [requests](http://docs.python-requests.org/en/latest/) - *pip install requests* For XLS/XLSX support: -* [xlrd](https://github.com/python-excel/xlrd) - *pip install xlrd* \ No newline at end of file +* [xlrd](https://github.com/python-excel/xlrd) - *pip install xlrd* + +For Gmail support: +* [gmail](https://github.com/charlierguo/gmail) \ No newline at end of file diff --git a/iocp.py b/iocp.py index d9cc1b6..e7e8996 100755 --- a/iocp.py +++ b/iocp.py @@ -55,6 +55,12 @@ except ImportError: pass +try: + import gmail + IMPORTS.append('gmail') +except ImportError: + pass + try: from PyPDF2 import PdfFileReader IMPORTS.append('pypdf2') @@ -120,7 +126,11 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library=' raise ImportError(e) elif input_format == 'xlsx': if 'xlrd' not in IMPORTS: - e = 'XLRD Library not found. Please visit: https://github.com/python-excel/xlrd' + e = 'XLRD Library not found. 
Please visit: https://github.com/python-excel/xlrd or pip install xlrd' + raise ImportError(e) + elif input_format == 'gmail': + if 'gmail' not in IMPORTS: + e = 'Gmail library not found. Please visit: https://github.com/charlierguo/gmail' raise ImportError(e) def load_patterns(self, fpath): @@ -362,6 +372,52 @@ def parse_xlsx(self, f, fpath): self.handler.print_error(fpath, e) + def parse_gmail(self, username, password): + """ This method is used to parse the inbox of a valid + Gmail account. The flag used for this method to send to + output.py is 1. + + @author Robb Krasnow + @param username The gmail account's username + @param password The gmail account's password + """ + try: + if self.dedup: + self.dedup_store = set() + + # Log the user in + g = gmail.login(username, password) + + # When the user is logged in, grab all the email from their inbox + # and parse all the messages for IOCs + if g.logged_in: + print "***** Login Successful. *****\n" + + self.handler.print_header(username) + emails = g.inbox().mail() + + for email in range(0, len(emails)): + try: + emails[email].fetch() + content = emails[email].body + subject = re.sub('(^\s|re:\s+|\r\n|fwd:\s+)', '', emails[email].subject, flags=re.IGNORECASE) + + self.parse_page(subject, content, 1, 1) + except Exception as e: + continue + + self.handler.print_footer(username) + + print "\n***** %s emails found. *****" % len(emails) + g.logout() + print "***** Logout Successful. 
*****" + else: + sys.exit() + except gmail.exceptions.AuthenticationError: + print "Authentication Error" + sys.exit() + + def parse(self, path): try: if path.startswith('http://') or path.startswith('https://'): @@ -385,6 +441,15 @@ def parse(self, path): with open(fpath, 'rb') as f: self.parser_func(f, fpath) return + # Check if the input from CLI has @gmail.com attached + # If so, grab the credentials, and send them to parse_gmail() + elif path.count("@gmail.com ") == 1 and len(path.split()) == 2: + gmail_account = path.split() + username = gmail_account[0] + password = gmail_account[1] + self.parser_func(username, password) + + return e = 'File path is not a file, directory or URL: %s' % (path) raise IOError(e) @@ -395,9 +460,9 @@ def parse(self, path): if __name__ == "__main__": argparser = argparse.ArgumentParser() - argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)') + argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)/Gmail account in double quotes ("username@gmail.com password")') argparser.add_argument('-p', dest='INI', default=None, help='Pattern file') - argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html/csv/xls/xlsx)') + argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html/csv/xls/xlsx/gmail)') argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)') argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches') argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)') diff --git a/output.py b/output.py index e3491e3..b5f8525 100644 --- a/output.py +++ b/output.py @@ -40,15 +40,15 @@ def print_match(self, fpath, page, name, match, flag, sheet=''): def print_error(self, fpath, exception): self.csv_writer.writerow((fpath, '0', 'error', exception)) 
-class OutputHandler_json(OutputHandler): - """ - @param flag: - 0 = default (pdf/txt/html) - 2 = csv - 3 = xls and xlsx - @param sheet The sheet being parsed if Excel spreadsheet (single or multi-sheet) - """ +class OutputHandler_json(OutputHandler): def print_match(self, fpath, page, name, match, flag, sheet=''): + """ @param flag: + 0 = default (pdf/txt/html) + 1 = gmail + 2 = csv + 3 = xls and xlsx + @param sheet The sheet being parsed if Excel spreadsheet (single or multi-sheet) + """ if flag == 0 or flag == 2: data = { 'path' : fpath, @@ -57,6 +57,13 @@ def print_match(self, fpath, page, name, match, flag, sheet=''): 'type' : name, 'match': match } + elif flag == 1: + data = { + 'input' : 'gmail', + 'subject' : fpath, + 'type' : name, + 'match': match + } elif flag == 3: data = { 'path' : fpath, From 09b6dc3c331959d8eda0eade0f1669a34d133b18 Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Thu, 3 Mar 2016 10:56:18 -0600 Subject: [PATCH 05/11] Added MIT License for HPE --- LICENSE-HPE | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 LICENSE-HPE diff --git a/LICENSE-HPE b/LICENSE-HPE new file mode 100644 index 0000000..b2621fd --- /dev/null +++ b/LICENSE-HPE @@ -0,0 +1,25 @@ +Copyright for portions of project ioc-parser are held by [armbues, 2015] +as part of project ioc-parser. All other copyright for project ioc-parser +are held by [Hewlett Packard Enterprise Development LP, 2016]. 
+ +The MIT License (MIT) + +(c) Copyright 2016 Hewlett Packard Enterprise Development LP + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file From 3909e75fbc737b6141fb9be425c64712268075f9 Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Thu, 3 Mar 2016 11:23:05 -0600 Subject: [PATCH 06/11] Added HPE to License/Removed separate HPE License Simply added HPE Copyright to original license --- LICENSE | 6 +++--- LICENSE-HPE | 25 ------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) delete mode 100644 LICENSE-HPE diff --git a/LICENSE b/LICENSE index c4588f7..ab150ce 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ The MIT License (MIT) -Copyright (c) 2015 armbues +Original work: Copyright (c) 2015 armbues +Additional work: (c) Copyright 2016 Hewlett Packard Enterprise Development LP Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,5 +19,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - +SOFTWARE. \ No newline at end of file diff --git a/LICENSE-HPE b/LICENSE-HPE deleted file mode 100644 index b2621fd..0000000 --- a/LICENSE-HPE +++ /dev/null @@ -1,25 +0,0 @@ -Copyright for portions of project ioc-parser are held by [armbues, 2015] -as part of project ioc-parser. All other copyright for project ioc-parser -are held by [Hewlett Packard Enterprise Development LP, 2016]. 
- -The MIT License (MIT) - -(c) Copyright 2016 Hewlett Packard Enterprise Development LP - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
\ No newline at end of file From 183ba3fe24160ebf00db686f5fddd27a3e7cb5de Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Thu, 3 Mar 2016 11:37:49 -0600 Subject: [PATCH 07/11] Added comment for gmail flag --- iocp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/iocp.py b/iocp.py index e7e8996..7c3e334 100755 --- a/iocp.py +++ b/iocp.py @@ -174,6 +174,7 @@ def parse_page(self, fpath, data, page_num, flag=0, sheet_name=''): @param page_num: the page number of a pdf, line number of csv, xls or xlsx @param flag: 0 = default (pdf/txt/html) + 1 = gmail 2 = csv 3 = xls and xlsx @param sheet_name: to be used only with Excel spreadsheets From 2d2898f041f636346928863dffb7d9dd0831362a Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Thu, 3 Mar 2016 11:39:59 -0600 Subject: [PATCH 08/11] Added sheet_name to comment --- iocp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iocp.py b/iocp.py index 7c3e334..3d83de2 100755 --- a/iocp.py +++ b/iocp.py @@ -198,7 +198,7 @@ def parse_page(self, fpath, data, page_num, flag=0, sheet_name=''): self.dedup_store.add((ind_type, ind_match)) - # Added flag to determine which type of output to display + # Added flag and sheet_name to determine which type of output to display self.handler.print_match(fpath, page_num, ind_type, ind_match, flag, sheet_name) def parse_pdf_pypdf2(self, f, fpath): From 755575f4c447d17f8b4a71e82d3cf9bea24dd54b Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Tue, 24 May 2016 12:14:09 -0500 Subject: [PATCH 09/11] Added proxy, fixed some syntax - Added the capability to use either http or https proxy when parsing websites - Fixed some syntax, changing " to ' for consistency - Added 'html.parser' for BeautifulSoup to suppress warnings --- iocp.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/iocp.py b/iocp.py index 3d83de2..0063281 100755 --- a/iocp.py +++ b/iocp.py @@ -94,7 +94,7 @@ class IOC_Parser(object): patterns = {} defang =
{} - def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', output_handler=None): + def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', proxy=None, output_handler=None): basedir = os.path.dirname(os.path.abspath(__file__)) if patterns_ini is None: patterns_ini = os.path.join(basedir, 'patterns.ini') @@ -102,6 +102,13 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library=' self.load_patterns(patterns_ini) self.whitelist = WhiteList(basedir) self.dedup = dedup + + # Depending on the type of proxy, set the proper proxy setting for storage to be used with Requests + if proxy.startswith('http://'): + self.proxy = {'http': proxy} + elif proxy.startswith('https://'): + self.proxy = {'https': proxy} + if output_handler: self.handler = output_handler else: @@ -281,7 +288,6 @@ def parse_html(self, f, fpath): self.dedup_store = set() data = f.read() - soup = BeautifulSoup(data) html = soup.findAll(text=True) text = u'' @@ -378,9 +384,9 @@ def parse_gmail(self, username, password): Gmail account. The flag used for this method to send to output.py is 1. - @author Robb Krasnow - @param username The gmail account's username - @param password The gmail account's password + @author Robb Krasnow + @param username The gmail account's username + @param password The gmail account's password """ try: if self.dedup: @@ -392,7 +398,7 @@ def parse_gmail(self, username, password): # When the user is logged in, grab all the email from their inbox # and parse all the messages for IOCs if g.logged_in: - print "***** Login Successful. *****\n" + print '***** Login Successful. *****\n' self.handler.print_header(username) emails = g.inbox().mail() @@ -409,13 +415,13 @@ def parse_gmail(self, username, password): self.handler.print_footer(username) - print "\n***** %s emails found. *****" % len(emails) + print '\n***** %s emails found. 
*****' % len(emails) g.logout() - print "***** Logout Successful. *****" + print '***** Logout Successful. *****' else: sys.exit() except gmail.exceptions.AuthenticationError: - print "Authentication Error" + print 'Authentication Error' sys.exit() @@ -426,8 +432,14 @@ def parse(self, path): e = 'HTTP library not found: requests' raise ImportError(e) headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' } - r = requests.get(path, headers=headers) - r.raise_for_status() + + # If using proxy, make request with proxy from --proxy switch + # Otherwise make the call normally + if self.proxy is not None: + r = requests.get(path, headers=headers, proxies=self.proxy) + else: + r = requests.get(path, headers=headers) + f = StringIO(r.content) self.parser_func(f, path) return @@ -444,7 +456,7 @@ def parse(self, path): return # Check if the input from CLI has @gmail.com attached # If so, grab the credentials, and send them to parse_gmail() - elif path.count("@gmail.com ") == 1 and len(path.split()) == 2: + elif path.count('@gmail.com ') == 1 and len(path.split()) == 2: gmail_account = path.split() username = gmail_account[0] password = gmail_account[1] @@ -467,7 +479,8 @@ def parse(self, path): argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)') argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches') argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)') + argparser.add_argument('--proxy', dest='PROXY', default=None, help='Sets proxy (http(s)://server:port)') args = argparser.parse_args() - parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT) + parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT, args.PROXY) parser.parse(args.PATH) From cbef774f1c75cf75a33aa265a4d6a4a917a0110c Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Tue, 24 May 
2016 12:17:24 -0500 Subject: [PATCH 10/11] Syntax Fix typo --- iocp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/iocp.py b/iocp.py index 0063281..0294b2a 100755 --- a/iocp.py +++ b/iocp.py @@ -288,6 +288,7 @@ def parse_html(self, f, fpath): self.dedup_store = set() data = f.read() + soup = BeautifulSoup(data, 'html.parser') # Add "html.parser" to suppress user warning html = soup.findAll(text=True) text = u'' From 2838704427511f7ba3c85c37c69c6e32beee67ef Mon Sep 17 00:00:00 2001 From: Robb Krasnow Date: Wed, 25 May 2016 12:44:08 -0500 Subject: [PATCH 11/11] Fix bug if no proxy supplied If no proxy is supplied, an error was thrown. This is fixed and requests can optionally use a proxy or not. --- iocp.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/iocp.py b/iocp.py index 0294b2a..b91fad1 100755 --- a/iocp.py +++ b/iocp.py @@ -104,10 +104,13 @@ def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library=' self.dedup = dedup # Depending on the type of proxy, set the proper proxy setting for storage to be used with Requests - if proxy.startswith('http://'): - self.proxy = {'http': proxy} - elif proxy.startswith('https://'): - self.proxy = {'https': proxy} + if proxy is not None: + if proxy.startswith('http://'): + self.proxy = {'http': proxy} + elif proxy.startswith('https://'): + self.proxy = {'https': proxy} + else: + self.proxy = proxy if output_handler: self.handler = output_handler