Commit a698dd4 (1 parent: bea046b). Showing 1 changed file with 128 additions and 0 deletions.
@@ -0,0 +1,128 @@
from __future__ import print_function

import re
import sys
import json
from parser import Parser
from ioutils import read_lines
from tika_parser import TikaParser
from jsre_parser import JsreParser
from brat_ann_indexer import extract_references


class JournalParser(Parser):
    """ The JournalParser removes special text/format for journal articles """
    PDF_TYPE = "application/pdf"

    def __init__(self):
        super(JournalParser, self).__init__('journal_parser')

    def parse(self, text, metadata):
        if type(metadata['Content-Type']) == list:
            assert JournalParser.PDF_TYPE in metadata['Content-Type']
        else:
            assert metadata['Content-Type'] == JournalParser.PDF_TYPE

        assert type(text) == str or type(text) == unicode

        # 1. Translate some UTF-8 punctuation to ASCII
        punc = {
            # single quote
            0x2018: 0x27, 0x2019: 0x27,
            # double quote
            0x201C: 0x22, 0x201D: 0x22,
            # hyphens
            0x2010: 0x2d, 0x2011: 0x2d, 0x2012: 0x2d, 0x2013: 0x2d,
            # comma
            0xFF0C: 0x2c,
            # space
            0x00A0: 0x20,
            # bullets
            0x2219: 0x2e, 0x2022: 0x2e,
        }
        text = text.translate(punc)

        # 2. Replace newlines that separate words with a space
        #    (unless the preceding word ends with a hyphen)
        text = re.sub(r'([^\s-])[\r\n]+([^\s])', r'\1 \2', text)

        # 3. Remove hyphenation at the end of lines
        # (this is sometimes bad, as with "Fe-\nrich")
        text = text.replace('-\n', '\n')

        # 4. Remove all remaining newlines
        text = re.sub(r'[\r\n]+', '', text)

        # 5. Move references to their own field (references)
        refs = extract_references(text)
        for ref_id in refs:  # preserve length; insert whitespace
            text = text.replace(refs[ref_id], ' ' * len(refs[ref_id]))

        return {
            'references': refs.values(),
            'cleaned_content': text
        }


def process(in_file, in_list, out_file, tika_server_url, corenlp_server_url,
            ner_model, jsre_root, jsre_model, jsre_tmp_dir):
    """ Parse each input file with Tika, clean the journal text, run the
    jSRE parser, and write one JSON record per file to out_file. """
    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    tika_parser = TikaParser(tika_server_url)
    journal_parser = JournalParser()
    jsre_parser = JsreParser(corenlp_server_url, ner_model, jsre_root,
                             jsre_model, jsre_tmp_dir)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'wb', 1)  # line-buffered output
    for f in files:
        tika_dict = tika_parser.parse(f)
        journal_dict = journal_parser.parse(tika_dict['content'],
                                            tika_dict['metadata'])
        jsre_dict = jsre_parser.parse(journal_dict['cleaned_content'])

        tika_dict['content_ann_s'] = journal_dict['cleaned_content']
        tika_dict['references'] = journal_dict['references']
        tika_dict['metadata']['ner'] = jsre_dict['ner']
        tika_dict['metadata']['rel'] = jsre_dict['relation']
        tika_dict['metadata']['sentences'] = jsre_dict['sentences']
        tika_dict['metadata']['X-Parsed-By'] = jsre_dict['X-Parsed-By']

        out_f.write(json.dumps(tika_dict))
        out_f.write('\n')

    out_f.close()

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    input_parser = parser.add_mutually_exclusive_group(required=True)

    input_parser.add_argument('-i', '--in_file', help='Path to input file')
    input_parser.add_argument('-li', '--in_list', help='Path to input list')
    parser.add_argument('-o', '--out_file', required=True,
                        help='Path to output JSON file')
    parser.add_argument('-p', '--tika_server_url', required=False,
                        help='Tika server URL')
    parser.add_argument('-c', '--corenlp_server_url',
                        default='http://localhost:9000',
                        help='CoreNLP Server URL')
    parser.add_argument('-n', '--ner_model', required=False,
                        help='Path to a Named Entity Recognition (NER) model')
    parser.add_argument('-jr', '--jsre_root', default='/proj/mte/jSRE/jsre-1.1',
                        help='Path to jSRE installation directory. Default is '
                             '/proj/mte/jSRE/jsre-1.1')
    parser.add_argument('-jm', '--jsre_model', required=True,
                        help='Path to jSRE model')
    parser.add_argument('-jt', '--jsre_tmp_dir', default='/tmp',
                        help='Path to a directory for jSRE to temporarily '
                             'store input and output files. Default is /tmp')
    args = parser.parse_args()
    process(**vars(args))
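For context, a minimal usage sketch of the cleaning step on its own (not part of this commit). It assumes the new file is saved as journal_parser.py, that TikaParser.parse(path) returns a dict with 'content' and 'metadata' keys as used in process() above, and that the Tika endpoint and PDF path below are placeholders.

from __future__ import print_function

from tika_parser import TikaParser
from journal_parser import JournalParser  # assumed module name for this file

# Hypothetical inputs; substitute a real Tika server URL and PDF path.
tika = TikaParser('http://localhost:9998')
doc = tika.parse('example_article.pdf')

# Clean the extracted journal text and pull references into their own field.
cleaned = JournalParser().parse(doc['content'], doc['metadata'])
print(cleaned['cleaned_content'][:200])
print('%d references extracted' % len(cleaned['references']))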