From 2bc1a39b8b70b4bdf201a13bbab6d7dddd42f48a Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 11 Apr 2024 06:04:57 +0000 Subject: [PATCH] Search google and google scholar, add holocamera test --- DuBibtex.py | 21 ++++++++++++++------- tests/inputs/holocamera.bib | 10 ++++++++++ tests/test_find_dois.py | 24 ++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 tests/inputs/holocamera.bib diff --git a/DuBibtex.py b/DuBibtex.py index adaea3d..83d307e 100644 --- a/DuBibtex.py +++ b/DuBibtex.py @@ -66,7 +66,7 @@ class Re: acm = re.compile('citation\.cfm\?id\=([\d\.]+)', flags=re.MULTILINE) acmBib = re.compile('
(.+)<\/pre>',
                       flags=re.MULTILINE | re.IGNORECASE | re.S)
-  ieee = re.compile('ieee\.org\/document\/(\d+)', flags=re.MULTILINE)
+  ieee = re.compile('ieee\.org(?:\/abstract)?\/document\/(\d+)', flags=re.MULTILINE)
   year = re.compile('\w+(\d+)')
 
 
@@ -191,17 +191,21 @@ def write_current_item(self):
       # Searches for DOI.
       self.debug_bib('Missing DOI, search "%s"...' % self.cur['title'])
 
+      title_without_brackets = re.sub(r'\{|\}', '', self.cur['title'])
       if 'journal' in self.cur and self.cur['journal'][:5].lower() == 'arxiv':
         content = request_url('https://www.google.com/search?q=%s' %
-                              self.cur['title'])
+                              title_without_brackets)
         m = Re.urlArxiv.search(content)
         if m and len(m.groups()) > 0:
           self.cur['url'] = "https://arxiv.org/pdf/%s" % m.groups()[0]
-          self.debug_bib('Missing DOI, search "%s"...' % self.cur['title'])
+          self.debug_bib('Missing DOI, search "%s"...' % title_without_brackets)
       else:
-        d = google_lookup(self.cur['title'], self)
+        d = google_lookup(title_without_brackets, self)
         if not d:
-          d = crossref_lookup(self.cur['title'])
+          # Try again with google scholar.
+          d = google_lookup(title_without_brackets, self, use_scholar=True)
+        if not d:
+          d = crossref_lookup(title_without_brackets)
         if d:
           self.fix_doi(d)
         else:
@@ -338,8 +342,11 @@ def levenshtein(s1, s2):
   return previous_row[-1]
 
 
-def google_lookup(s, parser):
-  html = request_url('https://www.google.com/search?q=%s' % s)
+def google_lookup(s, parser, use_scholar=False):
+  if use_scholar:
+    html = request_url('https://scholar.google.com/scholar?q=%s' % s)
+  else:
+    html = request_url('https://www.google.com/search?q=%s' % s)
   with open('debug.txt', 'w', encoding='utf8') as f:
     f.write(html)
 
diff --git a/tests/inputs/holocamera.bib b/tests/inputs/holocamera.bib
new file mode 100644
index 0000000..96a8cf0
--- /dev/null
+++ b/tests/inputs/holocamera.bib
@@ -0,0 +1,10 @@
+@article{holocamera,
+  author={Heagerty, Jonathan and Li, Sida and Lee, Eric and Bhattacharyya, Shuvra and Bista, Sujal and Brawn, Barbara and Feng, Brandon and Jabbireddy, Susmija and JaJa, Joseph and Kacorri, Hernisa and Li, David and Yarnell, Derek and Zwicker, Matthias and Varshney, Amitabh},
+  journal={IEEE Transactions on Visualization and Computer Graphics}, 
+  title={{HoloCamera}: Advanced Volumetric Capture for Cinematic-Quality VR Applications},  
+  year={2024},
+  volume={},
+  number={},
+  pages={},
+  keywords={Volumetric Capture, Light Fields, Holoportation, Multi-camera Array}
+  }
\ No newline at end of file
diff --git a/tests/test_find_dois.py b/tests/test_find_dois.py
index c0d4d0c..bee775b 100644
--- a/tests/test_find_dois.py
+++ b/tests/test_find_dois.py
@@ -31,3 +31,27 @@ def test_iccv_doi(filename, correct_doi):
         for entry in generated_library.entries:
             assert "doi" in entry
             assert entry.fields_dict["doi"].value == correct_doi
+
+@pytest.mark.parametrize("filename,correct_doi", [
+    ("holocamera.bib", "10.1109/TVCG.2024.3372123"),
+])
+def test_tvcg_doi(filename, correct_doi):
+    input_file = os.path.join("tests", "inputs", filename)
+
+    assert os.path.isfile(input_file), f"File {input_file} does not exist"
+    library = bibtexparser.parse_file(input_file)
+
+    with tempfile.NamedTemporaryFile() as fp:
+        p = Parser(output_file=fp.name)
+        for entry in library.entries:
+            p.copy_from_parsed_entry(entry)
+            p.write_current_item()
+        p.shut_down()
+
+        # Check the doi
+        generated_library = bibtexparser.parse_file(fp.name)
+        assert len(generated_library.entries) == len(
+            library.entries), "Number of entries should be the same"
+        for entry in generated_library.entries:
+            assert "doi" in entry
+            assert entry.fields_dict["doi"].value == correct_doi
\ No newline at end of file