Merge pull request #151 from trigaten/fix-bugs
Fix-bugs and acl change
Mcilie committed Jan 16, 2024
2 parents f6741a6 + 4599b64 commit 9d54353
Showing 31 changed files with 30,524 additions and 196 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit.yml
@@ -12,6 +12,6 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.9"
+          python-version: "3.10"
       - run: pip install pre-commit -r requirements.txt
       - run: pre-commit run --all-files
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3
4 changes: 2 additions & 2 deletions .gitignore
@@ -12,7 +12,6 @@ filtered_arxiv_papers.csv
 scripts/arxiv_papers_with_abstract.csv
 scripts/arxiv_papers_with_ai_labels.csv
 papers_output/*
-data/arxiv_papers_for_human_review.csv
 papers
 scripts/master_papers.csv
 scripts/t.py
@@ -29,4 +28,5 @@ data/topic-model-data/detected-phrases
 data/topic-model-data/processed
 data/topic-model-data/topic-model-outputs
 data/topic-model-data/master_papers.csv
-data/SCS-training-data/
+data/experiments_output
+data/SCS-training-data/
16 changes: 0 additions & 16 deletions README.md
@@ -4,22 +4,6 @@
 
 after cloning, run `pip install -r requirements.txt` from root
 
-also you need to have ACL anthology library installed on your system
-
-```
-git clone https://github.com/acl-org/acl-anthology
-export ACLANTHOLOGY=$(pwd)/acl-anthology
-export PYTHONPATH=$ACLANTHOLOGY/bin:$PYTHONPATH
-```
-afterwards follow the complete install instructions on the acl anthology repo
-
-or alternatively, after you git clone you can add it to the python path using the `sys` module
-```
-import sys
-sys.path.append('/path/to/acl-anthology/bin')
-```
-
-
 ## Set up API keys
 
 Make a file at root called `.env`.
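Note: the manual ACL Anthology setup removed above is superseded by the `acl-anthology-py` package added to requirements.txt in this commit. A minimal usage sketch, assuming the package's `Anthology` entry point with `from_repo()`/`papers()` helpers (the exact API is not shown in this diff; check the package docs):

```
# Sketch under assumptions: acl-anthology-py is assumed to expose an Anthology
# class that fetches and caches the anthology metadata on its own.
from acl_anthology import Anthology

anthology = Anthology.from_repo()  # assumed helper: clones/caches the data repo
for paper in anthology.papers():   # assumed iterator over all papers
    print(paper.title)
    break
```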
30,294 changes: 30,294 additions & 0 deletions data/arxiv_papers_for_human_review.csv

Large diffs are not rendered by default.

13 changes: 0 additions & 13 deletions data/topic-gpt-data/README.md

This file was deleted.

58 changes: 0 additions & 58 deletions data/topic-model-data/README.md

This file was deleted.

20 changes: 0 additions & 20 deletions data/topic-model-data/unused_pyproject.txt

This file was deleted.

9 changes: 7 additions & 2 deletions main.py
@@ -1,14 +1,19 @@
 from prompt_systematic_review import collect_papers
 from prompt_systematic_review import config_data
 
 
 config_data.DataFolderPath = "./data"
 config_data.DotenvPath = "./.env"
 if not config_data.hasDownloadedPapers:
-    collect_papers()
+    collect_papers.collect()
     config_data.hasDownloadedPapers = True
 
 
 from prompt_systematic_review import experiments
 import os
 
+os.makedirs(config_data.DataFolderPath + os.sep + "experiments_output", exist_ok=True)
+print("Running experiments...")
 for experiment in experiments.experiments:
     experiment.run()
 
+print("Experiments completed. See data/experiments_output for output files")
3 changes: 3 additions & 0 deletions requirements.txt
@@ -29,4 +29,7 @@ sentence_transformers==2.2.2
 tenacity==8.2.3
 tiktoken==0.5.1
 anytree==2.12.1
+urllib3<2
+acl-anthology-py
+seaborn
 -e .
18 changes: 16 additions & 2 deletions src/prompt_systematic_review/collect_papers.py
@@ -13,11 +13,19 @@
 
 import pandas as pd
 import PyPDF2
+from PyPDF2.errors import PdfReadError
 from prompt_systematic_review.utils.utils import process_paper_title
 
 import openai
 import tqdm
 from dotenv import load_dotenv
+import logging
+
+# don't want to see warning messages when users are running
+pdflogger = logging.getLogger("PyPDF2")
+pdflogger.setLevel(logging.ERROR)
+urlLogger = logging.getLogger("urllib3")
+urlLogger.setLevel(logging.ERROR)
 
 load_dotenv(dotenv_path=DotenvPath)  # load all entries from .env file
 
@@ -63,6 +71,7 @@ def collect():
     # clean ACL CSV
     acl_df["title"] = acl_df["title"].apply(lambda x: process_paper_title(x))
     acl_df["source"] = "ACL"
+
     # combine dfs
     combined_df = pd.concat([semantic_scholar_df, arxiv_df, acl_df])
     # drop duplicates
@@ -76,6 +85,9 @@ def collect():
 
     data = list(zip(deduplicated_df["url"].tolist(), deduplicated_df["title"].tolist()))
 
+    # make papers folder if it doesn't already exist
+    os.makedirs(os.path.join(DataFolderPath, "papers"), exist_ok=True)
+
     NUM_PROCESSES = 12  # adjust as needed per your machine
     with ThreadPoolExecutor(max_workers=NUM_PROCESSES) as executor:
         executor.map(lambda p: downloadPaper(*p), data)
@@ -113,7 +125,9 @@ def collect():
             os.remove(file_path)
             # Drop the corresponding row from the dataframe
             deduplicated_df = deduplicated_df[deduplicated_df["title"] != filename[:-4]]
-            print(f"Error processing {filename}: {e}")
+            # PDFRead Error is likely because of corrupted or empty PDF, can be ignored
+            if str(e) != "EOF marker not found":
+                print(f"Error processing {filename}: {e}")
     # TODO: there is smtg weird going on here...
 
     # Get a list of all the paper titles in the directory (without the .pdf extension)
@@ -187,5 +201,5 @@ def collect():
         # Check if the file is a PDF and its title is not in df_titles
         if filename.endswith(".pdf") and filename[:-4] not in df_titles:
             # Remove the file
-            os.remove("papers/" + filename)
+            os.remove(DataFolderPath + os.sep + "papers" + os.sep + filename)
     df_combined.to_csv(os.path.join(DataFolderPath, "master_papers.csv"))
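The new error filter above keys on the message text "EOF marker not found". Since `PdfReadError` is now imported at the top of the file, filtering by exception type is another option; a minimal sketch of that alternative (not what this commit does):

```
import os
import PyPDF2
from PyPDF2.errors import PdfReadError


def check_pdf(file_path: str) -> bool:
    """Return True if the PDF parses; delete unreadable (corrupt/empty) files."""
    try:
        PyPDF2.PdfReader(file_path)
        return True
    except PdfReadError:
        # the corrupted/empty-PDF case the message check above targets
        os.remove(file_path)
        return False
```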
2 changes: 1 addition & 1 deletion src/prompt_systematic_review/config_data.py
@@ -3,7 +3,7 @@
 
 DataFolderPath = os.path.abspath("./data")
 DotenvPath = os.path.abspath("./.env")
-hasDownloadedPapers = True
+hasDownloadedPapers = False
 
 
 def setDownloadedPapers(hasDownloadedPapers):
3 changes: 2 additions & 1 deletion src/prompt_systematic_review/experiments/__init__.py
@@ -11,10 +11,10 @@
 from . import graph_gpt_4_benchmarks200
 from . import graph_gpt_3_5_benchmarks
 from . import run_tomotopy
+from . import topicgpt
 
 
 experiments = [
     count_models.Experiment,
     count_tool_mentions.Experiment,
     eval_prompts.Experiment,
     evaluate_human_agreement.Experiment,
@@ -27,4 +28,5 @@
     graph_gpt_4_benchmarks200.Experiment,
     graph_gpt_3_5_benchmarks.Experiment,
     run_tomotopy.Experiment,
+    topicgpt.Experiment,
 ]
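Each entry in this registry is a module-level `Experiment` that main.py drives via `run()` (see the main.py hunk above). A minimal sketch of a conforming module (the module name and body are hypothetical, and the diff does not show whether `Experiment` is a class or an instance; this sketch assumes an instance):

```
# hypothetical module: src/prompt_systematic_review/experiments/my_experiment.py

class _MyExperiment:
    def run(self):
        # main.py only requires a run() method on each registry entry
        print("Running my experiment...")


# module-level entry, mirroring count_models.Experiment and friends
Experiment = _MyExperiment()
```

A new module would then be registered with `from . import my_experiment` and `my_experiment.Experiment,` in the list above.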
2 changes: 1 addition & 1 deletion src/prompt_systematic_review/experiments/count_models.py
@@ -89,7 +89,7 @@ def count_model_mentions(folder_path):
 
     output_file_path = os.path.join(DataFolderPath, "model_citation_counts.csv")
 
-    with open(output_file_path, "w") as f:
+    with open(output_file_path, "w", encoding="utf-8") as f:
         fieldnames = ["model_name", "count", "list_of_papers"]
 
         # Create a CSV writer object
18 changes: 13 additions & 5 deletions src/prompt_systematic_review/experiments/count_tool_mentions.py
@@ -37,7 +37,7 @@ def count_tool_mentions(input_folder_path: str, output_file_path: str, tool_lst:
 
     print("tool_counts: ", tool_counts)
 
-    with open(output_file_path, "w") as f:
+    with open(output_file_path, "w", encoding="utf-8") as f:
         fieldnames = ["tool_name", "count", "list_of_papers"]
 
         # Create a CSV writer object
@@ -55,7 +55,7 @@ def count_tool_mentions(input_folder_path: str, output_file_path: str, tool_lst:
 
 def run_count_tool_mentions():
     # script portion
-    masterpaperscsv_file_path = DataFolderPath
+    masterpaperscsv_file_path = os.path.join(DataFolderPath, "master_papers.csv")
 
     # get all paper ids from our dataset
     arxiv_papers_df = pd.read_csv(masterpaperscsv_file_path)
@@ -103,7 +103,9 @@ def run_count_tool_mentions():
     ]
     count_tool_mentions(
         papers_dataset_path,
-        os.path.join(DataFolderPath, "model_citation_counts.csv"),
+        os.path.join(
+            DataFolderPath, "experiments_output" + os.sep + "model_citation_counts.csv"
+        ),
         model_names,
     )
 
@@ -123,7 +125,10 @@ def run_count_tool_mentions():
     ]
     count_tool_mentions(
         papers_dataset_path,
-        os.path.join(DataFolderPath, "dataset_citation_counts.csv"),
+        os.path.join(
+            DataFolderPath,
+            "experiments_output" + os.sep + "dataset_citation_counts.csv",
+        ),
         dataset_names,
    )
 
@@ -159,7 +164,10 @@ def run_count_tool_mentions():
     ]
     count_tool_mentions(
         papers_dataset_path,
-        os.path.join(DataFolderPath, "framework_citation_counts.csv"),
+        os.path.join(
+            DataFolderPath,
+            "experiments_output" + os.sep + "framework_citation_counts.csv",
+        ),
         framework_names,
     )
 
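The new output paths mix `os.path.join` with manual `os.sep` concatenation. Since `os.path.join` accepts any number of components, an equivalent form would be (a style note, not part of this commit):

```
import os

DataFolderPath = os.path.abspath("./data")  # as set in config_data.py

# same result as os.path.join(DataFolderPath, "experiments_output" + os.sep + "model_citation_counts.csv")
output_path = os.path.join(DataFolderPath, "experiments_output", "model_citation_counts.csv")
```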
5 changes: 4 additions & 1 deletion src/prompt_systematic_review/experiments/eval_prompts.py
@@ -49,7 +49,10 @@ def eval_prompts():
     current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 
     # File path for the JSON file
-    file_path = os.path.join(DataFolderPath, "RP_eval_results_{current_datetime}.json")
+    file_path = os.path.join(
+        DataFolderPath,
+        "experiments_output" + os.sep + f"RP_eval_results_{current_datetime}.json",
+    )
 
     # Writing the dictionary to a JSON file
     with open(file_path, "w") as json_file:
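Besides moving the output into experiments_output, this hunk fixes a bug: the old call lacked the `f` prefix, so the filename contained the literal text `{current_datetime}`. A minimal illustration:

```
from datetime import datetime

current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

print("RP_eval_results_{current_datetime}.json")   # old: braces kept literally
print(f"RP_eval_results_{current_datetime}.json")  # new: timestamp interpolated
```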
src/prompt_systematic_review/experiments/evaluate_human_agreement.py
@@ -33,14 +33,19 @@ def evaluate_human_agreement(inputFile="arxiv_papers_with_abstract.csv"):
         df.loc[i, "Probability"] = result["Probability"]
         df.loc[i, "Reasoning"] = result["Reasoning"]
 
-    df.to_csv(os.path.join(DataFolderPath, "arxiv_papers_with_ai_labels.csv"))
+    df.to_csv(
+        os.path.join(
+            DataFolderPath,
+            "experiments_output" + os.sep + "arxiv_papers_with_ai_labels.csv",
+        )
+    )
     blacklist = pd.read_csv(os.path.join(DataFolderPath, "blacklist.csv"))
-    blacklist["Title"] = blacklist["Title"].apply(lambda x: process_paper_title(x))
+    blacklist["title"] = blacklist["title"].apply(lambda x: process_paper_title(x))
     df["title"] = df["title"].apply(lambda x: process_paper_title(x))
 
     # df = df.iloc[400:800]
     df_limited = df.copy()  # .iloc[400:800]
-    df_limited["human_review"] = ~df_limited["title"].isin(blacklist["Title"])
+    df_limited["human_review"] = ~df_limited["title"].isin(blacklist["title"])
     keepables = ["highly relevant", "somewhat relevant", "neutral"]
 
     df_limited["AI_keep"] = df_limited["Probability"].map(
9 changes: 8 additions & 1 deletion src/prompt_systematic_review/experiments/generation_1.py
@@ -7,6 +7,8 @@
 
 import argparse
 import os
 
+# This file is used in tandem with topicgpt. topicgpt.py is the file that runs the functions in this file.
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
@@ -112,6 +114,9 @@ def generate_topics(
     running_dups = 0
     topic_format = regex.compile("^\[(\d+)\] ([\w\s]+):(.+)")
 
+    # Setup client
+    client = client_setup()
+
     for i, doc in enumerate(tqdm(docs)):
         prompt = prompt_formatting(
             generation_prompt,
@@ -123,7 +128,9 @@ def generate_topics(
             verbose,
         )
         try:
-            response = api_call(prompt, deployment_name, temperature, max_tokens, top_p)
+            response = api_call(
+                prompt, deployment_name, temperature, max_tokens, top_p, client
+            )
             topics = response.split("\n")
             for t in topics:
                 t = t.strip()
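This hunk builds a single `client` via `client_setup()` and threads it through `api_call`, rather than constructing one per request. Neither function body appears in the diff; a sketch of the pattern, assuming both wrap the OpenAI v1 client (signatures inferred from the call site above):

```
from openai import OpenAI


def client_setup():
    # assumed: construct the API client once and reuse it for every call
    return OpenAI()


def api_call(prompt, deployment_name, temperature, max_tokens, top_p, client):
    # assumed body; only this signature is visible in the hunk above
    response = client.chat.completions.create(
        model=deployment_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
    )
    return response.choices[0].message.content
```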
(Diffs for the remaining changed files were not loaded.)
