Merge pull request #151 from trigaten/fix-bugs
Fix-bugs and acl change
Mcilie committed Jan 16, 2024
2 parents f6741a6 + 4599b64 commit 9d54353
Showing 31 changed files with 30,524 additions and 196 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit.yml
@@ -12,6 +12,6 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.9"
+          python-version: "3.10"
       - run: pip install pre-commit -r requirements.txt
       - run: pre-commit run --all-files
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3
4 changes: 2 additions & 2 deletions .gitignore
@@ -12,7 +12,6 @@ filtered_arxiv_papers.csv
 scripts/arxiv_papers_with_abstract.csv
 scripts/arxiv_papers_with_ai_labels.csv
 papers_output/*
-data/arxiv_papers_for_human_review.csv
 papers
 scripts/master_papers.csv
 scripts/t.py
@@ -29,4 +28,5 @@ data/topic-model-data/detected-phrases
 data/topic-model-data/processed
 data/topic-model-data/topic-model-outputs
 data/topic-model-data/master_papers.csv
-data/SCS-training-data/
+data/experiments_output
+data/SCS-training-data/
16 changes: 0 additions & 16 deletions README.md
@@ -4,22 +4,6 @@
 
 after cloning, run `pip install -r requirements.txt` from root
 
-also you need to have ACL anthology library installed on your system
-
-```
-git clone https://github.com/acl-org/acl-anthology
-export ACLANTHOLOGY=$(pwd)/acl-anthology
-export PYTHONPATH=$ACLANTHOLOGY/bin:$PYTHONPATH
-```
-afterwards follow the complete install instructions on the acl anthology repo
-
-or alternatively, after you git clone you can add it to the python path using the `sys` module
-```
-import sys
-sys.path.append('/path/to/acl-anthology/bin')
-```
-
-
 ## Set up API keys
 
 Make a file at root called `.env`.
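Note: the manual ACL Anthology setup removed above is superseded by the `acl-anthology-py` package added to requirements.txt in this commit. A minimal usage sketch, assuming the package's `Anthology` entry point with `from_repo()`/`papers()` helpers (the exact API is not shown in this diff; check the package docs):

```
# Sketch under assumptions: acl-anthology-py is assumed to expose an Anthology
# class that fetches and caches the anthology metadata on its own.
from acl_anthology import Anthology

anthology = Anthology.from_repo()  # assumed helper: clones/caches the data repo
for paper in anthology.papers():   # assumed iterator over all papers
    print(paper.title)
    break
```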
30,294 changes: 30,294 additions & 0 deletions data/arxiv_papers_for_human_review.csv

Large diffs are not rendered by default.

13 changes: 0 additions & 13 deletions data/topic-gpt-data/README.md

This file was deleted.

58 changes: 0 additions & 58 deletions data/topic-model-data/README.md

This file was deleted.

20 changes: 0 additions & 20 deletions data/topic-model-data/unused_pyproject.txt

This file was deleted.

9 changes: 7 additions & 2 deletions main.py
@@ -1,14 +1,19 @@
 from prompt_systematic_review import collect_papers
 from prompt_systematic_review import config_data
 
 
 config_data.DataFolderPath = "./data"
 config_data.DotenvPath = "./.env"
 if not config_data.hasDownloadedPapers:
-    collect_papers()
+    collect_papers.collect()
     config_data.hasDownloadedPapers = True
 
 
 from prompt_systematic_review import experiments
 import os
 
+os.makedirs(config_data.DataFolderPath + os.sep + "experiments_output", exist_ok=True)
+print("Running experiments...")
 for experiment in experiments.experiments:
     experiment.run()
 
+print("Experiments completed. See data/experiments_output for output files")
3 changes: 3 additions & 0 deletions requirements.txt
@@ -29,4 +29,7 @@ sentence_transformers==2.2.2
 tenacity==8.2.3
 tiktoken==0.5.1
 anytree==2.12.1
+urllib3<2
+acl-anthology-py
+seaborn
 -e .
18 changes: 16 additions & 2 deletions src/prompt_systematic_review/collect_papers.py
@@ -13,11 +13,19 @@
 
 import pandas as pd
 import PyPDF2
+from PyPDF2.errors import PdfReadError
 from prompt_systematic_review.utils.utils import process_paper_title
 
 import openai
 import tqdm
 from dotenv import load_dotenv
+import logging
+
+# don't want to see warning messages when users are running
+pdflogger = logging.getLogger("PyPDF2")
+pdflogger.setLevel(logging.ERROR)
+urlLogger = logging.getLogger("urllib3")
+urlLogger.setLevel(logging.ERROR)
 
 load_dotenv(dotenv_path=DotenvPath)  # load all entries from .env file
 
@@ -63,6 +71,7 @@ def collect():
     # clean ACL CSV
     acl_df["title"] = acl_df["title"].apply(lambda x: process_paper_title(x))
     acl_df["source"] = "ACL"
+
     # combine dfs
     combined_df = pd.concat([semantic_scholar_df, arxiv_df, acl_df])
     # drop duplicates
@@ -76,6 +85,9 @@ def collect():
 
     data = list(zip(deduplicated_df["url"].tolist(), deduplicated_df["title"].tolist()))
 
+    # make papers folder if it doesn't already exist
+    os.makedirs(os.path.join(DataFolderPath, "papers"), exist_ok=True)
+
     NUM_PROCESSES = 12  # adjust as needed per your machine
     with ThreadPoolExecutor(max_workers=NUM_PROCESSES) as executor:
         executor.map(lambda p: downloadPaper(*p), data)
@@ -113,7 +125,9 @@ def collect():
             os.remove(file_path)
             # Drop the corresponding row from the dataframe
             deduplicated_df = deduplicated_df[deduplicated_df["title"] != filename[:-4]]
-            print(f"Error processing {filename}: {e}")
+            # PDFRead Error is likely because of corrupted or empty PDF, can be ignored
+            if str(e) != "EOF marker not found":
+                print(f"Error processing {filename}: {e}")
     # TODO: there is smtg weird going on here...
 
     # Get a list of all the paper titles in the directory (without the .pdf extension)
@@ -187,5 +201,5 @@ def collect():
         # Check if the file is a PDF and its title is not in df_titles
         if filename.endswith(".pdf") and filename[:-4] not in df_titles:
             # Remove the file
-            os.remove("papers/" + filename)
+            os.remove(DataFolderPath + os.sep + "papers" + os.sep + filename)
     df_combined.to_csv(os.path.join(DataFolderPath, "master_papers.csv"))
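The new error filter above keys on the message text "EOF marker not found". Since `PdfReadError` is now imported at the top of the file, filtering by exception type is another option; a minimal sketch of that alternative (not what this commit does):

```
import os
import PyPDF2
from PyPDF2.errors import PdfReadError


def check_pdf(file_path: str) -> bool:
    """Return True if the PDF parses; delete unreadable (corrupt/empty) files."""
    try:
        PyPDF2.PdfReader(file_path)
        return True
    except PdfReadError:
        # the corrupted/empty-PDF case the message check above targets
        os.remove(file_path)
        return False
```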
2 changes: 1 addition & 1 deletion src/prompt_systematic_review/config_data.py
@@ -3,7 +3,7 @@
 
 DataFolderPath = os.path.abspath("./data")
 DotenvPath = os.path.abspath("./.env")
-hasDownloadedPapers = True
+hasDownloadedPapers = False
 
 
 def setDownloadedPapers(hasDownloadedPapers):
3 changes: 2 additions & 1 deletion src/prompt_systematic_review/experiments/__init__.py
@@ -11,10 +11,10 @@
 from . import graph_gpt_4_benchmarks200
 from . import graph_gpt_3_5_benchmarks
 from . import run_tomotopy
+from . import topicgpt
 
 
 experiments = [
     count_models.Experiment,
     count_tool_mentions.Experiment,
     eval_prompts.Experiment,
     evaluate_human_agreement.Experiment,
@@ -27,4 +28,5 @@
     graph_gpt_4_benchmarks200.Experiment,
     graph_gpt_3_5_benchmarks.Experiment,
     run_tomotopy.Experiment,
+    topicgpt.Experiment,
 ]
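Each entry in this registry is a module-level `Experiment` that main.py drives via `run()` (see the main.py hunk above). A minimal sketch of a conforming module (the module name and body are hypothetical, and the diff does not show whether `Experiment` is a class or an instance; this sketch assumes an instance):

```
# hypothetical module: src/prompt_systematic_review/experiments/my_experiment.py

class _MyExperiment:
    def run(self):
        # main.py only requires a run() method on each registry entry
        print("Running my experiment...")


# module-level entry, mirroring count_models.Experiment and friends
Experiment = _MyExperiment()
```

A new module would then be registered with `from . import my_experiment` and `my_experiment.Experiment,` in the list above.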
2 changes: 1 addition & 1 deletion src/prompt_systematic_review/experiments/count_models.py
@@ -89,7 +89,7 @@ def count_model_mentions(folder_path):
 
     output_file_path = os.path.join(DataFolderPath, "model_citation_counts.csv")
 
-    with open(output_file_path, "w") as f:
+    with open(output_file_path, "w", encoding="utf-8") as f:
         fieldnames = ["model_name", "count", "list_of_papers"]
 
         # Create a CSV writer object
18 changes: 13 additions & 5 deletions src/prompt_systematic_review/experiments/count_tool_mentions.py
@@ -37,7 +37,7 @@ def count_tool_mentions(input_folder_path: str, output_file_path: str, tool_lst:
 
     print("tool_counts: ", tool_counts)
 
-    with open(output_file_path, "w") as f:
+    with open(output_file_path, "w", encoding="utf-8") as f:
         fieldnames = ["tool_name", "count", "list_of_papers"]
 
         # Create a CSV writer object
@@ -55,7 +55,7 @@ def count_tool_mentions(input_folder_path: str, output_file_path: str, tool_lst:
 
 def run_count_tool_mentions():
     # script portion
-    masterpaperscsv_file_path = DataFolderPath
+    masterpaperscsv_file_path = os.path.join(DataFolderPath, "master_papers.csv")
 
     # get all paper ids from our dataset
     arxiv_papers_df = pd.read_csv(masterpaperscsv_file_path)
@@ -103,7 +103,9 @@ def run_count_tool_mentions():
     ]
     count_tool_mentions(
         papers_dataset_path,
-        os.path.join(DataFolderPath, "model_citation_counts.csv"),
+        os.path.join(
+            DataFolderPath, "experiments_output" + os.sep + "model_citation_counts.csv"
+        ),
         model_names,
     )
 
@@ -123,7 +125,10 @@ def run_count_tool_mentions():
     ]
     count_tool_mentions(
         papers_dataset_path,
-        os.path.join(DataFolderPath, "dataset_citation_counts.csv"),
+        os.path.join(
+            DataFolderPath,
+            "experiments_output" + os.sep + "dataset_citation_counts.csv",
+        ),
         dataset_names,
    )
 
@@ -159,7 +164,10 @@ def run_count_tool_mentions():
     ]
     count_tool_mentions(
         papers_dataset_path,
-        os.path.join(DataFolderPath, "framework_citation_counts.csv"),
+        os.path.join(
+            DataFolderPath,
+            "experiments_output" + os.sep + "framework_citation_counts.csv",
+        ),
         framework_names,
     )
 
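The new output paths mix `os.path.join` with manual `os.sep` concatenation. Since `os.path.join` accepts any number of components, an equivalent form would be (a style note, not part of this commit):

```
import os

DataFolderPath = os.path.abspath("./data")  # as set in config_data.py

# same result as os.path.join(DataFolderPath, "experiments_output" + os.sep + "model_citation_counts.csv")
output_path = os.path.join(DataFolderPath, "experiments_output", "model_citation_counts.csv")
```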
5 changes: 4 additions & 1 deletion src/prompt_systematic_review/experiments/eval_prompts.py
@@ -49,7 +49,10 @@ def eval_prompts():
     current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 
     # File path for the JSON file
-    file_path = os.path.join(DataFolderPath, "RP_eval_results_{current_datetime}.json")
+    file_path = os.path.join(
+        DataFolderPath,
+        "experiments_output" + os.sep + f"RP_eval_results_{current_datetime}.json",
+    )
 
     # Writing the dictionary to a JSON file
     with open(file_path, "w") as json_file:
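Besides moving the output into experiments_output, this hunk fixes a bug: the old call lacked the `f` prefix, so the filename contained the literal text `{current_datetime}`. A minimal illustration:

```
from datetime import datetime

current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

print("RP_eval_results_{current_datetime}.json")   # old: braces kept literally
print(f"RP_eval_results_{current_datetime}.json")  # new: timestamp interpolated
```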
src/prompt_systematic_review/experiments/evaluate_human_agreement.py
@@ -33,14 +33,19 @@ def evaluate_human_agreement(inputFile="arxiv_papers_with_abstract.csv"):
         df.loc[i, "Probability"] = result["Probability"]
         df.loc[i, "Reasoning"] = result["Reasoning"]
 
-    df.to_csv(os.path.join(DataFolderPath, "arxiv_papers_with_ai_labels.csv"))
+    df.to_csv(
+        os.path.join(
+            DataFolderPath,
+            "experiments_output" + os.sep + "arxiv_papers_with_ai_labels.csv",
+        )
+    )
     blacklist = pd.read_csv(os.path.join(DataFolderPath, "blacklist.csv"))
-    blacklist["Title"] = blacklist["Title"].apply(lambda x: process_paper_title(x))
+    blacklist["title"] = blacklist["title"].apply(lambda x: process_paper_title(x))
     df["title"] = df["title"].apply(lambda x: process_paper_title(x))
 
     # df = df.iloc[400:800]
     df_limited = df.copy()  # .iloc[400:800]
-    df_limited["human_review"] = ~df_limited["title"].isin(blacklist["Title"])
+    df_limited["human_review"] = ~df_limited["title"].isin(blacklist["title"])
     keepables = ["highly relevant", "somewhat relevant", "neutral"]
 
     df_limited["AI_keep"] = df_limited["Probability"].map(
9 changes: 8 additions & 1 deletion src/prompt_systematic_review/experiments/generation_1.py
@@ -7,6 +7,8 @@
 
 import argparse
 import os
 
+# This file is used in tandem with topicgpt. topicgpt.py is the file that runs the functions in this file.
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
@@ -112,6 +114,9 @@ def generate_topics(
     running_dups = 0
     topic_format = regex.compile("^\[(\d+)\] ([\w\s]+):(.+)")
 
+    # Setup client
+    client = client_setup()
+
     for i, doc in enumerate(tqdm(docs)):
         prompt = prompt_formatting(
             generation_prompt,
@@ -123,7 +128,9 @@ def generate_topics(
             verbose,
         )
         try:
-            response = api_call(prompt, deployment_name, temperature, max_tokens, top_p)
+            response = api_call(
+                prompt, deployment_name, temperature, max_tokens, top_p, client
+            )
             topics = response.split("\n")
             for t in topics:
                 t = t.strip()
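This hunk builds a single `client` via `client_setup()` and threads it through `api_call`, rather than constructing one per request. Neither function body appears in the diff; a sketch of the pattern, assuming both wrap the OpenAI v1 client (signatures inferred from the call site above):

```
from openai import OpenAI


def client_setup():
    # assumed: construct the API client once and reuse it for every call
    return OpenAI()


def api_call(prompt, deployment_name, temperature, max_tokens, top_p, client):
    # assumed body; only this signature is visible in the hunk above
    response = client.chat.completions.create(
        model=deployment_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
    )
    return response.choices[0].message.content
```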
(Diffs for the remaining changed files were not loaded.)
