Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix-bugs and acl change #151

Merged
merged 9 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.9"
python-version: "3.10"
- run: pip install pre-commit -r requirements.txt
- run: pre-commit run --all-files
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]
python-version: ["3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ filtered_arxiv_papers.csv
scripts/arxiv_papers_with_abstract.csv
scripts/arxiv_papers_with_ai_labels.csv
papers_output/*
data/arxiv_papers_for_human_review.csv
papers
scripts/master_papers.csv
scripts/t.py
Expand All @@ -29,4 +28,5 @@ data/topic-model-data/detected-phrases
data/topic-model-data/processed
data/topic-model-data/topic-model-outputs
data/topic-model-data/master_papers.csv
data/SCS-training-data/
data/experiments_output
data/SCS-training-data/
16 changes: 0 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,6 @@

after cloning, run `pip install -r requirements.txt` from root

also you need to have ACL anthology library installed on your system

```
git clone https://github.com/acl-org/acl-anthology
export ACLANTHOLOGY=$(pwd)/acl-anthology
export PYTHONPATH=$ACLANTHOLOGY/bin:$PYTHONPATH
```
afterwards follow the complete install instructions on the acl anthology repo

or alternatively, after you git clone you can add it to the python path using the `sys` module
```
import sys
sys.path.append('/path/to/acl-anthology/bin')
```


## Set up API keys

Make a file at root called `.env`.
Expand Down
30,294 changes: 30,294 additions & 0 deletions data/arxiv_papers_for_human_review.csv

Large diffs are not rendered by default.

13 changes: 0 additions & 13 deletions data/topic-gpt-data/README.md

This file was deleted.

58 changes: 0 additions & 58 deletions data/topic-model-data/README.md

This file was deleted.

20 changes: 0 additions & 20 deletions data/topic-model-data/unused_pyproject.txt

This file was deleted.

9 changes: 7 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
from prompt_systematic_review import collect_papers
from prompt_systematic_review import config_data


config_data.DataFolderPath = "./data"
config_data.DotenvPath = "./.env"
if not config_data.hasDownloadedPapers:
collect_papers()
collect_papers.collect()
config_data.hasDownloadedPapers = True


from prompt_systematic_review import experiments
import os

os.makedirs(config_data.DataFolderPath + os.sep + "experiments_output", exist_ok=True)
print("Running experiments...")
for experiment in experiments.experiments:
experiment.run()

print("Experiments completed. See data/experiments_output for output files")
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ sentence_transformers==2.2.2
tenacity==8.2.3
tiktoken==0.5.1
anytree==2.12.1
urllib3<2
acl-anthology-py
seaborn
-e .
18 changes: 16 additions & 2 deletions src/prompt_systematic_review/collect_papers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,19 @@

import pandas as pd
import PyPDF2
from PyPDF2.errors import PdfReadError
from prompt_systematic_review.utils.utils import process_paper_title

import openai
import tqdm
from dotenv import load_dotenv
import logging

# don't want to see warning messages when users are running
pdflogger = logging.getLogger("PyPDF2")
pdflogger.setLevel(logging.ERROR)
urlLogger = logging.getLogger("urllib3")
urlLogger.setLevel(logging.ERROR)

load_dotenv(dotenv_path=DotenvPath) # load all entries from .env file

Expand Down Expand Up @@ -63,6 +71,7 @@ def collect():
# clean ACL CSV
acl_df["title"] = acl_df["title"].apply(lambda x: process_paper_title(x))
acl_df["source"] = "ACL"

# combine dfs
combined_df = pd.concat([semantic_scholar_df, arxiv_df, acl_df])
# drop duplicates
Expand All @@ -76,6 +85,9 @@ def collect():

data = list(zip(deduplicated_df["url"].tolist(), deduplicated_df["title"].tolist()))

# make papers folder if it doesn't already exist
os.makedirs(os.path.join(DataFolderPath, "papers"), exist_ok=True)

NUM_PROCESSES = 12 # adjust as needed per your machine
with ThreadPoolExecutor(max_workers=NUM_PROCESSES) as executor:
executor.map(lambda p: downloadPaper(*p), data)
Expand Down Expand Up @@ -113,7 +125,9 @@ def collect():
os.remove(file_path)
# Drop the corresponding row from the dataframe
deduplicated_df = deduplicated_df[deduplicated_df["title"] != filename[:-4]]
print(f"Error processing {filename}: {e}")
# PDFRead Error is likely because of corrupted or empty PDF, can be ignored
if str(e) != "EOF marker not found":
print(f"Error processing {filename}: {e}")
# TODO: there is smtg weird going on here...

# Get a list of all the paper titles in the directory (without the .pdf extension)
Expand Down Expand Up @@ -187,5 +201,5 @@ def collect():
# Check if the file is a PDF and its title is not in df_titles
if filename.endswith(".pdf") and filename[:-4] not in df_titles:
# Remove the file
os.remove("papers/" + filename)
os.remove(DataFolderPath + os.sep + "papers" + os.sep + filename)
df_combined.to_csv(os.path.join(DataFolderPath, "master_papers.csv"))
2 changes: 1 addition & 1 deletion src/prompt_systematic_review/config_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

DataFolderPath = os.path.abspath("./data")
DotenvPath = os.path.abspath("./.env")
hasDownloadedPapers = True
hasDownloadedPapers = False


def setDownloadedPapers(hasDownloadedPapers):
Expand Down
3 changes: 2 additions & 1 deletion src/prompt_systematic_review/experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
from . import graph_gpt_4_benchmarks200
from . import graph_gpt_3_5_benchmarks
from . import run_tomotopy
from . import topicgpt


experiments = [
count_models.Experiment,
count_tool_mentions.Experiment,
eval_prompts.Experiment,
evaluate_human_agreement.Experiment,
Expand All @@ -27,4 +27,5 @@
graph_gpt_4_benchmarks200.Experiment,
graph_gpt_3_5_benchmarks.Experiment,
run_tomotopy.Experiment,
topicgpt.Experiment,
]
2 changes: 1 addition & 1 deletion src/prompt_systematic_review/experiments/count_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def count_model_mentions(folder_path):

output_file_path = os.path.join(DataFolderPath, "model_citation_counts.csv")

with open(output_file_path, "w") as f:
with open(output_file_path, "w", encoding="utf-8") as f:
fieldnames = ["model_name", "count", "list_of_papers"]

# Create a CSV writer object
Expand Down
18 changes: 13 additions & 5 deletions src/prompt_systematic_review/experiments/count_tool_mentions.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def count_tool_mentions(input_folder_path: str, output_file_path: str, tool_lst:

print("tool_counts: ", tool_counts)

with open(output_file_path, "w") as f:
with open(output_file_path, "w", encoding="utf-8") as f:
fieldnames = ["tool_name", "count", "list_of_papers"]

# Create a CSV writer object
Expand All @@ -55,7 +55,7 @@ def count_tool_mentions(input_folder_path: str, output_file_path: str, tool_lst:

def run_count_tool_mentions():
# script portion
masterpaperscsv_file_path = DataFolderPath
masterpaperscsv_file_path = os.path.join(DataFolderPath, "master_papers.csv")

# get all paper ids from our dataset
arxiv_papers_df = pd.read_csv(masterpaperscsv_file_path)
Expand Down Expand Up @@ -103,7 +103,9 @@ def run_count_tool_mentions():
]
count_tool_mentions(
papers_dataset_path,
os.path.join(DataFolderPath, "model_citation_counts.csv"),
os.path.join(
DataFolderPath, "experiments_output" + os.sep + "model_citation_counts.csv"
),
model_names,
)

Expand All @@ -123,7 +125,10 @@ def run_count_tool_mentions():
]
count_tool_mentions(
papers_dataset_path,
os.path.join(DataFolderPath, "dataset_citation_counts.csv"),
os.path.join(
DataFolderPath,
"experiments_output" + os.sep + "dataset_citation_counts.csv",
),
dataset_names,
)

Expand Down Expand Up @@ -159,7 +164,10 @@ def run_count_tool_mentions():
]
count_tool_mentions(
papers_dataset_path,
os.path.join(DataFolderPath, "framework_citation_counts.csv"),
os.path.join(
DataFolderPath,
"experiments_output" + os.sep + "framework_citation_counts.csv",
),
framework_names,
)

Expand Down
5 changes: 4 additions & 1 deletion src/prompt_systematic_review/experiments/eval_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def eval_prompts():
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# File path for the JSON file
file_path = os.path.join(DataFolderPath, "RP_eval_results_{current_datetime}.json")
file_path = os.path.join(
DataFolderPath,
"experiments_output" + os.sep + f"RP_eval_results_{current_datetime}.json",
)

# Writing the dictionary to a JSON file
with open(file_path, "w") as json_file:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,19 @@ def evaluate_human_agreement(inputFile="arxiv_papers_with_abstract.csv"):
df.loc[i, "Probability"] = result["Probability"]
df.loc[i, "Reasoning"] = result["Reasoning"]

df.to_csv(os.path.join(DataFolderPath, "arxiv_papers_with_ai_labels.csv"))
df.to_csv(
os.path.join(
DataFolderPath,
"experiments_output" + os.sep + "arxiv_papers_with_ai_labels.csv",
)
)
blacklist = pd.read_csv(os.path.join(DataFolderPath, "blacklist.csv"))
blacklist["Title"] = blacklist["Title"].apply(lambda x: process_paper_title(x))
blacklist["title"] = blacklist["title"].apply(lambda x: process_paper_title(x))
df["title"] = df["title"].apply(lambda x: process_paper_title(x))

# df = df.iloc[400:800]
df_limited = df.copy() # .iloc[400:800]
df_limited["human_review"] = ~df_limited["title"].isin(blacklist["Title"])
df_limited["human_review"] = ~df_limited["title"].isin(blacklist["title"])
keepables = ["highly relevant", "somewhat relevant", "neutral"]

df_limited["AI_keep"] = df_limited["Probability"].map(
Expand Down
9 changes: 8 additions & 1 deletion src/prompt_systematic_review/experiments/generation_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import argparse
import os

# This file is used in tandem with topicgpt. topicgpt.py is the file that runs the functions in this file.

os.environ["TOKENIZERS_PARALLELISM"] = "false"


Expand Down Expand Up @@ -112,6 +114,9 @@ def generate_topics(
running_dups = 0
topic_format = regex.compile("^\[(\d+)\] ([\w\s]+):(.+)")

# Setup client
client = client_setup()

for i, doc in enumerate(tqdm(docs)):
prompt = prompt_formatting(
generation_prompt,
Expand All @@ -123,7 +128,9 @@ def generate_topics(
verbose,
)
try:
response = api_call(prompt, deployment_name, temperature, max_tokens, top_p)
response = api_call(
prompt, deployment_name, temperature, max_tokens, top_p, client
)
topics = response.split("\n")
for t in topics:
t = t.strip()
Expand Down
Loading