diff --git a/README.md b/README.md new file mode 100644 index 0000000..9495d80 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# Natural Language Processing with Amazon Reviews + +## Setup & Installation + +1. Install Python 3 and the necessary packages. +2. To run `proProcessBagOfWords.ipynb`, you first need to [download the spreadsheet files](https://www.kaggle.com/datasets/yacharki/amazon-reviews-for-sentianalysis-finegrained-csv), which are too large to store on GitHub. +3. Some `test` and `train` spreadsheet files have already been generated, so you can skip step 2 if you prefer. \ No newline at end of file diff --git a/amazonNLP.ipynb b/amazonNLP.ipynb index 52b0359..039f325 100644 --- a/amazonNLP.ipynb +++ b/amazonNLP.ipynb @@ -308,15 +308,13 @@ " # Train with allWords\n", " clf.fit(X_train, y_train, **arg)\n", "\n", - " grid = HalvingGridSearchCV(clf, param_grid) # If we want Cross Validation: cv=pdfsplt\n", + " grid = HalvingGridSearchCV(clf, param_grid)\n", "\n", " grid.fit(X_paramTuning, y_paramTuning, **arg)\n", "\n", " print(\"Best grid params:\", grid.best_params_)\n", "\n", - " return grid.predict(testAllWords)\n", - "\n", - "# MultinomialNB()" + " return grid.predict(testAllWords)\n" ] }, { @@ -351,7 +349,7 @@ " currData = data[index]\n", " x, y, title = prepare_statistic(currData)\n", " axis = axs[i, j]\n", - " axis.set_ylim(bottom=0.0, top=1.0) # Uncomment this to see yAxis from [0, 1]\n", + " axis.set_ylim(bottom=0.0, top=1.0)\n", " axis.bar(x, y)\n", " axis.set_title(title)\n", " axis.grid(True)\n", @@ -521,8 +519,8 @@ "begin = time.time()\n", "prediction = predict(MultinomialNB(), sample_weight=1.0)\n", "end = time.time()\n", - "# prediction = gridPredict(MultinomialNB(), {\"alpha\": [0.0001, 0.5, 1.0]}, sample_weight=1.0)\n", "# prediction = fullPredict(MultinomialNB(), {\"alpha\": [0.0001, 0.5, 1.0]}, sample_weight=1.0)\n", + "\n", "print(\"MultinomialNB Results:\")\n", "accuracy = analyze(prediction, test, 'Multinomial Naive Bayes')\n", "nb_metrics = (accuracy, end - begin)"