-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataTransformation.py
37 lines (30 loc) · 1.27 KB
/
dataTransformation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
# Clean-up pipeline for the movie list CSV. In order, it:
#   1. drops rows whose ``id`` repeats an earlier row,
#   2. drops rows with missing metadata and reduces ``genre_ids`` to one integer,
#   3. keeps only titles starting with 'S' or 'H'.
# Every stage overwrites the same output file, so a failure in a later stage
# still leaves the previous stage's result on disk.

## filePath:
SOURCE_CSV = './moviesList.csv'
OUTPUT_CSV = './filteredMoviesList.csv'

# Genre id stored when a movie's genre list is empty or missing.
FALLBACK_GENRE_ID = 10770

# Columns that must be populated for a row to survive the genre step.
REQUIRED_COLUMNS = ['adult', 'backdrop_path', 'video', 'release_date']

# Titles are kept only when they start with one of these prefixes.
TITLE_PREFIXES = ('S', 'H')


def drop_duplicate_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame that keeps only the first row for each ``id``."""
    return df.drop_duplicates(subset=['id'], keep='first')


def normalise_genre_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Drop incomplete rows and collapse ``genre_ids`` to its first id.

    Rows with a missing value in any of ``REQUIRED_COLUMNS`` are removed.
    ``genre_ids`` is expected to hold text such as ``"[16, 35]"``; the
    brackets are removed, the first comma-separated entry is kept and the
    column is cast to ``int``. Empty or missing lists become
    ``FALLBACK_GENRE_ID``.

    The input frame is not modified; a new frame is returned.

    Raises:
        KeyError: if a required column or ``genre_ids`` is absent.
        ValueError: if the first entry of a genre list is not an integer.
    """
    # Work on an explicit copy so the column assignment below never writes
    # into a view of the caller's frame (avoids SettingWithCopyWarning).
    cleaned = df.dropna(subset=REQUIRED_COLUMNS).copy()

    first_genre = (
        cleaned['genre_ids']
        # A missing list is handled like an empty one instead of turning into
        # the text 'nan', which cannot be cast to int.
        .fillna('')
        .astype(str)
        # '[' and ']' are regex metacharacters: request literal replacement
        # explicitly rather than relying on the pandas version's default.
        .str.replace('[', '', regex=False)
        .str.replace(']', '', regex=False)
        .str.split(',')
        .str[0]
        .str.strip()
        .replace('', str(FALLBACK_GENRE_ID))
    )
    cleaned['genre_ids'] = first_genre.astype(int)
    return cleaned


def filter_titles(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows whose ``title`` starts with one of ``TITLE_PREFIXES``.

    Rows with a missing ``title``, ``adult`` or ``backdrop_path`` are dropped.
    """
    # na=False makes a missing title count as "no match"; without it the mask
    # contains NaN and boolean indexing raises ValueError.
    wanted = df['title'].str.startswith(TITLE_PREFIXES, na=False)
    # The original script re-applied these two null checks at this stage;
    # they are kept so the helper gives the same result when used on its own.
    return df[wanted].dropna(subset=['adult', 'backdrop_path'])


def main(source_path: str = SOURCE_CSV, output_path: str = OUTPUT_CSV) -> pd.DataFrame:
    """Run the three clean-up stages and write the result to ``output_path``.

    Args:
        source_path: CSV file to read the movie list from.
        output_path: CSV file that receives the output of each stage.

    Returns:
        The final, title-filtered frame (the same data written to disk).
    """
    df = pd.read_csv(source_path)

    ### movie id filtering:
    if df.duplicated(subset=['id']).any():
        print("Duplicate IDs found in the CSV data. Removing duplicates...")
        df = drop_duplicate_ids(df)
        df.to_csv(output_path, index=False)
        print("Duplicates removed")
    else:
        print("No duplicate IDs found in the CSV data.")

    ### genre id filtering:
    print("Updating genre_ids....")
    df = normalise_genre_ids(df)
    df.to_csv(output_path, index=False)
    print('Updated genre_ids!')

    ### Title filtering with 'S' and 'H':
    print("Filtering the titles....")
    filtered_df = filter_titles(df)
    filtered_df.to_csv(output_path, index=False)
    print("Filtered the titles!")
    return filtered_df


if __name__ == '__main__':
    main()