-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_cleanser.py
116 lines (87 loc) · 3.8 KB
/
data_cleanser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import re
from bs4 import BeautifulSoup, Comment
from typing import List
def remove_sections_based_on_comments(content: str, drop_keywords: List[str]) -> str:
"""
Removes sections of the content surrounded by comments containing keywords from drop_keywords.
Also removes standalone comment keywords.
Args:
- content: The content string to be cleansed.
- drop_keywords: List of keywords to drop from the content.
Returns:
- Cleaned content as a string.
"""
for keyword in drop_keywords:
# Remove sections surrounded by the keyword
content = re.sub(r'<!--\s*' + keyword + r'\s*-->(.*?)<!--\s*/' + keyword + r'\s*-->', '', content, flags=re.DOTALL)
# Remove standalone comment keywords
content = re.sub(r'<!--\s*(/)?' + keyword + r'\s*-->', '', content)
return content.strip()
def filter_out_stop_words(content: str, stop_words: List[str]) -> str:
"""
Filters out lines in the content that contain any of the stop words.
Args:
- content: The content string to be filtered.
- stop_words: List of stop words to filter out from the content.
Returns:
- Content string after filtering stop words.
"""
lines = content.split('\n')
filtered_lines = [line for line in lines if not any(word in line.upper() for word in stop_words)]
return '\n'.join(filtered_lines)
def save_to_file(content: str, output_file_path: str) -> None:
"""
Saves the content to the specified file.
Args:
- content: The content string to be saved.
- output_file_path: The path where content will be saved.
Returns:
- None
"""
with open(output_file_path, 'w') as file:
file.write(content)
def remove_html_tags_and_comments(content: str) -> str:
"""
Extracts text content using BeautifulSoup, discarding all tags.
Args:
- content: The content string from which HTML tags and comments need to be removed.
Returns:
- Cleaned content as a string.
"""
soup = BeautifulSoup(content, 'lxml')
# Remove comments
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
return soup.get_text(separator="\n", strip=True)
def create_cleansed_file(input_file: str, PREFIX_CLEAN: str, WP_TERMS: List[str], STOP_WORDS: List[str], name_space: str) -> str:
"""
Cleanses the input file content based on given terms and stop words.
Args:
- input_file: Path to the input file.
- PREFIX_CLEAN: Prefix for the cleansed file.
- WP_TERMS: List of WordPress terms to be removed.
- STOP_WORDS: List of stop words to be filtered out.
- name_space: Namespace used for directory structure.
Returns:
- Path to the cleansed file.
"""
drop_sentences = WP_TERMS + STOP_WORDS
# Read content from the input file
with open(input_file, 'r') as file:
content = file.read()
# Remove sections based on comments
cleaned_content = remove_sections_based_on_comments(content, drop_sentences)
# Filter out stop words
filtered_content = filter_out_stop_words(cleaned_content, STOP_WORDS)
no_html_content = remove_html_tags_and_comments(filtered_content)
# Derive the output file path based on the desired structure
output_file_name = f"{PREFIX_CLEAN}{os.path.basename(input_file).replace('scraped_', '').replace('.xml', '.txt')}"
output_dir = os.path.join("website_content", name_space)
os.makedirs(output_dir, exist_ok=True)
output_file_path = os.path.join(output_dir, output_file_name)
save_to_file(no_html_content, output_file_path)
print(f"Cleaned content saved to {output_file_path}")
return output_file_path
if __name__ == "__main__":
cleaned_file_path = create_cleansed_file(input_file, PREFIX_CLEAN, WP_TERMS, STOP_WORDS)