-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping_data.py
118 lines (94 loc) · 4.46 KB
/
scraping_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import json
import time
def scrape_yelu(location_url, category, page):
try:
chrome_options = Options()
driver_path = 'C:\\Users\\admin\\Desktop\\selenium\\chromedriver.exe'
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)
base_url = 'https://www.yelu.in'
url = f'{base_url}/category/{category}/{page}/city:{location_url}'
driver.get(url)
# Let the page load
time.sleep(5)
wait = WebDriverWait(driver, 10)
companies = wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'g_0')))
all_companies = []
for company in companies:
company_data = {}
try:
company_data['Company_name'] = company.find_element(By.TAG_NAME, 'h4').text.strip()
except NoSuchElementException as e:
print(f"Error locating company name element: {e}.")
company_data['Company_name'] = None
try:
company_data['Company_address'] = company.find_element(By.CLASS_NAME, 'address').text.strip()
except NoSuchElementException as e:
print(f"Error locating company address element: {e}.")
company_data['Company_address'] = None
try:
company_data['Company_description'] = company.find_element(By.CLASS_NAME, 'details').text.strip()
except NoSuchElementException as e:
print(f"Error locating company description element: {e}.")
company_data['Company_description'] = None
try:
company_data['Company_rating'] = company.find_element(By.CLASS_NAME, 'rate').text.strip()
except NoSuchElementException as e:
print(f"Error locating company rating element: {e}.")
company_data['Company_rating'] = None
try:
company_data['Company_verified_tag'] = company.find_element(By.CLASS_NAME, 'v').text.strip()
except NoSuchElementException as e:
print(f"Error locating company verified tag element: {e}.")
company_data['Company_verified_tag'] = None
try:
company_data['Company_icon'] = base_url + company.find_element(By.TAG_NAME, 'img').get_attribute('data-src')
except NoSuchElementException as e:
print(f"Error locating company icon element: {e}.")
company_data['Company_icon'] = None
try:
company_data['Link_to_individual_company_page'] = company.find_element(By.TAG_NAME, 'a').get_attribute('href')
except NoSuchElementException as e:
print(f"Error locating company link element: {e}.")
company_data['Link_to_individual_company_page'] = None
all_companies.append(company_data)
driver.quit()
return all_companies
except Exception as e:
print(f"Error occurred: {e}.")
return []
def save_to_json(data, filename):
try:
with open(filename, 'r') as json_file:
existing_data = json.load(json_file)
except FileNotFoundError:
existing_data = {}
for category, companies in data.items():
if category not in existing_data:
existing_data[category] = []
existing_data[category].extend(companies)
with open(filename, 'w') as json_file:
json.dump(existing_data, json_file, indent=4)
def main():
location_url = 'mumbai'
categories = ['restaurants','advertising']
all_companies = {}
for category in categories:
all_companies[category] = []
page = 1
while True:
companies = scrape_yelu(location_url, category, page)
if not companies:
break
all_companies[category].extend(companies)
page += 1
save_to_json(all_companies, 'yelu_data_with_page.json')
if __name__ == "__main__":
main()