How To Scrape Specific Text From Specific Table Elements
I am trying to scrape specific text from specific table elements on an Amazon product page. URL_1 has all elements - https://www.amazon.com/dp/B008Q5LXIE/ URL_2 has only 'Sales Ra
Solution 1:
You could use stripped_strings and :contains with bs4 4.7.1. This feels like a lot of jiggery-pokery to get the desired output format; I'm sure someone with more Python experience could shorten it and improve its efficiency. The dict-merging syntax is taken from @aaronhall.
import requests
from bs4 import BeautifulSoup as bs
import re
# Scrape a handful of detail fields plus the best-seller rankings from each
# product page in `links` and print one flat dict per page.

# Product pages to fetch; one dict is printed for each entry.
links = [
    'https://www.amazon.com/Professional-Dental-Guard-Remoldable-Customizable/dp/B07L4YHBQ4',
    'https://www.amazon.com/dp/B0040ODFK4/?tag=stackoverfl08-20',
]

# Labels looked up inside <li> elements of the page. Defined once here
# because they do not change between pages.
fields = [
    'Product Dimensions',
    'Shipping Weight',
    'Item model number',
    'Amazon Best Sellers Rank',
]

# Page label -> output key. The best-seller entry maps to two keys because
# its text is split into a rank number and a category name.
map_dict = {
    'Product Dimensions': 'dimensions',
    'Shipping Weight': 'weight',
    'Item model number': 'Item_No',
    'Amazon Best Sellers Rank': ['R1_NO', 'R1_CAT'],
}

# Strips '#' and '(' from the rank text. Written as a raw string so that
# `\(` reaches the regex engine instead of being an invalid string escape.
rank_cleanup = re.compile(r'#|\(')

for link in links:
    # The separator must be a forward slash: '\5' inside a normal string
    # literal is an octal escape and would put a control character in the
    # header value.
    r = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(r.content, 'lxml')
    temp_dict = {}

    for field in fields:
        # NOTE(review): newer Soup Sieve releases deprecate `:contains` in
        # favour of `:-soup-contains`; kept as-is for older installs.
        element = soup.select_one('li:contains("' + field + '")')
        if element is None:
            temp_dict[field] = 'N/A'
            continue

        # The first string is expected to be the label and the second the
        # value; fall back to 'N/A' when there is no second string rather
        # than raising IndexError.
        strings = list(element.stripped_strings)
        if len(strings) < 2:
            temp_dict[field] = 'N/A'
            continue

        if field == 'Amazon Best Sellers Rank':
            # e.g. '#12 in Category (' -> ['12', 'Category']
            temp_dict[field] = rank_cleanup.sub('', strings[1]).strip().split(' in ')
        else:
            temp_dict[field] = strings[1].replace('(', '').strip()

    # Secondary rankings: rank numbers and their category ladders.
    ranks = soup.select('.zg_hrsr_rank')
    ladders = soup.select('.zg_hrsr_ladder')

    if ranks:
        cat_nos = [item.text.split('#')[1] for item in ranks]
    else:
        cat_nos = ['N/A']

    if ladders:
        cats = [item.text.split('\xa0')[1] for item in ladders]
    else:
        cats = ['N/A']

    # Keyed by rank number, so identical rank numbers collapse to one entry.
    rankings = dict(zip(cat_nos, cats))

    final_dict = {}
    for k, v in temp_dict.items():
        if k == 'Amazon Best Sellers Rank':
            # A missing rank fills both R1_NO and R1_CAT with 'N/A'.
            values = [v, v] if v == 'N/A' else v
            final_dict.update(zip(map_dict[k], values))
        else:
            final_dict[map_dict[k]] = v

    # Secondary rankings are numbered from R2 because R1 is the main rank.
    for position, (rank_no, rank_cat) in enumerate(rankings.items(), start=2):
        prefix = 'R' + str(position) + '_'
        final_dict[prefix + 'NO'] = rank_no
        final_dict[prefix + 'CAT'] = rank_cat

    print(final_dict)
Post a Comment for "How To Scrape Specific Text From Specific Table Elements"