I solved the basic items of the ‘Chocolate Scraping with Beautiful Soup’ Data Acquisition exercise using list comprehensions — a nifty Python feature that may not be straightforward to decipher at first glance, but is very useful for keeping code short and to the point. I thought of sharing it because some people might find it useful.
I wanted to create the lists (like ‘companies’ and ‘cocoa_percent’) from the BeautifulSoup scraping directly, without using auxiliary lists and without wasting lines on “full-blown” ‘for…in’ structures.
This is my code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Scrape the Codecademy cacao-ratings table, then explore rating vs. cocoa %.
url = 'https://content.codecademy.com/courses/beautifulsoup/cacao/index.html'
webpage = requests.get(url)

# Pass the raw response bytes straight to BeautifulSoup. Wrapping them in
# str() would produce a "b'...'" literal with escaped newlines ("\\n"),
# mangling the markup and forcing awkward header filters downstream.
soup = BeautifulSoup(webpage.content, 'html.parser')

# The first cell in every column is the table header ("Rating", "Company",
# "CocoaPercent"), not data — slice it off with [1:] consistently rather
# than filtering on header text in one place and pop(0) in another.
ratings = [float(tag.get_text()) for tag in soup.find_all(attrs={"class": "Rating"})[1:]]
plt.hist(ratings)
#plt.show()

companies = [tag.get_text() for tag in soup.find_all(attrs={"class": "Company"})[1:]]

# Strip surrounding whitespace and the trailing '%', then convert the
# percentage to a 0-1 fraction.
cocoa_percent = [
    float(tag.get_text().strip().rstrip("%")) / 100
    for tag in soup.find_all(attrs={"class": "CocoaPercent"})[1:]
]

df = pd.DataFrame({'Company': companies, 'Rating': ratings, 'CocoaPercentage': cocoa_percent})

# Mean rating per company, and the ten companies with the highest mean.
mean_ratings = df.groupby('Company').Rating.mean()
ten_best = mean_ratings.nlargest(10)
#print(ten_best)

# Scatter of rating vs. cocoa percentage with a degree-1 least-squares
# trend line overlaid as a red dashed line.
plt.scatter(df.CocoaPercentage, df.Rating)
z = np.polyfit(df.CocoaPercentage, df.Rating, 1)
line_function = np.poly1d(z)
plt.plot(df.CocoaPercentage, line_function(df.CocoaPercentage), "r--")
#plt.show()