# This is a multiple linear regression model; I would really appreciate your feedback.
import pandas as pd
# Load each Yelp dataset. lines=True tells pandas that every file holds one
# JSON record per line rather than a single JSON document.
business = pd.read_json('yelp_business.json', lines=True)
reviews = pd.read_json('yelp_review.json', lines=True)
users = pd.read_json('yelp_user.json', lines=True)
checkins = pd.read_json('yelp_checkin.json', lines=True)
tips = pd.read_json('yelp_tip.json', lines=True)
photos = pd.read_json('yelp_photo.json', lines=True)

# Raise pandas' display limits so wide frames and long text cells are not
# truncated when printed.
number_of_columns_to_display = 60
number_of_characters_to_display = 500
pd.options.display.max_columns = number_of_columns_to_display
pd.options.display.max_colwidth = number_of_characters_to_display

print(business.head(5))
# How many businesses are in the dataset?
number_of_businesses = len(business)
print("the number of business is", number_of_businesses)

# Features in the review DataFrame.
print(" the reviews features are", reviews.columns)

# What is the range of values for the features in the user DataFrame?
for column in users.columns:
    print(f"Feature: {column}")
    print(f"Range: {users[column].min()} to {users[column].max()}")
    print()

# Look up the Yelp rating of a single business; `business` is expected to have
# the columns 'business_id' and 'stars'.
business_id = '5EvUIR4IzCWUOm0PsUZXjA'
yelp_rating = business[business['business_id'] == business_id]['stars']

# Print the Yelp rating (a Series holding the matching row(s)).
print(yelp_rating)
# Build one wide table keyed on 'business_id'. Every merge is a left join, so
# each business row is kept even when the other frame has no match for it.
# The length is printed after every step to confirm the join did not change
# the number of rows.

# Merge business and reviews DataFrames
df = pd.merge(business, reviews, how='left', on='business_id')
print(len(df))  # Ensure row count remains the same

# Merge the users DataFrame
df = pd.merge(df, users, how='left', on='business_id')
print(len(df))  # Check the row count

# Merge the checkins DataFrame
df = pd.merge(df, checkins, how='left', on='business_id')
print(len(df))  # Check the row count

# Merge the tips DataFrame; a clashing column from `tips` gets the '_tip' suffix
df = pd.merge(df, tips, how='left', on='business_id', suffixes=('', '_tip'))
print(len(df))  # Check the row count

# Merge the photos DataFrame; a clashing column from `photos` gets '_photo'
df = pd.merge(df, photos, how='left', on='business_id', suffixes=('', '_photo'))
print(len(df))  # Check the row count
# List of features to remove
features_to_remove = [
    'address', 'attributes', 'business_id', 'categories', 'city', 'hours',
    'is_open', 'latitude', 'longitude', 'name', 'neighborhood', 'postal_code',
    'state', 'time',
]

# Removing the features
df.drop(features_to_remove, axis=1, inplace=True)
print(df.columns)

# Check which columns contain missing values. The result is printed because
# a bare expression shows nothing when the file runs as a script.
print(df.isna().any())

# Substitution of NaN values: these columns are filled with 0.
df.fillna(
    {
        'weekday_checkins': 0,
        'weekend_checkins': 0,
        'average_tip_length': 0,
        'number_tips': 0,
        'number_pics': 0,
        'average_caption_length': 0,
    },
    inplace=True,
)

# Confirm that no missing values remain
print(df.isna().any())
# Correlation analysis: pairwise correlation between the columns of df.
# numeric_only=True keeps the call from failing if a non-numeric column is
# still present, and the result is printed so it is visible in a script run.
print(df.corr(numeric_only=True))
import matplotlib.pyplot as plt
# Three features that correlate most with Yelp Rating.
# Each one is drawn as its own scatter plot against 'stars'; the loop replaces
# three copies of the same plotting code that differed only in column/label.
for column, label in (
    ('average_review_sentiment', 'Average Review Sentiment'),
    ('average_review_length', 'Average Review Length'),
    ('average_review_age', 'Average Review Age'),
):
    plt.figure(figsize=(18, 6))
    plt.scatter(df[column], df['stars'], alpha=0.5)
    plt.title(f'{label} vs Yelp Rating (Stars)')
    plt.xlabel(label)
    plt.ylabel('Yelp Rating (Stars)')
    plt.show()
from sklearn.model_selection import train_test_split
# Splitting the data into features (X) and target variable (y)
X = df[['average_review_length', 'average_review_age']]
y = df['stars']  # Target variable

# Splitting the data into training and testing sets: 20% is held out for
# testing, and random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Ensure that df only contains the correct columns: the target 'stars' plus
# the numeric predictors listed below.
numeric_columns = [
    'average_review_length', 'average_review_age', 'average_review_sentiment',
    'review_count', 'number_funny_votes', 'number_cool_votes', 'number_useful_votes',
    'average_number_friends', 'average_days_on_yelp', 'average_number_fans',
    'average_review_count', 'average_number_years_elite', 'weekday_checkins',
    'weekend_checkins', 'average_tip_length', 'number_tips', 'average_caption_length',
    'number_pics',
]

# Select the relevant columns (every other column is dropped from df here)
df = df[['stars'] + numeric_columns]

# Split data into features and target
ratings = df['stars']
features = df[numeric_columns]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, ratings, test_size=0.2, random_state=1)

# Create and train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the Yelp ratings for the testing data
y_predicted = model.predict(X_test)

# Plot y_test vs y_predicted
plt.scatter(y_test, y_predicted)
plt.xlabel('Actual Yelp Rating')
plt.ylabel('Predicted Yelp Rating')
plt.title('Actual vs Predicted Yelp Ratings')
plt.plot([1, 5], [1, 5], color='red')  # Line of perfect prediction
plt.ylim(1, 5)
plt.xlim(1, 5)
plt.show()

# Calculate and print the scores
print('Train Score:', model.score(X_train, y_train))
print('Test Score:', model.score(X_test, y_test))

# Print the model features and their corresponding coefficients, largest
# magnitude first. NOTE: the features are not scaled at this point, so the
# magnitudes depend on each feature's units and are not directly comparable.
print(sorted(zip(numeric_columns, model.coef_), key=lambda x: abs(x[1]), reverse=True))
# Report the fit quality. The scores are printed because a bare expression
# shows nothing when the file runs as a script.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

# Pair every coefficient with the column it was actually fitted on. Zipping
# against a hard-coded two-name list would label the first two coefficients
# with possibly wrong names and silently drop the rest.
print(sorted(zip(X_train.columns, model.coef_), key=lambda x: abs(x[1]), reverse=True))

y_predicted = model.predict(X_test)

# Plot y_test vs y_predicted
plt.scatter(y_test, y_predicted, alpha=0.5)
plt.xlabel("Actual Yelp Ratings")
plt.ylabel("Predicted Yelp Ratings")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')  # Line y=x
plt.title("Actual vs Predicted Yelp Ratings")
plt.show()
# Subset of only average review sentiment
sentiment = ['average_review_sentiment']

# Subset of all features that have a response range [0,1]
binary_features = [
    'alcohol?', 'has_bike_parking', 'takes_credit_cards', 'good_for_kids',
    'take_reservations', 'has_wifi',
]

# `numeric_features` was used below without being defined at this point in the
# file (NameError). Derive it from `numeric_columns`, leaving out the
# sentiment column, which has its own subset above.
numeric_features = [column for column in numeric_columns if column not in sentiment]

# All features. df was narrowed to 'stars' + numeric_columns earlier, so only
# columns that are still present in df are kept; otherwise selecting
# `all_features` from df would raise a KeyError.
all_features = [feature for feature in binary_features + numeric_features if feature in df.columns]

# Add your own feature subset here
fans = ['average_number_fans']
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Assume `df` is a DataFrame that has already been defined with the necessary columns.
# Define the function to model selected features.
def model_these_features(feature_list):
    """Fit a linear regression of ``stars`` on the given features and report it.

    Reads the module-level ``df``, holds out 20% of the rows for testing
    (``random_state=1``), fits a ``LinearRegression``, prints the train and
    test scores and the coefficients sorted by absolute value, and shows a
    scatter plot of actual versus predicted ratings.

    Args:
        feature_list: A column name, or an iterable of column names, in
            ``df`` to use as predictors.

    Returns:
        None. The results are printed and plotted.
    """
    # Normalise to a list. A bare string would otherwise select a 1-D Series
    # and be zipped character by character against the coefficients below.
    if isinstance(feature_list, str):
        feature_list = [feature_list]
    else:
        feature_list = list(feature_list)

    # Extract the target variable (Yelp ratings)
    ratings = df.loc[:, 'stars']

    # Extract the features; selecting with a list always yields a 2-D frame,
    # so no reshape is needed even for a single feature.
    features = df.loc[:, feature_list]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, ratings, test_size=0.2, random_state=1
    )

    # Create the model and fit it to the training data
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Print the R^2 score (coefficient of determination) for both splits
    print('Train Score:', model.score(X_train, y_train))
    print('Test Score:', model.score(X_test, y_test))

    # Print the features with their coefficients, largest magnitude first
    print(sorted(zip(feature_list, model.coef_), key=lambda x: abs(x[1]), reverse=True))

    # Predict the Yelp ratings for the testing data
    y_predicted = model.predict(X_test)

    # Plot the actual vs predicted Yelp ratings
    plt.scatter(y_test, y_predicted)
    plt.xlabel('Actual Yelp Rating')
    plt.ylabel('Predicted Yelp Rating')
    plt.ylim(1, 5)
    plt.title('Actual vs Predicted Yelp Ratings')
    plt.plot([1, 5], [1, 5], color='red')  # Line of perfect prediction
    plt.show()
# Example usage: `df` is expected to be defined already and to contain the
# numeric features below plus 'stars'.
numeric_columns = [
    'average_review_length', 'average_review_age', 'average_review_sentiment',
    'review_count', 'number_funny_votes', 'number_cool_votes', 'number_useful_votes',
    'average_number_friends', 'average_days_on_yelp', 'average_number_fans',
    'average_review_count', 'average_number_years_elite', 'weekday_checkins',
    'weekend_checkins', 'average_tip_length', 'number_tips', 'average_caption_length',
    'number_pics',
]

# Ensure df only contains the relevant columns
df = df[['stars'] + numeric_columns]

# Model a specific set of features
model_these_features(['average_review_sentiment', 'average_review_length', 'average_review_age', 'number_funny_votes'])
# Hypothetical business profile as a single-row (1 x 18) feature matrix.
# The row is written directly as a nested list, so no reshape is required.
danielles_delicious_delicacies = np.array([[
    596.463567,   # average_review_length (mean)
    1175.501021,  # average_review_age (mean)
    0.8,          # average_review_sentiment (assumed positive)
    100,          # review_count (assumed relatively high)
    50,           # number_funny_votes (assumed above average)
    50,           # number_cool_votes (assumed above average)
    100,          # number_useful_votes (assumed above average)
    200,          # average_number_friends (assumed above average)
    2000,         # average_days_on_yelp (mean)
    20,           # average_number_fans (assumed above average)
    200,          # average_review_count (assumed above average)
    1,            # average_number_years_elite (mean)
    100,          # weekday_checkins (assumed above average)
    100,          # weekend_checkins (assumed above average)
    45.643426,    # average_tip_length (mean)
    10,           # number_tips (assumed above average)
    2.831829,     # average_caption_length (mean)
    5,            # number_pics (assumed above average)
]])
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# `df` is expected to be defined already and to contain the necessary data.
print(df.columns)

# Adjust feature lists if necessary
sentiment = ['average_review_sentiment']
numeric_features = [
    'average_review_length', 'average_review_age', 'review_count', 'number_funny_votes',
    'number_cool_votes', 'number_useful_votes', 'average_number_friends', 'average_days_on_yelp',
    'average_number_fans', 'average_review_count', 'average_number_years_elite', 'weekday_checkins',
    'weekend_checkins', 'average_tip_length', 'number_tips', 'average_caption_length', 'number_pics',
]
all_features = sentiment + numeric_features
# Keep only the features that actually exist in df. NOTE: this puts
# 'average_review_sentiment' first, so any hand-built input row must follow
# this order, not the order of `numeric_columns`.
all_features = [feature for feature in all_features if feature in df.columns]

# Ensure the features DataFrame matches the all_features list
features = df.loc[:, all_features]
ratings = df.loc[:, 'stars']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, ratings, test_size=0.2, random_state=1)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train_scaled, y_train)
# Make the prediction for Danielle's Delicious Delicacies.
# The values are keyed by feature name and then laid out in the order of
# `all_features`, which is the column order the scaler and model were fitted
# on. A positional list in a different order (for example with the sentiment
# value third instead of first) would feed each value to the wrong feature.
danielles_profile = {
    'average_review_length': 596.463567,
    'average_review_age': 1175.501021,
    'average_review_sentiment': 0.8,
    'review_count': 100,
    'number_funny_votes': 50,
    'number_cool_votes': 50,
    'number_useful_votes': 100,
    'average_number_friends': 200,
    'average_days_on_yelp': 2000,
    'average_number_fans': 20,
    'average_review_count': 200,
    'average_number_years_elite': 1,
    'weekday_checkins': 100,
    'weekend_checkins': 100,
    'average_tip_length': 45.643426,
    'number_tips': 10,
    'average_caption_length': 2.831829,
    'number_pics': 5,
}
# A feature in `all_features` with no value here raises KeyError instead of
# silently shifting the remaining values.
danielles_delicious_delicacies = np.array(
    [[danielles_profile[feature] for feature in all_features]], dtype=float
)

# Scale the input features for prediction with the scaler fitted on the training data
danielles_delicious_delicacies_scaled = scaler.transform(danielles_delicious_delicacies)

# Predict the rating
predicted_rating = model.predict(danielles_delicious_delicacies_scaled)
print("Predicted Yelp Rating for Danielle's Delicious Delicacies:", predicted_rating[0])
# Evaluate model performance on the scaled training and test splits
print('Train Score:', model.score(X_train_scaled, y_train))
print('Test Score:', model.score(X_test_scaled, y_test))

# Plot actual vs predicted Yelp ratings
y_predicted = model.predict(X_test_scaled)
plt.scatter(y_test, y_predicted)
plt.xlabel('Actual Yelp Rating')
plt.ylabel('Predicted Yelp Rating')
plt.ylim(1, 5)
plt.title('Actual vs Predicted Yelp Ratings')
plt.plot([1, 5], [1, 5], color='red')  # Line of perfect prediction
plt.show()