Multiple linear regression analysis

This is a multiple linear regression model. I would really appreciate your feedback.

import pandas as pd

# Load the six Yelp tables. Every file is read as line-delimited JSON
# (lines=True). File names must be wrapped in plain ASCII quotes: the
# typographic quotes in the pasted version are a SyntaxError.
business = pd.read_json('yelp_business.json', lines=True)
reviews = pd.read_json('yelp_review.json', lines=True)
users = pd.read_json('yelp_user.json', lines=True)
checkins = pd.read_json('yelp_checkin.json', lines=True)
tips = pd.read_json('yelp_tip.json', lines=True)
photos = pd.read_json('yelp_photo.json', lines=True)

# Raise pandas' display limits so wide frames are not truncated when printed.
number_of_columns_to_display = 60
number_of_characters_to_display = 500

pd.options.display.max_columns = number_of_columns_to_display
pd.options.display.max_colwidth = number_of_characters_to_display

print(business.head(5))

# How many businesses are in the dataset?
number_of_businesses = len(business)
print("the number of business is", number_of_businesses)

# Features in the review DataFrame
print(" the reviews features are", reviews.columns)

# Range of values for each feature in the user DataFrame.
# The loop body must be indented; the pasted version had lost its indentation.
for column in users.columns:
    print(f"Feature: {column}")
    print(f"Range: {users[column].min()} to {users[column].max()}")
    print()

# Look up the Yelp rating of a single business.
# Assumes `business` has the columns 'business_id' and 'stars'.
business_id = '5EvUIR4IzCWUOm0PsUZXjA'

# Boolean-mask the rows for that id, then keep only the 'stars' column.
# The result is a Series (empty when the id is not present).
yelp_rating = business[business['business_id'] == business_id]['stars']

# Print the Yelp rating
print(yelp_rating)

# Build one wide table by left-joining every other table onto `business`
# using 'business_id'. A left merge keeps the row count of the left frame
# only when the right frame has at most one row per key, which is why the
# length is printed after each step.

# Merge business and reviews DataFrames
df = pd.merge(business, reviews, how='left', on='business_id')
print(len(df))  # Ensure row count remains the same

# Merge the users DataFrame
df = pd.merge(df, users, how='left', on='business_id')
print(len(df))  # Check the row count

# Merge the checkins DataFrame
df = pd.merge(df, checkins, how='left', on='business_id')
print(len(df))  # Check the row count

# Merge the tips DataFrame; overlapping column names from tips get '_tip'
df = pd.merge(df, tips, how='left', on='business_id', suffixes=('', '_tip'))
print(len(df))  # Check the row count

# Merge the photos DataFrame; overlapping column names from photos get '_photo'
df = pd.merge(df, photos, how='left', on='business_id', suffixes=('', '_photo'))
print(len(df))  # Check the row count

# List of features to remove
features_to_remove = [
    'address', 'attributes', 'business_id', 'categories', 'city', 'hours',
    'is_open', 'latitude', 'longitude', 'name', 'neighborhood',
    'postal_code', 'state', 'time',
]

# Removing the features (in place, column-wise)
df.drop(features_to_remove, axis=1, inplace=True)
print(df.columns)

# Check which columns still contain missing values. This only reports
# them; nothing is removed here.
print(df.isna().any())

# Substitution of NaN values: replace missing values in these columns with 0
df.fillna(
    {
        'weekday_checkins': 0,
        'weekend_checkins': 0,
        'average_tip_length': 0,
        'number_tips': 0,
        'number_pics': 0,
        'average_caption_length': 0,
    },
    inplace=True,
)

# Confirm that no column reports missing values any more
print(df.isna().any())

import matplotlib.pyplot as plt

# Correlation analysis: pairwise correlation between all columns of df
print(df.corr())

# Three features that correlate most with Yelp Rating (per the author's
# reading of the correlation table), each plotted against 'stars'.
# One loop replaces three copy-pasted plotting stanzas; titles and axis
# labels are unchanged.
features_to_plot = [
    ('average_review_sentiment', 'Average Review Sentiment'),
    ('average_review_length', 'Average Review Length'),
    ('average_review_age', 'Average Review Age'),
]

for column, label in features_to_plot:
    plt.figure(figsize=(18, 6))
    plt.scatter(df[column], df['stars'], alpha=0.5)
    plt.title(f'{label} vs Yelp Rating (Stars)')
    plt.xlabel(label)
    plt.ylabel('Yelp Rating (Stars)')
    plt.show()

from sklearn.model_selection import train_test_split

# Splitting the data into features (X) and target variable (y)
X = df[['average_review_length', 'average_review_age']]
y = df['stars']  # Target variable

# Splitting the data into training and testing sets: 20% is held out for
# testing, and random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Ensure that df only contains the correct columns.
# Assumes `df` is already defined and holds these numeric features plus 'stars'.
numeric_columns = [
    'average_review_length', 'average_review_age', 'average_review_sentiment',
    'review_count', 'number_funny_votes', 'number_cool_votes', 'number_useful_votes',
    'average_number_friends', 'average_days_on_yelp', 'average_number_fans',
    'average_review_count', 'average_number_years_elite', 'weekday_checkins',
    'weekend_checkins', 'average_tip_length', 'number_tips', 'average_caption_length',
    'number_pics',
]

# Select the relevant columns.
# NOTE: this rebinds df, so any column not listed above is no longer
# available from df after this line.
df = df[['stars'] + numeric_columns]

# Split data into features and target
ratings = df['stars']
features = df[numeric_columns]

# Split the data into training and testing sets (20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, ratings, test_size=0.2, random_state=1)

# Create and train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the Yelp ratings for the testing data
y_predicted = model.predict(X_test)

# Plot y_test vs y_predicted
plt.scatter(y_test, y_predicted)
plt.xlabel('Actual Yelp Rating')
plt.ylabel('Predicted Yelp Rating')
plt.title('Actual vs Predicted Yelp Ratings')
plt.plot([1, 5], [1, 5], color='red')  # Line of perfect prediction
plt.ylim(1, 5)
plt.xlim(1, 5)
plt.show()

# Calculate and print the scores
print('Train Score:', model.score(X_train, y_train))
print('Test Score:', model.score(X_test, y_test))

# Print the model features and their corresponding coefficients,
# largest absolute coefficient first
print(sorted(zip(numeric_columns, model.coef_), key=lambda x: abs(x[1]), reverse=True))

# Scores of the currently fitted model on the training and testing sets
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

# Pair every coefficient with the column it was fitted on. The previous
# version zipped a hard-coded two-name list against model.coef_, which
# silently truncates when the model was trained on more than two features.
print(sorted(zip(X_train.columns, model.coef_), key=lambda x: abs(x[1]), reverse=True))

y_predicted = model.predict(X_test)

# Plot y_test vs y_predicted
plt.scatter(y_test, y_predicted, alpha=0.5)
plt.xlabel("Actual Yelp Ratings")
plt.ylabel("Predicted Yelp Ratings")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')  # Line y=x
plt.title("Actual vs Predicted Yelp Ratings")
plt.show()

# subset of only average review sentiment
sentiment = ['average_review_sentiment']

# subset of all features that have a response range [0,1]
# NOTE(review): earlier in this script df is narrowed to 'stars' plus
# numeric_columns, so these columns must still be present in df before
# they can be modelled. Confirm before using this subset.
binary_features = [
    'alcohol?', 'has_bike_parking', 'takes_credit_cards',
    'good_for_kids', 'take_reservations', 'has_wifi',
]

# all features
# `numeric_features` is not defined until later in the script, so using it
# here raised NameError; use the `numeric_columns` list defined above.
all_features = binary_features + numeric_columns

# add your own feature subset here
fans = ['average_number_fans']

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Assume df is a DataFrame that has already been defined with the necessary columns

Define the function to model selected features

def model_these_features(feature_list):
    """Fit and report a linear regression of 'stars' on selected columns of df.

    Reads the module-level DataFrame ``df``, splits it 80/20 with a fixed
    seed, fits a ``LinearRegression``, prints the train and test scores and
    the coefficients (largest absolute value first), and shows a scatter
    plot of actual versus predicted ratings.

    Args:
        feature_list: Column name, or list of column names, in ``df`` to use
            as predictors.

    Returns:
        None. All results are printed or plotted.
    """
    # A bare string would be zipped character by character against the
    # coefficients below, so normalise it to a one-element list. Selecting
    # with a list always yields a 2-D frame, so no reshape is needed.
    if isinstance(feature_list, str):
        feature_list = [feature_list]

    # Extract the target variable (Yelp ratings)
    ratings = df.loc[:, 'stars']

    # Extract the features based on the provided feature list
    features = df.loc[:, feature_list]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, ratings, test_size=0.2, random_state=1
    )

    # Create a Linear Regression model and fit it to the training data
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Print the R^2 score (coefficient of determination) for both splits
    print('Train Score:', model.score(X_train, y_train))
    print('Test Score:', model.score(X_test, y_test))

    # Print the features with their coefficients, sorted by absolute
    # coefficient size
    print(sorted(zip(feature_list, model.coef_), key=lambda x: abs(x[1]), reverse=True))

    # Predict the Yelp ratings for the testing data
    y_predicted = model.predict(X_test)

    # Plot the actual vs predicted Yelp ratings
    plt.scatter(y_test, y_predicted)
    plt.xlabel('Actual Yelp Rating')
    plt.ylabel('Predicted Yelp Rating')
    plt.ylim(1, 5)
    plt.xlim(1, 5)  # same range as the y-axis so the reference line is the diagonal
    plt.title('Actual vs Predicted Yelp Ratings')
    plt.plot([1, 5], [1, 5], color='red')  # Line of perfect prediction
    plt.show()

# Example usage:
# Assumes df has already been defined and contains numeric features plus 'stars'.
numeric_columns = [
    'average_review_length', 'average_review_age', 'average_review_sentiment',
    'review_count', 'number_funny_votes', 'number_cool_votes', 'number_useful_votes',
    'average_number_friends', 'average_days_on_yelp', 'average_number_fans',
    'average_review_count', 'average_number_years_elite', 'weekday_checkins',
    'weekend_checkins', 'average_tip_length', 'number_tips', 'average_caption_length',
    'number_pics',
]

# Ensure df only contains the relevant columns
df = df[['stars'] + numeric_columns]

# Model a specific set of features
model_these_features(
    ['average_review_sentiment', 'average_review_length', 'average_review_age', 'number_funny_votes']
)

# Hypothetical feature values for a new business, kept as (label, value)
# pairs so each number stays next to the feature it stands for. The labels
# are documentation only; the array is built from the values in this order.
_danielle_profile = (
    ('average_review_length', 596.463567),   # mean
    ('average_review_age', 1175.501021),     # mean
    ('average_review_sentiment', 0.8),       # assumed positive sentiment
    ('review_count', 100),                   # assumed relatively high
    ('number_funny_votes', 50),              # assumed higher than average
    ('number_cool_votes', 50),               # assumed higher than average
    ('number_useful_votes', 100),            # assumed higher than average
    ('average_number_friends', 200),         # assumed higher than average
    ('average_days_on_yelp', 2000),          # mean
    ('average_number_fans', 20),             # assumed higher than average
    ('average_review_count', 200),           # assumed higher than average
    ('average_number_years_elite', 1),       # mean
    ('weekday_checkins', 100),               # assumed higher than average
    ('weekend_checkins', 100),               # assumed higher than average
    ('average_tip_length', 45.643426),       # mean
    ('number_tips', 10),                     # assumed higher than average
    ('average_caption_length', 2.831829),    # mean
    ('number_pics', 5),                      # assumed higher than average
)

# Single-row 2-D array: one sample with one column per value above.
danielles_delicious_delicacies = np.array(
    [value for _label, value in _danielle_profile]
)[np.newaxis, :]

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Assumes df is already defined and contains the necessary data
print(df.columns)

# Adjust feature lists if necessary
sentiment = ['average_review_sentiment']
numeric_features = [
    'average_review_length', 'average_review_age', 'review_count', 'number_funny_votes',
    'number_cool_votes', 'number_useful_votes', 'average_number_friends', 'average_days_on_yelp',
    'average_number_fans', 'average_review_count', 'average_number_years_elite', 'weekday_checkins',
    'weekend_checkins', 'average_tip_length', 'number_tips', 'average_caption_length', 'number_pics',
]
# Column order of the model: sentiment first, then the numeric features.
all_features = sentiment + numeric_features
# Keep only the names that are actually present in df
all_features = [feature for feature in all_features if feature in df.columns]

# Ensure the features DataFrame matches the all_features list
features = df.loc[:, all_features]
ratings = df.loc[:, 'stars']

# Split the data (20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, ratings, test_size=0.2, random_state=1)

# Standardize the features: fit the scaler on the training split only and
# reuse those statistics for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model on the standardized features
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make the prediction for Danielle's Delicious Delicacies.
#
# The values are keyed by feature name and then arranged in the model's own
# column order (all_features). The earlier positional array listed review
# length, review age and sentiment first, while all_features starts with
# sentiment, so the first three values were fed to the wrong features.
danielles_values = {
    'average_review_length': 596.463567,
    'average_review_age': 1175.501021,
    'average_review_sentiment': 0.8,
    'review_count': 100,
    'number_funny_votes': 50,
    'number_cool_votes': 50,
    'number_useful_votes': 100,
    'average_number_friends': 200,
    'average_days_on_yelp': 2000,
    'average_number_fans': 20,
    'average_review_count': 200,
    'average_number_years_elite': 1,
    'weekday_checkins': 100,
    'weekend_checkins': 100,
    'average_tip_length': 45.643426,
    'number_tips': 10,
    'average_caption_length': 2.831829,
    'number_pics': 5,
}

# One-row frame whose columns follow all_features exactly; a feature missing
# from danielles_values raises KeyError instead of being silently misaligned.
danielles_delicious_delicacies = pd.DataFrame([danielles_values])[all_features]

# Scale the input features with the scaler fitted on the training data
danielles_delicious_delicacies_scaled = scaler.transform(danielles_delicious_delicacies)

# Predict the rating
predicted_rating = model.predict(danielles_delicious_delicacies_scaled)
print("Predicted Yelp Rating for Danielle's Delicious Delicacies:", predicted_rating[0])

# Evaluate model performance on the standardized training and testing sets
print('Train Score:', model.score(X_train_scaled, y_train))
print('Test Score:', model.score(X_test_scaled, y_test))

# Plot actual vs predicted Yelp ratings
y_predicted = model.predict(X_test_scaled)
plt.scatter(y_test, y_predicted)
plt.xlabel('Actual Yelp Rating')
plt.ylabel('Predicted Yelp Rating')
plt.ylim(1, 5)
plt.xlim(1, 5)  # same range as the y-axis so the reference line is the diagonal
plt.title('Actual vs Predicted Yelp Ratings')
plt.plot([1, 5], [1, 5], color='red')  # Line of perfect prediction
plt.show()

Can you push the notebook to a GitHub repo so people can view the results of the code (rather than just posting here)?

Hi, sorry — this is the GitHub repository: