I tried reducing the parameters to the five with the largest coefficients (I used the absolute values of the coefficients, since I don’t know how the +/- signs of the coefficients relate to one another). I then made a new DataFrame of x values with just those five columns and fit a linear regression on it, and the R^2 dropped to 0.61. My guess was wrong.
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 20:20:50 2020
@author: 12253
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import numpy as np
# Load the dataset from manhattan.csv.
df = pd.read_csv('manhattan.csv')

# Predictor columns fed to the regression, in the order the model will see them.
feature_columns = [
    'bedrooms',
    'bathrooms',
    'size_sqft',
    'min_to_subway',
    'floor',
    'building_age_yrs',
    'no_fee',
    'has_roofdeck',
    'has_washer_dryer',
    'has_doorman',
    'has_elevator',
    'has_dishwasher',
    'has_patio',
    'has_gym',
]
x = df[feature_columns]

# Response variable the predictors are regressed against.
y = df['rent']

# Hold out 20% of the rows for testing; the other 80% are used to fit the model.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2
)

# Fit the multiple linear regression (fit() returns the estimator itself),
# then predict the response for the held-out rows.
mlr = LinearRegression().fit(x_train, y_train)
y_predict = mlr.predict(x_test)
# Predicted vs. actual values on the held-out rows. The actual values (y_test)
# are passed first, so they land on the x-axis, and the model's predictions
# (y_predict) land on the y-axis; the axis labels must name them in that order.
plt.scatter(y_test, y_predict)
plt.xlabel('Actual Price ($)')
plt.ylabel('Predicted Price ($)')
plt.title('Predicted Prices vs Actual Prices')
plt.show()
# Feature names, taken from the feature DataFrame itself instead of a second
# hand-typed list, so they always match the column order the model was fit on
# (and therefore the order of mlr.coef_).
labels = list(x.columns)

# One scatter plot per feature against rent, to eyeball each relationship.
# The loop variable is deliberately not named `x`: that would overwrite the
# feature DataFrame with a column-name string.
# NOTE(review): the pasted source lost its indentation; plt.show() is assumed
# to belong inside the loop so each feature gets its own figure.
for label in labels:
    plt.scatter(df[label], df['rent'], alpha=0.4)
    plt.title(label)
    plt.show()
# Report R^2 on the training rows, then on the held-out rows, followed by the
# fitted coefficients (one per feature column).
train_r2 = mlr.score(x_train, y_train)
test_r2 = mlr.score(x_test, y_test)
print(train_r2)
print(test_r2)
print(mlr.coef_)
coefs = mlr.coef_
coefs_abs = np.abs(coefs)

# Pick the five features that move the prediction the most.
# A raw coefficient is "change in rent per ONE UNIT of the feature", so its
# size depends on the feature's units: a column with a wide numeric range
# (e.g. size_sqft) gets a small coefficient even if it matters a lot, while a
# narrow-range column (presumably the 0/1 has_* flags) gets a large one.
# Comparing raw |coef| values therefore ranks features by scale, not influence.
# Multiplying |coef| by the feature's standard deviation on the training rows
# converts every entry to "change in rent per one standard deviation", which
# is comparable across features.
feature_spread = x_train.std().to_numpy()
scaled_importance = coefs_abs * feature_spread

# (importance, label) pairs; labels come from x_train.columns so they are
# guaranteed to line up with mlr.coef_. Sorted ascending, so the five most
# influential features are the last five entries.
combined = list(zip(scaled_importance, x_train.columns))
sorted_combined = sorted(combined)
top_parameters = sorted_combined[-5:]
top_parameters_labels = [label for _, label in top_parameters]
# Refit using only the selected features, on the SAME train/test rows as the
# full model. Drawing a fresh random split here would (a) score the two models
# on different rows, so their R^2 values could not be compared directly, and
# (b) allow rows that influenced the feature selection (mlr's training rows)
# to end up in the new test set.
X = df[top_parameters_labels]  # all rows, reduced columns; kept for later reference
x_train2 = x_train[top_parameters_labels]
x_test2 = x_test[top_parameters_labels]
y_train2, y_test2 = y_train, y_test

mlr2 = LinearRegression()
mlr2.fit(x_train2, y_train2)
y_predict2 = mlr2.predict(x_test2)

# R^2 on training rows, R^2 on held-out rows, then the reduced model's coefficients.
print(mlr2.score(x_train2, y_train2))
print(mlr2.score(x_test2, y_test2))
print(mlr2.coef_)