Machine Learning Capstone Project

As Codecademy requested, I’m posting my Capstone project here:
Presentation
There are certainly countless ways I could have done this better, so any suggestions would be appreciated.

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
df = pd.read_csv("profiles.csv")

# Question 1 (classification): can a user's gender be predicted from their
# status, income, job, height, body type, and orientation?
# Question 2 (regression): can a user's height be predicted from their diet,
# body type, gender, and drug use?

# Encode every categorical column as an integer code. Any value that is not a
# key of the mapping (including missing values) becomes NaN in the *_code
# column.
# NOTE(review): these codes are arbitrary labels, but distance-based models
# such as KNN will treat them as ordered numbers; one-hot encoding may be a
# better fit -- confirm against the project requirements.
df["status_code"] = df.status.map({
    "single": 0,
    # NOTE(review): the public OkCupid profiles data appears to spell this
    # value "seeing someone" (with a space) -- verify against the CSV. Both
    # spellings are accepted so neither variant is silently turned into NaN.
    "seeing someone": 1,
    "seeing_someone": 1,
    "available": 2,
    "married": 3,
    "unknown": 4,
})
df["sex_code"] = df.sex.map({"m": 0, "f": 1})
df["sexuality_code"] = df.orientation.map({"straight": 0, "gay": 1, "bisexual": 2})

# Job codes are derived from list position so that every job is guaranteed a
# distinct code (a hand-numbered dict makes it easy to reuse a number and
# silently merge two categories).
job_categories = [
    "other",
    "student",
    "science / tech / engineering",
    "computer / hardware / software",
    "artistic / musical / writer",
    "sales / marketing / biz dev",
    "medicine / health",
    "education / academia",
    "executive / management",
    "banking / financial / real estate",
    "entertainment / media",
    "law / legal services",
    "hospitality / travel",
    "construction / craftsmanship",
    "clerical / administrative",
    "political / government",
    "rather not say",
    "transportation",
    "unemployed",
    "retired",
    "military",
]
df["job_code"] = df.job.map({job: code for code, job in enumerate(job_categories)})

df["body_code"] = df.body_type.map({
    "average": 0,
    "fit": 1,
    "athletic": 2,
    "thin": 3,
    "curvy": 4,
    "a little extra": 5,
    "skinny": 6,
    "full figured": 7,
    "overweight": 8,
    "jacked": 9,
    "used up": 10,
    "rather not say": 11,
})
# Normalize
# Build the classification dataset (features -> sex_code).
feature_columns = ["status_code", "income", "job_code", "sexuality_code", "height", "body_code"]
label_columns = ["sex_code"]

# Drop incomplete rows from features and labels TOGETHER. Dropping them
# separately and then truncating the labels to a fixed length pairs each
# feature row with some other user's label.
complete_rows = df[feature_columns + label_columns].dropna()
x = complete_rows[feature_columns].values
y = complete_rows[label_columns].values

# Only the features are rescaled to [0, 1]; the labels are class ids and must
# stay untouched.
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)

features = pd.DataFrame(x_scaled, columns=feature_columns)
labels = pd.DataFrame(y, columns=label_columns)

# Number of usable (complete) rows.
print(len(features.index))

# Making data splits
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=1)

# scikit-learn classifiers expect 1-D label arrays; flatten once instead of on
# every iteration of the sweep.
train_targets = y_train.values.ravel()
test_targets = y_test.values.ravel()

# Sweep k (KNN) and max_depth (Random Forest) over the same range and record
# the accuracy of each model on the held-out split.
# NOTE(review): choosing hyperparameters on the test split makes the final
# test scores optimistic; a separate validation split or cross-validation
# would avoid that.
candidate_values = range(1, 151)
knn_scores = []
congress_approval = []
for k in candidate_values:
    knn = KNeighborsClassifier(k)
    knn.fit(X_train, train_targets)
    knn_scores.append(knn.score(X_test, test_targets))
    # I call Random Forests Congress because they run on votes and every tree is confused
    # Fixed random_state so the curve is reproducible between runs.
    congress = RandomForestClassifier(max_depth=k, random_state=1)
    congress.fit(X_train, train_targets)
    congress_approval.append(congress.score(X_test, test_targets))

# Visualize best K/Max depth
plt.plot(candidate_values, knn_scores, label="KNN (k)")
plt.plot(candidate_values, congress_approval, label="Random Forest (max depth)")
plt.xlabel("k/max depth values")
plt.ylabel("Scores")
plt.legend()
plt.show()
# Setting up actual models for analysis
# Flatten the single-column label frames once; the estimators and metrics
# below all work on 1-D label arrays.
train_targets = y_train.values.ravel()
test_targets = y_test.values.ravel()

# KNN
# NOTE(review): k=147 was presumably read off the sweep plot -- confirm it is
# still the best value after any change to the data preparation.
knn_classifier = KNeighborsClassifier(147)
knn_classifier.fit(X_train, train_targets)
knn_predictions = knn_classifier.predict(X_test)
knn_precision = precision_score(test_targets, knn_predictions)
knn_recall = recall_score(test_targets, knn_predictions)
knn_accuracy = accuracy_score(test_targets, knn_predictions)

# Random Forest
# Fixed random_state so the reported metrics are reproducible between runs.
congress_classifier = RandomForestClassifier(random_state=1)
congress_classifier.fit(X_train, train_targets)
congress_predictions = congress_classifier.predict(X_test)
congress_precision = precision_score(test_targets, congress_predictions)
congress_recall = recall_score(test_targets, congress_predictions)
congress_accuracy = accuracy_score(test_targets, congress_predictions)

print(f'''KNN Precision: {knn_precision}
KNN Recall: {knn_recall}
KNN Accuracy: {knn_accuracy}
RF Precision: {congress_precision}
RF Recall: {congress_recall}
RF Accuracy: {congress_accuracy}''')
# Integer codes for the diet and drug columns: each category's code is its
# position in the list. Values not listed (including missing ones) map to NaN.
diet_categories = [
    "mostly anything",
    "anything",
    "strictly anything",
    "mostly vegetarian",
    "mostly other",
    "strictly vegetarian",
    "vegetarian",
    "strictly other",
    "mostly vegan",
    "other",
    "strictly vegan",
    "vegan",
    "mostly kosher",
    "mostly halal",
    "strictly halal",
    "strictly kosher",
    "halal",
    "kosher",
]
diet_lookup = {diet: code for code, diet in enumerate(diet_categories)}
df["diet_code"] = df["diet"].map(diet_lookup)

drug_categories = ["never", "sometimes", "often"]
drug_lookup = {usage: code for code, usage in enumerate(drug_categories)}
df["drug_code"] = df["drugs"].map(drug_lookup)
# Normalize and adjust for regression
# Build the regression dataset (features -> height).
reg_feature_columns = ["diet_code", "body_code", "drug_code", "sex_code"]
reg_label_columns = ["height"]

# Drop incomplete rows from features and labels TOGETHER so that every feature
# row keeps its own height. Dropping them separately and truncating the labels
# to a fixed length misaligns the two frames.
reg_rows = df[reg_feature_columns + reg_label_columns].dropna()
rx = reg_rows[reg_feature_columns].values
ry = reg_rows[reg_label_columns].values

# A dedicated scaler keeps the classification scaler's fit intact. Only the
# features are rescaled; height stays in its original unit so the reported
# errors remain interpretable.
reg_scaler = preprocessing.MinMaxScaler()
rx_scaled = reg_scaler.fit_transform(rx)

# Use the scaled features (the unscaled array was being used by mistake).
reg_features = pd.DataFrame(rx_scaled, columns=reg_feature_columns)
reg_labels = pd.DataFrame(ry, columns=reg_label_columns)

# Number of usable (complete) rows.
print(len(reg_features.index))

# Recreate training data for regression
# (rebinds the X_train / X_test / y_train / y_test names to the height data)
X_train, X_test, y_train, y_test = train_test_split(
    reg_features,
    reg_labels,
    random_state=1,
)

# K-Nearest Neighbors Regressor with scikit-learn's default settings
regressor = KNeighborsRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Actual vs. predicted height on the held-out split
knn_figure, knn_axes = plt.subplots(figsize=(10, 10))
knn_axes.scatter(y_test, y_pred, c='crimson')
knn_axes.set_xlabel('Actual Height', fontsize=15)
knn_axes.set_ylabel('Predicted Height', fontsize=15)
knn_axes.set_title("KNeighborsRegressor Actual v. Predictions")
plt.show()

# Error metrics for the KNN regressor
error = mean_absolute_error(y_test, y_pred)
sq_error = mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {error}\nMean Squared Error: {sq_error}")
# Logistic Regression Model
# NOTE(review): LogisticRegression is a classifier, so each distinct height
# value is treated as its own class rather than as a continuous quantity.
# This presumably relies on heights being whole numbers -- confirm; a true
# regressor (e.g. LinearRegression) would suit a continuous target better.
log_regressor = LogisticRegression(max_iter=1000)
log_regressor.fit(X_train, y_train.values.ravel())

log_pred = log_regressor.predict(X_test)

# Scatter (not a line plot): the points are unordered (actual, predicted)
# pairs, so joining consecutive points with lines would only draw a scribble.
plt.figure(figsize=(10, 10))
plt.scatter(y_test, log_pred, c='cyan')

plt.xlabel('Actual Height', fontsize=15)
plt.ylabel('Predicted Height', fontsize=15)
plt.title("LogisticRegression Actual v. Predictions")

plt.show()

# Statistics for Logistic Regression
log_error = mean_absolute_error(y_test, log_pred)
sq_log_error = mean_squared_error(y_test, log_pred)
print(f'''Mean Absolute Error: {log_error}
Mean Squared Error: {sq_log_error}''')