Hello, my Data Science friends!
I'm posting my first attempt at the first portfolio project in our career path for your likes and critical review.
Feel free to share any of your thoughts. Thanks)
U.S. Medical Insurance Costs
Project Goals:
- Open and transform CSV file into Python lists
- Find average insurance cost
- Find average age of the patients
- Find the difference in average insurance cost between smokers and non-smokers
- Find linear regression equations for age/smoking status/bmi/sex and insurance costs that produce the least error
- Output results
#1. Open and transform CSV file into Python lists
import csv
# Column-wise storage: one list per CSV column, index-aligned across lists.
# Values are stored exactly as read from the file, i.e. as strings.
ages = []
sex = []
bmis = []
num_children = []
smoker_status = []
regions = []
insurance_costs = []

# newline='' lets the csv module handle line endings itself, as its
# documentation recommends for files passed to csv.reader.
with open('insurance.csv', newline='') as insurance_csv:
    reader = csv.reader(insurance_csv)
    data = list(reader)

for row in data:
    # csv.reader yields [] for blank lines; indexing those would raise
    # IndexError. The header row is recognised by its first cell, 'age'.
    if not row or row[0] == 'age':
        continue
    ages.append(row[0])
    sex.append(row[1])
    bmis.append(row[2])
    num_children.append(row[3])
    smoker_status.append(row[4])
    regions.append(row[5])
    insurance_costs.append(row[6])
#2. Find average insurance cost
def find_average_cost(costs_list):
    """Return the arithmetic mean of the given insurance costs.

    Args:
        costs_list: Sized collection of costs. Each item is passed through
            float(), so numbers and numeric strings are both accepted.

    Returns:
        The mean cost as a float.

    Raises:
        ZeroDivisionError: If costs_list is empty.
        ValueError: If an item cannot be converted with float().
    """
    total_cost = sum(float(cost) for cost in costs_list)
    return total_cost / len(costs_list)
# Mean cost over the whole dataset, rounded to two decimal places for display.
avg_cost = round(find_average_cost(insurance_costs), 2)
print(f"Average insurance cost is {avg_cost} dollars.")
#Average insurance cost is 13270.42 dollars.
#3. Find average age of patients
def find_average_age(ages_list):
    """Return the arithmetic mean of the given ages.

    Args:
        ages_list: Sized collection of ages. Each item is passed through
            int(), so integers and integer strings are both accepted.

    Returns:
        The mean age as a float (true division is used).

    Raises:
        ZeroDivisionError: If ages_list is empty.
        ValueError: If an item cannot be converted with int().
    """
    total_age = sum(int(age) for age in ages_list)
    return total_age / len(ages_list)
# Mean age rounded to the nearest whole year for display.
avg_age = round(find_average_age(ages))
print(f"Average age of patients is {avg_age} years.")
#Average age of patients is 39 years.
#4. Find the difference in cost for smokers and non-smokers
def find_difference_smokers(smoker_list, costs_list):
    """Compare the average cost of smokers with that of non-smokers.

    Args:
        smoker_list: Smoking status per patient; the string 'yes' marks a
            smoker, and any other value is counted as a non-smoker.
        costs_list: Cost per patient, index-aligned with smoker_list. Each
            item is converted with float().

    Returns:
        A two-item list [diff, message]: diff is the signed difference
        (smokers minus non-smokers), rounded to two decimals and rendered
        as a string; message is a sentence describing the result.

    Raises:
        ZeroDivisionError: If there are no smokers or no non-smokers.
        IndexError: If costs_list is shorter than smoker_list.
    """
    total_cost_smoker = 0
    count_smoker = 0
    total_cost_nonsmoker = 0
    count_nonsmoker = 0
    for i, status in enumerate(smoker_list):
        cost = float(costs_list[i])
        if status == 'yes':
            total_cost_smoker += cost
            count_smoker += 1
        else:
            total_cost_nonsmoker += cost
            count_nonsmoker += 1

    avg_cost_smoker = total_cost_smoker / count_smoker
    avg_cost_nonsmoker = total_cost_nonsmoker / count_nonsmoker
    diff = round(avg_cost_smoker - avg_cost_nonsmoker, 2)

    if diff > 0:
        result = "Smokers pay " + str(diff) + "$ more than non-smokers on average."
    elif diff < 0:
        # diff is negative here; report its magnitude so the sentence does
        # not read "pay -50.0$ more".
        result = "Non-smokers pay " + str(abs(diff)) + "$ more than smokers on average.(?!)"
    else:
        result = "Smokers and non-smokers have absolutely no difference in insurance costs on average!"
    return [str(diff), result]
# The function returns [difference, sentence]; only the sentence is printed.
find_diff = find_difference_smokers(
    smoker_list=smoker_status,
    costs_list=insurance_costs,
)[-1]
print(find_diff)
#Smokers pay 23615.96$ more than non-smokers on average.
#5. Find linear regression equations for age/bmi and insurance costs that produce the least error
# Define single error function
def find_single_error(m, b, xy):
    """Return the signed error of the line y = m*x + b at one data point.

    Args:
        m: Slope of the line.
        b: Intercept of the line.
        xy: Indexable pair whose item 0 is x and item 1 is the observed y.

    Returns:
        Predicted y minus observed y: positive when the line lies above the
        point, negative when it lies below.
    """
    x = xy[0]
    y = xy[1]
    return m * x + b - y
# Quick manual check of find_single_error on one hand-picked point.
print(find_single_error(m=2, b=5, xy=(3, 10)))
#Define total error function
def find_total_error(m, b, data):
    """Return the net signed error of the line y = m*x + b over all points.

    Args:
        m: Slope of the line.
        b: Intercept of the line.
        data: Iterable of (x, y) points, each passed to find_single_error.

    Returns:
        The sum of the per-point results of find_single_error, each
        converted with float(). Because the terms are signed, errors of
        opposite sign cancel: a total near zero does not by itself mean the
        line is close to every point. Returns 0 for empty data.
    """
    total = 0
    for point in data:
        total += float(find_single_error(m, b, point))
    return total
#Define find "best linear function" function
def find_best_function(m_list, b_list, data):
    """Grid-search the line y = m*x + b with the smallest total absolute error.

    Every (m, b) combination from the two candidate lists is scored by the
    sum of |m*x + b - y| over all points, and the lowest-scoring pair wins.
    Absolute values are taken per point, so over- and under-estimates cannot
    cancel each other out.

    Args:
        m_list: Iterable of candidate slopes.
        b_list: Iterable of candidate intercepts.
        data: Iterable of points; item 0 of each point is x, item 1 is y.

    Returns:
        [best_m, best_b, best_total]. On ties the first candidate
        encountered (m outer loop, b inner loop) is kept. If either
        candidate list is empty, the result is [0, 0, inf].
    """
    # Both sequences are re-scanned for every candidate; materialising them
    # keeps one-shot iterators such as zip(...) from being exhausted after
    # the first pass.
    points = list(data)
    intercepts = list(b_list)

    # Start from infinity so the first candidate always becomes the current
    # best, however large the errors in the data are.
    best_total = float("inf")
    best_m = 0
    best_b = 0
    for m in m_list:
        for b in intercepts:
            candidate_error = sum(
                abs(float(m * point[0] + b - point[1])) for point in points
            )
            if candidate_error < best_total:
                best_total = candidate_error
                best_m = m
                best_b = b
    return [best_m, best_b, best_total]
#Define range of variables
# Numeric copies of the columns used by the regression search. Costs, ages and
# bmis are rounded to two decimals after conversion; child counts are not.
insurance_costs_float = [round(value, 2) for value in map(float, insurance_costs)]
ages_float = [round(value, 2) for value in map(float, ages)]
bmis_float = [round(value, 2) for value in map(float, bmis)]
children_float = list(map(float, num_children))
def find_average(list):
    """Return the arithmetic mean of a collection of numeric values.

    Args:
        list: Sized collection of numbers or numeric strings; each item is
            converted with float(). (The parameter name shadows the builtin
            and is kept only so existing keyword callers keep working.)

    Returns:
        The mean as a float.

    Raises:
        ZeroDivisionError: If the collection is empty.
        ValueError: If an item cannot be converted with float().
    """
    return sum(float(item) for item in list) / len(list)
#Find ranges and averages of the variables
avg_cost = find_average(insurance_costs_float)
avg_age = find_average(ages)
avg_bmi = find_average(bmis)
# ages and bmis hold the strings read from the CSV, so a plain min()/max()
# would compare them lexicographically (e.g. '9' > '10'). key=float orders
# them numerically while still printing each value as it appears in the file.
print(f"Min-max costs: {min(insurance_costs_float)}-{max(insurance_costs_float)}$, average cost is {round(avg_cost, 2)}")
print(f"Min-max ages: {min(ages, key=float)}-{max(ages, key=float)}, average age is {round(avg_age, 2)}")
print(f"Min-max bmis: {min(bmis, key=float)}-{max(bmis, key=float)}, average bmi is {round(avg_bmi, 2)}")
# Candidate slopes (200..299) and intercepts (1000..5950, step 50) for the
# brute-force search below.
ms = list(range(200, 300))
bs = list(range(1000, 6000, 50))
#make data lists for age-cost and bmi-cost in [(age, cost), (age, cost)] format
age_cost_zip = list(zip(ages_float, insurance_costs_float))
bmi_cost_zip = list(zip(bmis_float, insurance_costs_float))
best_function_age = find_best_function(ms, bs, age_cost_zip)
best_function_bmi = find_best_function(ms, bs, bmi_cost_zip)
print("I found that finding linear regression requires a lot of computing power, that's the reason why following results are far from being accurate. Anyway, we can change ms and bs formulas to include more instances, because the logic behind resulting function is fine.")
print(f"Age function is: {best_function_age[0]}x + {best_function_age[1]} = y, with total_error of {best_function_age[2]}")
print(f"Bmi function is: {best_function_bmi[0]}x + {best_function_bmi[1]} = y, with total_error of {best_function_bmi[2]}")
print("Linear regression for children, sex and smoking status is irrelevant, because total cost varies too much for too tiny number of variables")
#6. Output results
# Goal 2: average insurance cost.
print(f"Average insurance cost for target dataset is {round(avg_cost, 2)}$.")
# Goal 3: average age of the patients.
print(f"Average age of the patients is {round(avg_age, 2)}")
# Goal 4: cost difference between smokers and non-smokers (summary sentence only).
print(find_difference_smokers(smoker_list=smoker_status, costs_list=insurance_costs)[-1])
# Goal 5: best lines found by the grid search, as [m, b, total_error].
print(f"Best age function is: {best_function_age[0]}x + {best_function_age[1]} = y, with total_error of {best_function_age[2]}")
print(f"Best bmi function is: {best_function_bmi[0]}x + {best_function_bmi[1]} = y, with total_error of {best_function_bmi[2]}")
print("Linear regression for children, sex and smoking status is irrelevant, because total cost varies too much for too tiny number of variables")
# Closing summary; the three pieces are emitted as consecutive lines.
print(
    "We found, that:",
    "1. Smokers pay much higher insurance premium than non-smokers, because their health risks are much higher.",
    "2. Bmi and age have nearly the same influence on insurance premium. The difference is that bmi is a factor we can control, while age isn't.",
    sep="\n",
)
#7. Organize into dictionaries
# Collect the headline numbers in one dictionary. The regression entries keep
# only the first two items of each result, i.e. [m, b] without the total error.
results = {
    "Average cost": round(avg_cost, 2),
    "Average age": round(avg_age, 2),
    "Smoking penalty": find_difference_smokers(smoker_status, insurance_costs)[0],
    "Bmi linear regression": best_function_bmi[:2],
    "Age linear regression": best_function_age[:2],
}
print(results)
Good luck to all of us)