Hi!
I just finalised my code for the “Predicting Credit Card Fraud” project on the Data Science: Machine Learning Specialist skill path, but my model score and predictions don’t add up (…they seem too good). I have tried to identify where my error is, but I can’t figure it out. I’ve enclosed my code and would love some help (…and feedback!).
import seaborn
import pandas as pd
import numpy as np
import codecademylib3
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the data
transactions = pd.read_csv('transactions.csv')
print(transactions.head(10))
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly. Printing the uncalled attribute (transactions.info) only
# shows the bound-method repr, not the column/dtype summary.
transactions.info()
# Summary statistics on amount column
print(transactions['amount'].describe())
# Create isPayment field
def cond_isPayment(x):
    """Return 1 if transaction type ``x`` is 'PAYMENT' or 'DEBIT', otherwise 0."""
    # Both types map to the same flag, so a single membership test replaces
    # the duplicated if/elif branches.
    return 1 if x in ('PAYMENT', 'DEBIT') else 0
# Wrap the scalar rule so it can be applied to the whole 'type' column at once.
func = np.vectorize(cond_isPayment)
# Store the resulting 0/1 flags both as a standalone array and as a new column.
transactions['isPayment'] = isPayment = func(transactions['type'])
# Quick visual check of the first ten flags.
print(transactions['isPayment'].iloc[:10])
# Create isMovement field
def cond_isMovement(x):
    """Return 1 if transaction type ``x`` is 'CASH_OUT' or 'TRANSFER', otherwise 0."""
    # Both types map to the same flag, so a single membership test replaces
    # the duplicated if/elif branches.
    return 1 if x in ('CASH_OUT', 'TRANSFER') else 0
# Wrap the scalar rule so it can be applied to the whole 'type' column at once.
func = np.vectorize(cond_isMovement)
# Store the resulting 0/1 flags both as a standalone array and as a new column.
transactions['isMovement'] = isMovement = func(transactions['type'])
# Quick visual check of the first five flags.
print(transactions['isMovement'].iloc[:5])
# Create accountDiff field
# Use the magnitude of the gap between the origin and destination balances:
# the feature is meant to capture how *different* the two accounts are, and a
# signed value would treat "origin richer" and "destination richer" as opposite
# evidence.
# NOTE(review): confirm the exercise asks for the absolute difference.
transactions['accountDiff'] = (
    transactions['oldbalanceOrg'] - transactions['oldbalanceDest']
).abs()
print(transactions['accountDiff'].head(5))
# Feature matrix: the four model inputs, in the order the model will see them.
X = transactions.loc[:, ['amount', 'isPayment', 'isMovement', 'accountDiff']]
print(X.iloc[:5])
# Label vector: the fraud flag the model is trained to predict.
y = transactions.loc[:, 'isFraud']
print(y.iloc[:5])
# Split dataset: 70% for training, 30% held out for testing.
# stratify=y keeps the proportion of each label the same in both parts, which
# matters when one class is rare (check with y.value_counts()); otherwise the
# test set can end up with very few positive examples.
# random_state pins the shuffle so the scores are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# Standardise the features.
# The scaler learns its mean/variance from the training rows only, and the same
# learned statistics are then applied to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fit the model to the training data
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
# Score the model on the training data (score() reports mean accuracy)
print(lr.score(X_train_scaled, y_train))
# Score the model on the test data (mean accuracy)
print(lr.score(X_test_scaled, y_test))
# Accuracy alone can look excellent when one class is rare: a model that
# always predicts the majority class already scores close to 1.0.
# NOTE(review): check y.value_counts() to see how imbalanced isFraud is.
# The confusion matrix (rows = true class, columns = predicted class) shows
# how many rows of each class were actually predicted correctly.
print(confusion_matrix(y_test, lr.predict(X_test_scaled)))
# Print the model coefficients (one per feature column of X, in column order)
print(lr.coef_)
# New transaction data: three hand-written rows of four feature values each,
# intended to line up with the feature columns selected for X.
transaction1, transaction2, transaction3 = (
    np.array(values)
    for values in (
        [123456.78, 0.0, 1.0, 54670.1],
        [98765.43, 1.0, 0.0, 8524.75],
        [543678.31, 1.0, 0.0, 510025.5],
    )
)
# Combine the rows into a single 2-D array (one transaction per row).
sample_transactions = np.array((transaction1, transaction2, transaction3))
print(sample_transactions)
# Normalize the new transactions with the statistics the scaler already
# learned from the training data. Use transform(), not fit_transform():
# re-fitting here would standardise the three rows against their own mean and
# variance, so the model would see values on a different scale from the one it
# was trained on and its predictions would be meaningless.
sample_transactions_scaled = scaler.transform(sample_transactions)
# Predict fraud on the new transactions
predicted_fraud = lr.predict(sample_transactions_scaled)
print(predicted_fraud)
# Show probabilities on the new transactions
# (one row per transaction; columns follow the order of lr.classes_)
predicted_prob_fraud = lr.predict_proba(sample_transactions_scaled)
print(predicted_prob_fraud)