Credit Card Fraud Detection — XGBoost Classifier
Mar 3, 2022
Dataset link: https://www.kaggle.com/mlg-ulb/creditcardfraud
Project link: https://github.com/ParisRohan/DataScience_Projects/blob/main/credit-card-fraud-detection.ipynb
df_data = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
df_data.head()
#Get the count of each class in the dependent (target) feature
df_data['Class'].value_counts()
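value_counts shows how skewed the classes are: in this well-known dataset, only 492 of the 284,807 transactions (about 0.17%) are frauds. A minimal sketch to express that as a rate, assuming the DataFrame loaded above:

#Fraction of fraudulent transactions (Class == 1)
fraud_rate = df_data['Class'].mean()
print(f"Fraud rate: {fraud_rate:.4%}")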
#Get count of missing values in each column
def get_cols_with_missing_values(dataframe):
    # Count nulls per column and keep only the columns that have any
    missing_na_columns = dataframe.isnull().sum()
    return missing_na_columns[missing_na_columns > 0]

print(get_cols_with_missing_values(df_data))
The function returns an empty series, so the dataset has no missing values.
df_data.info()
df_data.info() shows that every feature is numeric (V1 through V28 are anonymized PCA components, plus Time and Amount), so there are no categorical features to encode.
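A one-line sanity check (a minimal sketch) makes the same point:

print(df_data.dtypes.value_counts())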
X = df_data.loc[:, df_data.columns!='Class']
y = df_data['Class']
# Break off validation set from training data
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)
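Note that with frauds this rare, a plain random split can leave the validation set with only a handful of positives. A stratified split (not used in the original notebook, but a common safeguard) keeps the class ratio identical in both halves:

# Same split, but preserving the fraud/legitimate ratio in both sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=0, stratify=y
)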
Next, we tune the model's hyperparameters with RandomizedSearchCV.
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost

#Hyperparameter search space
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    "min_child_weight": [1, 3, 5, 7, 9],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7]
}

classifier = xgboost.XGBClassifier()
random_search = RandomizedSearchCV(classifier, param_distributions=params,
                                   n_iter=5, n_jobs=-1, cv=5, verbose=0)
random_search.fit(X_train, y_train)
random_search.best_estimator_
RandomizedSearchCV fits 5 randomly sampled combinations from this grid with 5-fold cross-validation; best_estimator_ reports the best configuration it found, which we reuse below.
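One caveat: RandomizedSearchCV scores candidates with the estimator's default metric (accuracy here), which is nearly meaningless on data this imbalanced. A hedged variant that ranks candidates by F1 on the fraud class could look like this:

# Same search, but ranked by F1 on the positive (fraud) class
random_search = RandomizedSearchCV(classifier, param_distributions=params,
                                   n_iter=5, scoring='f1', n_jobs=-1, cv=5, verbose=0)
random_search.fit(X_train, y_train)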
#Build a model using the optimized hyperparameters
classifier = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                   colsample_bynode=1, colsample_bytree=0.4,
                                   enable_categorical=False, gamma=0.1, gpu_id=-1,
                                   importance_type=None, interaction_constraints='',
                                   learning_rate=0.3, max_delta_step=0, max_depth=20,
                                   min_child_weight=7, monotone_constraints='()',
                                   n_estimators=100, n_jobs=4, num_parallel_tree=1, predictor='auto',
                                   random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                   subsample=1, tree_method='exact', validate_parameters=1,
                                   verbosity=None)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_valid)
print(confusion_matrix(y_valid, y_pred))
True Negatives = 85290 (legitimate transactions correctly passed)
True Positives = 112 (frauds correctly flagged)
False Positives = 6 (legitimate transactions wrongly flagged)
False Negatives = 35 (frauds missed)
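Working the headline metrics out by hand from these counts: precision = TP / (TP + FP) = 112 / 118 ≈ 0.949, and recall = TP / (TP + FN) = 112 / 147 ≈ 0.762. So almost every fraud alert is genuine, but roughly 24% of frauds still slip through.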
from sklearn.metrics import accuracy_score
print(accuracy_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))
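Accuracy lands around 99.95% here simply because legitimate transactions dominate, so the per-class precision and recall in the classification report are the numbers to watch. Two follow-ups worth trying, sketched below as assumptions for illustration rather than part of the original notebook: reweight the positive class with XGBoost's scale_pos_weight, and evaluate with average precision (PR-AUC), which is robust to the imbalance.

from sklearn.metrics import average_precision_score

# Common heuristic: weight positives by the negative/positive ratio
ratio = (y_train == 0).sum() / (y_train == 1).sum()
weighted_clf = xgboost.XGBClassifier(scale_pos_weight=ratio, random_state=0)
weighted_clf.fit(X_train, y_train)

# Rank by fraud probability rather than the hard 0/1 prediction
y_scores = weighted_clf.predict_proba(X_valid)[:, 1]
print(average_precision_score(y_valid, y_scores))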