Credit Card Fraud Detection — XGBoost Classifier
Mar 3, 2022
Dataset link: https://www.kaggle.com/mlg-ulb/creditcardfraud
Project link: https://github.com/ParisRohan/DataScience_Projects/blob/main/credit-card-fraud-detection.ipynb
df_data = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
df_data.head()
#Get the count of each class in the dependent (target) feature
df_data['Class'].value_counts()
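value_counts shows how skewed the classes are: in this well-known dataset, only 492 of the 284,807 transactions (about 0.17%) are frauds. A minimal sketch to express that as a rate, assuming the DataFrame loaded above:

#Fraction of fraudulent transactions (Class == 1)
fraud_rate = df_data['Class'].mean()
print(f"Fraud rate: {fraud_rate:.4%}")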
#Get count of missing values in each column
def get_cols_with_missing_values(dataframe):
    # Count nulls per column and keep only the columns that have any
    missing_na_columns = dataframe.isnull().sum()
    return missing_na_columns[missing_na_columns > 0]

print(get_cols_with_missing_values(df_data))
The function returns an empty series, so the dataset has no missing values.
df_data.info()
df_data.info() shows that every feature is numeric (V1 through V28 are anonymized PCA components, plus Time and Amount), so there are no categorical features to encode.
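A one-line sanity check (a minimal sketch) makes the same point:

print(df_data.dtypes.value_counts())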
X = df_data.loc[:, df_data.columns!='Class']
y = df_data['Class']
# Break off validation set from training data
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)
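Note that with frauds this rare, a plain random split can leave the validation set with only a handful of positives. A stratified split (not used in the original notebook, but a common safeguard) keeps the class ratio identical in both halves:

# Same split, but preserving the fraud/legitimate ratio in both sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=0, stratify=y
)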
Next, we tune the model's hyperparameters with RandomizedSearchCV.
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost

#Hyperparameter search space
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    "min_child_weight": [1, 3, 5, 7, 9],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7]
}

classifier = xgboost.XGBClassifier()
random_search = RandomizedSearchCV(classifier, param_distributions=params,
                                   n_iter=5, n_jobs=-1, cv=5, verbose=0)
random_search.fit(X_train, y_train)
random_search.best_estimator_
RandomizedSearchCV fits 5 randomly sampled combinations from this grid with 5-fold cross-validation; best_estimator_ reports the best configuration it found, which we reuse below.
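One caveat: RandomizedSearchCV scores candidates with the estimator's default metric (accuracy here), which is nearly meaningless on data this imbalanced. A hedged variant that ranks candidates by F1 on the fraud class could look like this:

# Same search, but ranked by F1 on the positive (fraud) class
random_search = RandomizedSearchCV(classifier, param_distributions=params,
                                   n_iter=5, scoring='f1', n_jobs=-1, cv=5, verbose=0)
random_search.fit(X_train, y_train)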
#Build a model using the optimized hyperparameters
classifier = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                   colsample_bynode=1, colsample_bytree=0.4,
                                   enable_categorical=False, gamma=0.1, gpu_id=-1,
                                   importance_type=None, interaction_constraints='',
                                   learning_rate=0.3, max_delta_step=0, max_depth=20,
                                   min_child_weight=7, monotone_constraints='()',
                                   n_estimators=100, n_jobs=4, num_parallel_tree=1, predictor='auto',
                                   random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                   subsample=1, tree_method='exact', validate_parameters=1,
                                   verbosity=None)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_valid)
print(confusion_matrix(y_valid, y_pred))
True Negatives = 85290 (legitimate transactions correctly passed)
True Positives = 112 (frauds correctly flagged)
False Positives = 6 (legitimate transactions wrongly flagged)
False Negatives = 35 (frauds missed)
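Working the headline metrics out by hand from these counts: precision = TP / (TP + FP) = 112 / 118 ≈ 0.949, and recall = TP / (TP + FN) = 112 / 147 ≈ 0.762. So almost every fraud alert is genuine, but roughly 24% of frauds still slip through.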
from sklearn.metrics import accuracy_score
print(accuracy_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))
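Accuracy lands around 99.95% here simply because legitimate transactions dominate, so the per-class precision and recall in the classification report are the numbers to watch. Two follow-ups worth trying, sketched below as assumptions for illustration rather than part of the original notebook: reweight the positive class with XGBoost's scale_pos_weight, and evaluate with average precision (PR-AUC), which is robust to the imbalance.

from sklearn.metrics import average_precision_score

# Common heuristic: weight positives by the negative/positive ratio
ratio = (y_train == 0).sum() / (y_train == 1).sum()
weighted_clf = xgboost.XGBClassifier(scale_pos_weight=ratio, random_state=0)
weighted_clf.fit(X_train, y_train)

# Rank by fraud probability rather than the hard 0/1 prediction
y_scores = weighted_clf.predict_proba(X_valid)[:, 1]
print(average_precision_score(y_valid, y_scores))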