Our team is analyzing loan prediction data based on customer behavior to see which features have an impact on predicting who would be a likely defaulter (someone who cannot repay a loan). We initially performed some EDA on the overall dataset to understand some of the trends in the behavior and in specific features. Due to the large size of our initial train and test datasets (270,000 obs.), we chose to resample them by taking 10% of the original observations (20,000 obs.) and consequently splitting those into train and test sets using a 70/30 split respectively.
We then ran various machine learning models on our data to and found the one that would provide the best and most accurate predictions using the given/most important variables.
Build machine learning models to predict who are possible defaulters for the consumer loans product based on historic customer behavior and their background information such as income, age, professional experience, profession, whether married or single and etc.
Dataset from Kaggle Provided by Univ.AI for a Hackathon Source: https://www.kaggle.com/subhamjain/loan-prediction-based-on-customer-behavior?select=Training+Data.csv
library(data.table)
library(ggplot2)
library(ggthemes)
library(scales)
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
library(gbm)
## Loaded gbm 2.1.8.1
library(caTools)
library(ROCR)
library(rpart)
library(rpart.plot)
library(SmartEDA)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
theme_set(theme_bw())
data <- fread("./Training-Data.csv")
head(data,n=10)
str(data)
## Classes 'data.table' and 'data.frame': 252000 obs. of 13 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Income : int 1303834 7574516 3991815 6256451 5768871 6915937 3954973 1706172 7566849 8964846 ...
## $ Age : int 23 40 66 41 47 64 58 33 24 23 ...
## $ Experience : int 3 10 4 2 11 0 14 2 17 12 ...
## $ Married/Single : chr "single" "single" "married" "single" ...
## $ House_Ownership : chr "rented" "rented" "rented" "rented" ...
## $ Car_Ownership : chr "no" "no" "no" "yes" ...
## $ Profession : chr "Mechanical_engineer" "Software_Developer" "Technical_writer" "Software_Developer" ...
## $ CITY : chr "Rewa" "Parbhani" "Alappuzha" "Bhubaneswar" ...
## $ STATE : chr "Madhya_Pradesh" "Maharashtra" "Kerala" "Odisha" ...
## $ CURRENT_JOB_YRS : int 3 9 4 2 3 0 8 2 11 5 ...
## $ CURRENT_HOUSE_YRS: int 13 13 10 12 14 12 12 14 11 13 ...
## $ Risk_Flag : int 0 0 0 1 1 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
## Make sample dataset-1: Based on original dataset ratio
# Get Risk_Flag original dataset ratio
total <- length(data$Risk_Flag)
true <- length(data$Risk_Flag[data$Risk_Flag==1])
false <- length(data$Risk_Flag[data$Risk_Flag==0])
# Calculate dataset ratio
true_ratio <- true/total
false_ratio <- false/total
# Get sample dataset size for 0 and 1
true_smp_size <- floor(true_ratio * 20000)
false_smp_size <- floor(false_ratio * 20000)
# set seed
set.seed(123)
# Split dataset for Risk_Flag=1 and Risk_Flag=0
true_ind <- sample(seq_len(nrow(data[data$Risk_Flag==1])), size = true_smp_size)
false_ind <- sample(seq_len(nrow(data[data$Risk_Flag==0])), size = false_smp_size)
# Split the data frames
true_smp <- (data[data$Risk_Flag==1])[true_ind, ]
false_smp <- (data[data$Risk_Flag==0])[false_ind, ]
# Create sample dataset size
sample1 <- rbind(false_smp, true_smp)
# Randomize sample's order
sample1 <- sample1[sample(nrow(sample1)),]
data <- sample1
str(data)
## Classes 'data.table' and 'data.frame': 20000 obs. of 13 variables:
## $ Id : int 128148 251932 2974 113286 6353 74710 6475 161601 156002 119339 ...
## $ Income : int 5118418 9817339 4104566 2207354 4968665 3247184 5867312 4036960 5448587 1902583 ...
## $ Age : int 57 64 73 75 72 70 43 79 39 21 ...
## $ Experience : int 18 11 11 13 11 17 1 16 1 2 ...
## $ Married/Single : chr "single" "single" "married" "married" ...
## $ House_Ownership : chr "rented" "rented" "rented" "rented" ...
## $ Car_Ownership : chr "no" "yes" "yes" "no" ...
## $ Profession : chr "Statistician" "Mechanical_engineer" "Chemical_engineer" "Web_designer" ...
## $ CITY : chr "Phusro" "Adoni" "Uluberia" "Bhubaneswar" ...
## $ STATE : chr "Jharkhand" "Andhra_Pradesh" "West_Bengal" "Odisha" ...
## $ CURRENT_JOB_YRS : int 10 11 11 8 3 13 1 3 1 2 ...
## $ CURRENT_HOUSE_YRS: int 13 14 11 14 13 10 13 13 13 11 ...
## $ Risk_Flag : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
data$CITY <- as.data.frame(gsub("[[:punct:]]", "", as.matrix(data$CITY)))
data$STATE <- as.data.frame(gsub("[[:punct:]]", "", as.matrix(data$STATE)))
#REMOVE unwanted ID
data$Id <- NULL
#Change col names
colnames(data)[4] <- "Marital_Status"
str(data)
## Classes 'data.table' and 'data.frame': 20000 obs. of 12 variables:
## $ Income : int 5118418 9817339 4104566 2207354 4968665 3247184 5867312 4036960 5448587 1902583 ...
## $ Age : int 57 64 73 75 72 70 43 79 39 21 ...
## $ Experience : int 18 11 11 13 11 17 1 16 1 2 ...
## $ Marital_Status : chr "single" "single" "married" "married" ...
## $ House_Ownership : chr "rented" "rented" "rented" "rented" ...
## $ Car_Ownership : chr "no" "yes" "yes" "no" ...
## $ Profession : chr "Statistician" "Mechanical_engineer" "Chemical_engineer" "Web_designer" ...
## $ CITY : chr "Phusro" "Adoni" "Uluberia" "Bhubaneswar" ...
## $ STATE : chr "Jharkhand" "AndhraPradesh" "WestBengal" "Odisha" ...
## $ CURRENT_JOB_YRS : int 10 11 11 8 3 13 1 3 1 2 ...
## $ CURRENT_HOUSE_YRS: int 13 14 11 14 13 10 13 13 13 11 ...
## $ Risk_Flag : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(data)
## Income Age Experience Marital_Status
## Min. : 10310 Min. :21.00 Min. : 0.0 Length:20000
## 1st Qu.:2480208 1st Qu.:35.00 1st Qu.: 5.0 Class :character
## Median :4959830 Median :50.00 Median :10.0 Mode :character
## Mean :4971475 Mean :49.92 Mean :10.1
## 3rd Qu.:7449317 3rd Qu.:65.00 3rd Qu.:15.0
## Max. :9999180 Max. :79.00 Max. :20.0
## House_Ownership Car_Ownership Profession CITY
## Length:20000 Length:20000 Length:20000 Length:20000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## STATE CURRENT_JOB_YRS CURRENT_HOUSE_YRS Risk_Flag
## Length:20000 Min. : 0.000 Min. :10.00 Min. :0.000
## Class :character 1st Qu.: 4.000 1st Qu.:11.00 1st Qu.:0.000
## Mode :character Median : 6.000 Median :12.00 Median :0.000
## Mean : 6.384 Mean :11.99 Mean :0.123
## 3rd Qu.: 9.000 3rd Qu.:13.00 3rd Qu.:0.000
## Max. :14.000 Max. :14.00 Max. :1.000
# Overview of the data - Type = 1
ExpData(data=data,type=1)
# Structure of the data - Type = 2
ExpData(data=data,type=2, fun = c("mean", "median", "var"))
print(paste0("Minimum salary: ",min(data$Income)))
## [1] "Minimum salary: 10310"
print(paste0("Maximum salary: ",max(data$Income)))
## [1] "Maximum salary: 9999180"
salary_distribution <- ggplot(data, aes(data$Income, fill=..count..))+
geom_histogram(binwidth=1000000) +
labs(x="Income", y="Number") +
ggtitle("Frequency Distribution of Income")
salary_distribution
age_distribution <- ggplot(data, aes(data$Age, fill=..count..))+
geom_histogram(binwidth = 10,) +
labs(x="Age", y="Number") +
ggtitle("Frequency Distribution of Age")+
scale_x_continuous(breaks = seq(0,250,25))
age_distribution
rented_vs_owned <- data[, .(count = .N), by = House_Ownership]
print(rented_vs_owned)
## House_Ownership count
## 1: rented 18402
## 2: owned 1030
## 3: norent_noown 568
ggplot(rented_vs_owned, aes (x="", y = count, fill = House_Ownership)) +
geom_col(position = 'stack', width = 1) +
geom_text(aes(label = paste0(round(count / sum(count) * 100), "%"), x = 1.3),
position = position_stack(vjust = 0.5)) +
scale_fill_brewer(palette="Blues")+
theme_classic() +
theme(plot.title = element_text(hjust=0.5),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(fill = "House Ownership",
x = NULL,
y = NULL,
title = "House Ownership") +
coord_polar("y")
car_ownership <- data[, .(count = .N), by = Car_Ownership]
print(car_ownership)
## Car_Ownership count
## 1: no 13888
## 2: yes 6112
ggplot(car_ownership, aes (x="", y = count, fill = Car_Ownership)) +
geom_col(position = 'stack', width = 1) +
geom_text(aes(label = paste0(round(count / sum(count) * 100), "%"), x = 1.3),
position = position_stack(vjust = 0.5)) +
scale_fill_brewer(palette="Blues")+
theme_classic() +
theme(plot.title = element_text(hjust=0.5),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(fill = "Car Ownership",
x = NULL,
y = NULL,
title = "Car Ownership") +
coord_polar("y")
marital_status <- data[, .(count = .N), by = `Marital_Status`]
print(marital_status)
## Marital_Status count
## 1: single 17935
## 2: married 2065
ggplot(marital_status, aes (x="", y = count, fill = `Marital_Status`)) +
geom_col(position = 'stack', width = 1) +
geom_text(aes(label = paste0(round(count / sum(count) * 100), "%"), x = 1.3),
position = position_stack(vjust = 0.5)) +
scale_fill_brewer(palette="Blues")+
theme_classic() +
theme(plot.title = element_text(hjust=0.5),
axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(fill = "Marital Status",
x = NULL,
y = NULL,
title = "Marital Status") +
coord_polar("y")
profession <- data[, .(count = .N), by = `Profession`][order(-rank(count))]
print(paste0("Disitinct Profession:", nrow(profession)))
## [1] "Disitinct Profession:51"
ggplot(data=head(profession,n=10), aes(x=reorder(Profession, count), y=count)) +
geom_bar(stat="identity")+
coord_flip()+
labs(title = 'Top 10 Profession') +
xlab("Profession")
#Cross tabulation with target variable
ExpCTable(data,Target="Risk_Flag",margin=1,clim=10,nlim=3,round=2,bin=NULL,per=F)
#Distributions of Numerical variables
plot1 <- ExpNumViz(data,target="Risk_Flag",type=1,nlim=5,fname=NULL,col=c("darkgreen","springgreen3","springgreen1"),Page=c(2,2),sample=5)
plot1[[1]]
#Distributions of categorical variables
plot2 <- ExpCatViz(data,target="Risk_Flag",fname=NULL,clim=15,col=c("slateblue4","slateblue1"),margin=2,Page = c(1,1),sample=5)
plot2[[1]]
# Supervised-Machine Learning Models
mse = function(x,y) { mean((x-y)^2)}
set.seed(123)
smp_size <- floor(0.70 * nrow(data))
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
x_data <- model.matrix( ~ -1 + Income + Age +
Experience + Marital_Status +
House_Ownership + Car_Ownership + CITY + STATE + Profession + CURRENT_JOB_YRS + CURRENT_HOUSE_YRS, data)
# outcome is median house value in millions
y_data <- data$Risk_Flag
x_train <- x_data[train_ind, ]
y_train <- y_data[train_ind]
x_test <- x_data[-train_ind, ]
y_test <- y_data[-train_ind]
x_train_df <- as.data.frame(x_train)
x_test_df <- as.data.frame(x_test)
# categorical train test split
x_rf_train <- data[train_ind, ]
x_rf_test <- data[-train_ind]
y_rf_train_sample <- x_rf_train$Risk_Flag
y_rf_test_sample <- x_rf_test$Risk_Flag
# Check test and train dataset
prop.table(table(y_train))
## y_train
## 0 1
## 0.8767857 0.1232143
prop.table(table(y_test))
## y_test
## 0 1
## 0.8775 0.1225
linear_model = lm(y_train ~. + 0 , data=x_train_df)
predictions_test_linear_df <- predict(linear_model, x_test_df)
## Warning in predict.lm(linear_model, x_test_df): prediction from a rank-deficient
## fit may be misleading
predictions_test_linear <- as.matrix(predictions_test_linear_df)
mse_test_lm1 <- mse(y_test, predictions_test_linear)
print(mse_test_lm1)
## [1] 0.1071842
ridge_model <- cv.glmnet(x_train, y=y_train, alpha = 0, nfolds = 5)
best_ridge_lambda <- ridge_model$lambda.min
best_model_ridge <- glmnet(x_train, y_train, alpha = 0, lambda = best_ridge_lambda)
#coef(best_model_ridge)
predictions_train_ridge <- predict(best_model_ridge, s = best_ridge_lambda, newx = x_train)
predictions_test_ridge <- predict(best_model_ridge, s = best_ridge_lambda, newx = x_test)
mse_test_ridge = (mse(predictions_test_ridge, y_test))
print(mse_test_ridge)
## [1] 0.1061019
lasso_model <- cv.glmnet(x_train, y=y_train, alpha = 1, nfolds = 5)
best_lasso_lambda <- lasso_model$lambda.min
best_model_lasso <- glmnet(x_train, y_train, alpha = 1, lambda = best_lasso_lambda)
#coef(best_model)
predictions_train_lasso <- predict(best_model_lasso, s = best_lasso_lambda, newx = x_train)
predictions_test_lasso <- predict(best_model_lasso, s = best_lasso_lambda, newx = x_test)
mse_test_lasso = (mse(predictions_test_lasso, y_test))
print(mse_test_lasso)
## [1] 0.1066492
log_model <- glm(y_train ~ ., data=x_train_df, family = "binomial")
predictions_test_log_df <- predict(log_model, x_test_df, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictions_test_log <- as.matrix(predictions_test_log_df)
mse_test_log <- mse(y_test, predictions_test_log_df)
print(mse_test_log)
## [1] 0.1072009
# machine learning - decision tree
tree_model <-rpart(Risk_Flag ~ ., data = x_rf_train, control=rpart.control(cp=0.003))
yhat4.tree <- predict(tree_model, x_rf_test)
mse_test_dtress <- mse(y_rf_test_sample, yhat4.tree)
print(mse_test_dtress)
## [1] 0.1074425
rpart.plot(tree_model, type = 1)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Run randomForest model
risk_rf <- randomForest(Risk_Flag ~ ., data = x_rf_train, importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
risk_rf_pred <- predict(risk_rf, x_rf_test)
# Calculate MSE for random forest
mse_test_forest <- mse(y_rf_test_sample, risk_rf_pred)
print(mse_test_forest)
## [1] 0.0783891
fit.btree <- gbm(y_train ~.,
data = x_train_df,
distribution = "gaussian",
n.trees = 200,
interaction.depth = 2,
shrinkage = 0.001)
yhat.btree <- predict(fit.btree, x_test_df, n.trees = 100)
predictions_test_xg <- as.matrix(yhat.btree)
mse_test_gbmtress <- mse(y_test , predictions_test_xg)
print(mse_test_gbmtress)
## [1] 0.1074451
# Create the data for the chart
MSE <- c(mse_test_forest, mse_test_ridge, mse_test_lasso, mse_test_lm1, mse_test_log, mse_test_dtress, mse_test_gbmtress)
Model <- c("Random Forest","Ridge Regression","Lasso Regression","Linear Regression","Logistic Regression","Decision Tree", "Generalized Boosted Regression")
mse.data <- data.frame(Model,MSE)
mse.data
importance(risk_rf)
## %IncMSE IncNodePurity
## Income 108.44322 217.96869
## Age 105.92672 167.42525
## Experience 89.14577 101.66772
## Marital_Status 37.53172 16.27763
## House_Ownership 30.02962 14.43837
## Car_Ownership 34.07454 23.28329
## Profession 102.51189 159.43771
## CITY 88.12355 187.92901
## STATE 90.84689 114.35082
## CURRENT_JOB_YRS 76.28996 94.56582
## CURRENT_HOUSE_YRS 76.20859 70.93637
varImpPlot(risk_rf)
## Higher the value of mean decrease accuracy or mean decrease gini score , higher the importance of the variable in the model. In the plot shown above, Income is most important variable.