Executive Summary

Our team analyzed loan prediction data based on customer behavior to determine which features help predict likely defaulters (customers who cannot repay a loan). We first performed EDA on the full dataset to understand trends in customer behavior and in specific features. Because the original dataset is large (252,000 obs.), we resampled it by drawing a stratified sample of 20,000 observations that preserves the original Risk_Flag ratio, and then split that sample into train and test sets using a 70/30 split.

We then trained several machine learning models on the data and compared them to find the one that provides the most accurate predictions using the most important variables.

Problem Definition

Build machine learning models that predict likely defaulters for the consumer loans product, based on historical customer behavior and background information such as income, age, professional experience, profession, and marital status.

Data Source

Dataset from Kaggle, provided by Univ.AI for a hackathon. Source: https://www.kaggle.com/subhamjain/loan-prediction-based-on-customer-behavior?select=Training+Data.csv

Setup

Installing Packages

Loading Packages to Library

library(data.table)
library(ggplot2)
library(ggthemes)
library(scales)
library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
library(gbm)
## Loaded gbm 2.1.8.1
library(caTools)
library(ROCR)
library(rpart)
library(rpart.plot)
library(SmartEDA)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
theme_set(theme_bw())

Loading dataset into the environment

data <- fread("./Training-Data.csv")
head(data,n=10)
str(data)
## Classes 'data.table' and 'data.frame':   252000 obs. of  13 variables:
##  $ Id               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Income           : int  1303834 7574516 3991815 6256451 5768871 6915937 3954973 1706172 7566849 8964846 ...
##  $ Age              : int  23 40 66 41 47 64 58 33 24 23 ...
##  $ Experience       : int  3 10 4 2 11 0 14 2 17 12 ...
##  $ Married/Single   : chr  "single" "single" "married" "single" ...
##  $ House_Ownership  : chr  "rented" "rented" "rented" "rented" ...
##  $ Car_Ownership    : chr  "no" "no" "no" "yes" ...
##  $ Profession       : chr  "Mechanical_engineer" "Software_Developer" "Technical_writer" "Software_Developer" ...
##  $ CITY             : chr  "Rewa" "Parbhani" "Alappuzha" "Bhubaneswar" ...
##  $ STATE            : chr  "Madhya_Pradesh" "Maharashtra" "Kerala" "Odisha" ...
##  $ CURRENT_JOB_YRS  : int  3 9 4 2 3 0 8 2 11 5 ...
##  $ CURRENT_HOUSE_YRS: int  13 13 10 12 14 12 12 14 11 13 ...
##  $ Risk_Flag        : int  0 0 0 1 1 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Re-sampling

## Build a sample dataset that preserves the original Risk_Flag ratio
# Get Risk_Flag class counts from the original dataset
total <- length(data$Risk_Flag)
true <- length(data$Risk_Flag[data$Risk_Flag==1])
false <- length(data$Risk_Flag[data$Risk_Flag==0])

# Calculate dataset ratio
true_ratio <- true/total
false_ratio <- false/total
# Get sample dataset size for 0 and 1
true_smp_size <- floor(true_ratio * 20000)
false_smp_size <- floor(false_ratio * 20000)
# set seed
set.seed(123)
# Split dataset for Risk_Flag=1 and Risk_Flag=0
true_ind <- sample(seq_len(nrow(data[data$Risk_Flag==1])), size = true_smp_size)
false_ind <- sample(seq_len(nrow(data[data$Risk_Flag==0])), size = false_smp_size)
# Split the data frames
true_smp <- (data[data$Risk_Flag==1])[true_ind, ]
false_smp <- (data[data$Risk_Flag==0])[false_ind, ]
# Create sample dataset size
sample1 <- rbind(false_smp, true_smp)
# Randomize sample's order
sample1 <- sample1[sample(nrow(sample1)),]

data <- sample1
str(data)
## Classes 'data.table' and 'data.frame':   20000 obs. of  13 variables:
##  $ Id               : int  128148 251932 2974 113286 6353 74710 6475 161601 156002 119339 ...
##  $ Income           : int  5118418 9817339 4104566 2207354 4968665 3247184 5867312 4036960 5448587 1902583 ...
##  $ Age              : int  57 64 73 75 72 70 43 79 39 21 ...
##  $ Experience       : int  18 11 11 13 11 17 1 16 1 2 ...
##  $ Married/Single   : chr  "single" "single" "married" "married" ...
##  $ House_Ownership  : chr  "rented" "rented" "rented" "rented" ...
##  $ Car_Ownership    : chr  "no" "yes" "yes" "no" ...
##  $ Profession       : chr  "Statistician" "Mechanical_engineer" "Chemical_engineer" "Web_designer" ...
##  $ CITY             : chr  "Phusro" "Adoni" "Uluberia" "Bhubaneswar" ...
##  $ STATE            : chr  "Jharkhand" "Andhra_Pradesh" "West_Bengal" "Odisha" ...
##  $ CURRENT_JOB_YRS  : int  10 11 11 8 3 13 1 3 1 2 ...
##  $ CURRENT_HOUSE_YRS: int  13 14 11 14 13 10 13 13 13 11 ...
##  $ Risk_Flag        : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
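
As a quick sanity check (a small snippet added here for illustration), the stratified sample should preserve the original Risk_Flag ratio computed above:

# Compare the sample's class balance with the original ratios
prop.table(table(data$Risk_Flag))
c(true_ratio = true_ratio, false_ratio = false_ratio)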

Data Cleaning

# Strip punctuation (including underscores) from CITY and STATE values
data$CITY <- gsub("[[:punct:]]", "", data$CITY)
data$STATE <- gsub("[[:punct:]]", "", data$STATE)

# Remove the unneeded Id column
data$Id <- NULL

# Rename the Married/Single column to Marital_Status
colnames(data)[4] <- "Marital_Status"

str(data)
## Classes 'data.table' and 'data.frame':   20000 obs. of  12 variables:
##  $ Income           : int  5118418 9817339 4104566 2207354 4968665 3247184 5867312 4036960 5448587 1902583 ...
##  $ Age              : int  57 64 73 75 72 70 43 79 39 21 ...
##  $ Experience       : int  18 11 11 13 11 17 1 16 1 2 ...
##  $ Marital_Status   : chr  "single" "single" "married" "married" ...
##  $ House_Ownership  : chr  "rented" "rented" "rented" "rented" ...
##  $ Car_Ownership    : chr  "no" "yes" "yes" "no" ...
##  $ Profession       : chr  "Statistician" "Mechanical_engineer" "Chemical_engineer" "Web_designer" ...
##  $ CITY             : chr  "Phusro" "Adoni" "Uluberia" "Bhubaneswar" ...
##  $ STATE            : chr  "Jharkhand" "AndhraPradesh" "WestBengal" "Odisha" ...
##  $ CURRENT_JOB_YRS  : int  10 11 11 8 3 13 1 3 1 2 ...
##  $ CURRENT_HOUSE_YRS: int  13 14 11 14 13 10 13 13 13 11 ...
##  $ Risk_Flag        : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
summary(data)
##      Income             Age          Experience   Marital_Status    
##  Min.   :  10310   Min.   :21.00   Min.   : 0.0   Length:20000      
##  1st Qu.:2480208   1st Qu.:35.00   1st Qu.: 5.0   Class :character  
##  Median :4959830   Median :50.00   Median :10.0   Mode  :character  
##  Mean   :4971475   Mean   :49.92   Mean   :10.1                     
##  3rd Qu.:7449317   3rd Qu.:65.00   3rd Qu.:15.0                     
##  Max.   :9999180   Max.   :79.00   Max.   :20.0                     
##  House_Ownership    Car_Ownership       Profession            CITY          
##  Length:20000       Length:20000       Length:20000       Length:20000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     STATE           CURRENT_JOB_YRS  CURRENT_HOUSE_YRS   Risk_Flag    
##  Length:20000       Min.   : 0.000   Min.   :10.00     Min.   :0.000  
##  Class :character   1st Qu.: 4.000   1st Qu.:11.00     1st Qu.:0.000  
##  Mode  :character   Median : 6.000   Median :12.00     Median :0.000  
##                     Mean   : 6.384   Mean   :11.99     Mean   :0.123  
##                     3rd Qu.: 9.000   3rd Qu.:13.00     3rd Qu.:0.000  
##                     Max.   :14.000   Max.   :14.00     Max.   :1.000

Exploratory Data Analysis

General Analysis - Overview

# Overview of the data - Type = 1
ExpData(data=data,type=1)
# Structure of the data - Type = 2
ExpData(data=data,type=2, fun = c("mean", "median", "var"))

EDA: Income

print(paste0("Minimum salary: ",min(data$Income)))
## [1] "Minimum salary: 10310"
print(paste0("Maximum salary: ",max(data$Income)))
## [1] "Maximum salary: 9999180"
salary_distribution <- ggplot(data, aes(Income, fill = ..count..)) +
  geom_histogram(binwidth = 1000000) +
  labs(x = "Income", y = "Count") +
  ggtitle("Frequency Distribution of Income")
  
salary_distribution

EDA: Age

age_distribution <- ggplot(data, aes(Age, fill = ..count..)) +
  geom_histogram(binwidth = 10) +
  labs(x = "Age", y = "Count") +
  ggtitle("Frequency Distribution of Age") +
  scale_x_continuous(breaks = seq(20, 80, 10))
  
age_distribution

EDA: House Ownership

rented_vs_owned <- data[, .(count = .N), by = House_Ownership]
print(rented_vs_owned)
##    House_Ownership count
## 1:          rented 18402
## 2:           owned  1030
## 3:    norent_noown   568
ggplot(rented_vs_owned, aes (x="", y = count, fill = House_Ownership)) + 
  geom_col(position = 'stack', width = 1) +
  geom_text(aes(label = paste0(round(count / sum(count) * 100), "%"), x = 1.3),
            position = position_stack(vjust = 0.5)) +
  scale_fill_brewer(palette="Blues")+
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5),
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(fill = "House Ownership",
       x = NULL,
       y = NULL,
       title = "House Ownership") + 
  coord_polar("y")

EDA: Car Ownership

car_ownership <- data[, .(count = .N), by = Car_Ownership]
print(car_ownership)
##    Car_Ownership count
## 1:            no 13888
## 2:           yes  6112
ggplot(car_ownership, aes (x="", y = count, fill = Car_Ownership)) + 
  geom_col(position = 'stack', width = 1) +
  geom_text(aes(label = paste0(round(count / sum(count) * 100), "%"), x = 1.3),
            position = position_stack(vjust = 0.5)) +
  scale_fill_brewer(palette="Blues")+
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5),
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(fill = "Car Ownership",
       x = NULL,
       y = NULL,
       title = "Car Ownership") + 
  coord_polar("y")

EDA: Marital Status

marital_status <- data[, .(count = .N), by = `Marital_Status`]
print(marital_status)
##    Marital_Status count
## 1:         single 17935
## 2:        married  2065
ggplot(marital_status, aes (x="", y = count, fill = `Marital_Status`)) + 
  geom_col(position = 'stack', width = 1) +
  geom_text(aes(label = paste0(round(count / sum(count) * 100), "%"), x = 1.3),
            position = position_stack(vjust = 0.5)) +
  scale_fill_brewer(palette="Blues")+
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5),
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(fill = "Marital Status",
       x = NULL,
       y = NULL,
       title = "Marital Status") + 
  coord_polar("y")

EDA: Profession

profession <- data[, .(count = .N), by = `Profession`][order(-rank(count))]
print(paste0("Disitinct Profession:", nrow(profession)))
## [1] "Disitinct Profession:51"
ggplot(data=head(profession,n=10), aes(x=reorder(Profession, count), y=count)) +
  geom_bar(stat="identity")+
  coord_flip()+
  labs(title = 'Top 10 Professions') +
  xlab("Profession")

EDA: Target Variable: “Risk_Flag”

#Cross tabulation with target variable
ExpCTable(data,Target="Risk_Flag",margin=1,clim=10,nlim=3,round=2,bin=NULL,per=F)
#Distributions of Numerical variables
plot1 <- ExpNumViz(data,target="Risk_Flag",type=1,nlim=5,fname=NULL,col=c("darkgreen","springgreen3","springgreen1"),Page=c(2,2),sample=5)
plot1[[1]]

#Distributions of categorical variables
plot2 <- ExpCatViz(data,target="Risk_Flag",fname=NULL,clim=15,col=c("slateblue4","slateblue1"),margin=2,Page = c(1,1),sample=5)
plot2[[1]]

Supervised Machine Learning Models

Helper Method

# Helper: mean squared error between two numeric vectors
mse <- function(x, y) { mean((x - y)^2) }
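
A quick usage check of the helper (added for illustration): each prediction below misses by 0.2, so the MSE is 0.2^2 = 0.04.

# Example: mean of (0 - 0.2)^2 and (1 - 0.8)^2 is 0.04
mse(c(0, 1), c(0.2, 0.8))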

Train-Test Split

set.seed(123)
smp_size <- floor(0.70 * nrow(data))
train_ind <- sample(seq_len(nrow(data)), size = smp_size)

x_data <- model.matrix( ~ -1 + Income + Age +
 Experience + Marital_Status +
 House_Ownership + Car_Ownership + CITY + STATE + Profession + CURRENT_JOB_YRS + CURRENT_HOUSE_YRS, data)

# outcome is the binary Risk_Flag default indicator
y_data <- data$Risk_Flag

x_train <- x_data[train_ind, ]
y_train <- y_data[train_ind]
x_test <- x_data[-train_ind, ]
y_test <- y_data[-train_ind]

x_train_df <- as.data.frame(x_train)
x_test_df <- as.data.frame(x_test)


# categorical train test split
x_rf_train <- data[train_ind, ]
x_rf_test <- data[-train_ind, ]

y_rf_train_sample <- x_rf_train$Risk_Flag
y_rf_test_sample <- x_rf_test$Risk_Flag


# Check test and train dataset
prop.table(table(y_train))
## y_train
##         0         1 
## 0.8767857 0.1232143
prop.table(table(y_test))
## y_test
##      0      1 
## 0.8775 0.1225

Linear Regression

linear_model <- lm(y_train ~ . + 0, data = x_train_df)

predictions_test_linear_df <- predict(linear_model, x_test_df)
## Warning in predict.lm(linear_model, x_test_df): prediction from a rank-deficient
## fit may be misleading
predictions_test_linear <- as.matrix(predictions_test_linear_df)

mse_test_lm1 <- mse(y_test, predictions_test_linear)

print(mse_test_lm1)
## [1] 0.1071842
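
Since this is a linear probability model fit to a 0/1 outcome, the MSE alone is hard to read as a classification result. A minimal sketch (our addition, assuming a 0.5 cutoff) that turns the predictions into class labels and reports test accuracy:

# Threshold the linear predictions at 0.5 and compute test accuracy
pred_class_linear <- as.integer(predictions_test_linear > 0.5)
mean(pred_class_linear == y_test)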

Ridge Regression

ridge_model <- cv.glmnet(x_train, y=y_train, alpha = 0, nfolds = 5)
best_ridge_lambda <- ridge_model$lambda.min


best_model_ridge <- glmnet(x_train, y_train, alpha = 0, lambda = best_ridge_lambda)
#coef(best_model_ridge)

predictions_train_ridge <- predict(best_model_ridge, s = best_ridge_lambda, newx = x_train)
predictions_test_ridge <- predict(best_model_ridge, s = best_ridge_lambda, newx = x_test)

mse_test_ridge = (mse(predictions_test_ridge, y_test))

print(mse_test_ridge)
## [1] 0.1061019

Lasso Regression

lasso_model <- cv.glmnet(x_train, y=y_train, alpha = 1, nfolds = 5)
best_lasso_lambda <- lasso_model$lambda.min


best_model_lasso <- glmnet(x_train, y_train, alpha = 1, lambda = best_lasso_lambda)
#coef(best_model)

predictions_train_lasso <- predict(best_model_lasso, s = best_lasso_lambda, newx = x_train)
predictions_test_lasso <- predict(best_model_lasso, s = best_lasso_lambda, newx = x_test)

mse_test_lasso = (mse(predictions_test_lasso, y_test))

print(mse_test_lasso)
## [1] 0.1066492
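
Because the lasso can shrink coefficients exactly to zero, it can be informative to see how many of the dummy-expanded predictors it drops, and how the cross-validated error varies with lambda. A small illustrative check (not part of the original analysis):

# Count coefficients the lasso set exactly to zero
sum(coef(best_model_lasso) == 0)
# Cross-validation curve of MSE against log(lambda)
plot(lasso_model)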

Logistic Regression

log_model <- glm(y_train ~ ., data=x_train_df, family = "binomial")

predictions_test_log_df <- predict(log_model, x_test_df, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictions_test_log <- as.matrix(predictions_test_log_df)

mse_test_log <- mse(y_test, predictions_test_log_df)

print(mse_test_log)
## [1] 0.1072009
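
Because MSE is hard to interpret for a classifier, the AUC may be a more natural metric here; the ROCR package loaded above supports this. A minimal sketch (our addition, not in the original analysis):

# AUC of the logistic model's predicted probabilities on the test set
pred_obj <- prediction(predictions_test_log_df, y_test)
performance(pred_obj, measure = "auc")@y.values[[1]]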

Decision Tree

# machine learning - decision tree

tree_model <- rpart(Risk_Flag ~ ., data = x_rf_train, control = rpart.control(cp = 0.003))
yhat4.tree <- predict(tree_model, x_rf_test)

mse_test_dtress <- mse(y_rf_test_sample, yhat4.tree)
print(mse_test_dtress)
## [1] 0.1074425
rpart.plot(tree_model, type = 1)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
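
The warning indicates the tree has too many nodes to label legibly. One option (our suggestion, with an untuned cp value) is to prune the tree back to a larger complexity parameter before plotting:

# Prune to a larger cp so the plotted tree stays readable
pruned_tree <- prune(tree_model, cp = 0.01)
rpart.plot(pruned_tree, type = 1)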

Random Forest

# Run randomForest model
risk_rf <- randomForest(Risk_Flag ~ ., data = x_rf_train, importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
risk_rf_pred <- predict(risk_rf, x_rf_test)

# Calculate MSE for random forest
mse_test_forest <- mse(y_rf_test_sample, risk_rf_pred)
print(mse_test_forest)
## [1] 0.0783891
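
The warning above arises because Risk_Flag is numeric, so randomForest fits a regression forest. If a classification forest is preferred, converting the response to a factor switches modes; a hedged sketch (not run in the original analysis):

# Fit a classification forest by passing a factor response
risk_rf_cls <- randomForest(as.factor(Risk_Flag) ~ ., data = x_rf_train, importance = TRUE)
risk_rf_cls_pred <- predict(risk_rf_cls, x_rf_test)  # predicted classes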

Generalized Boosted Regression

fit.btree <- gbm(y_train ~ .,
                 data = x_train_df,
                 distribution = "gaussian",
                 n.trees = 200,
                 interaction.depth = 2,
                 shrinkage = 0.001)

# Note: prediction here uses only the first 100 of the 200 fitted trees
yhat.btree <- predict(fit.btree, x_test_df, n.trees = 100)
predictions_test_xg <- as.matrix(yhat.btree)

mse_test_gbmtress <- mse(y_test , predictions_test_xg)
print(mse_test_gbmtress)
## [1] 0.1074451
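
Predicting with all 200 fitted trees (rather than the first 100) is a quick check on whether the ensemble was truncated too early; with shrinkage as small as 0.001, many more trees are typically needed. An illustrative snippet (our addition):

# Use all 200 fitted trees for prediction
yhat.btree.full <- predict(fit.btree, x_test_df, n.trees = 200)
mse(y_test, yhat.btree.full)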

Conclusion

Compare All Model MSEs

# Create the data for the chart
MSE <- c(mse_test_forest, mse_test_ridge, mse_test_lasso, mse_test_lm1, mse_test_log, mse_test_dtress, mse_test_gbmtress)
Model <- c("Random Forest","Ridge Regression","Lasso Regression","Linear Regression","Logistic Regression","Decision Tree", "Generalized Boosted Regression")

mse.data <- data.frame(Model,MSE)
mse.data
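
The comment above mentions a chart; a minimal ggplot sketch of the comparison (our addition):

# Bar chart comparing test MSE across models
ggplot(mse.data, aes(x = reorder(Model, MSE), y = MSE)) +
  geom_col() +
  coord_flip() +
  labs(x = "Model", y = "Test MSE", title = "Test MSE by Model")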

Get Important Variables

importance(risk_rf)
##                     %IncMSE IncNodePurity
## Income            108.44322     217.96869
## Age               105.92672     167.42525
## Experience         89.14577     101.66772
## Marital_Status     37.53172      16.27763
## House_Ownership    30.02962      14.43837
## Car_Ownership      34.07454      23.28329
## Profession        102.51189     159.43771
## CITY               88.12355     187.92901
## STATE              90.84689     114.35082
## CURRENT_JOB_YRS    76.28996      94.56582
## CURRENT_HOUSE_YRS  76.20859      70.93637
varImpPlot(risk_rf)

The higher a variable's %IncMSE (mean decrease in accuracy) or IncNodePurity score, the more important that variable is to the model. In the plot shown above, Income is the most important variable.