First, let's import all the dependencies.
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import re
import datetime
import sklearn as sk
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.ensemble import GradientBoostingClassifier
pd.options.display.max_columns=200
matplotlib.rcParams['figure.figsize'] = (15.0, 12.0)
summaryfile_type = {'Unnamed':int,
'Unnamed2':int,
'LOAN_ID':object,
'ORIG_CHN':object,
'Seller.Name':object,
'ORIG_RT':float,
'ORIG_AMT':float,
'ORIG_TRM':float,
'ORIG_DTE':object,
'FRST_DTE':object,
'OLTV':float,
'OCLTV':float,
'NUM_BO':float,
'DTI':float,
'CSCORE_B':float,
'FTHB_FLG':object,
'PURPOSE':object,
'PROP_TYP':object,
'NUM_UNIT':int,
'OCC_STAT':object,
'STATE':object,
'ZIP_3':object,
'MI_PCT':float,
'Product.Type':object,
'CSCORE_C':float,
'Monthly.Rpt.Prd':object,
'Servicer.Name':object,
'LAST_RT':float,
'LAST_UPB':float,
'Loan.Age':float,
'Months.To.Legal.Mat':float,
'Adj.Month.To.Mat':float,
'Maturity.Date':object,
'MSA':int,
'Delq.Status':object,
'MOD_FLAG':object,
'Zero.Bal.Code':float,
'ZB_DTE':object,
'LPI_DTE':object,
'FCC_DTE':object,
'DISP_DT':object
}
fannie1 = pd.read_csv('./processed/total/total_2012.csv', dtype=summaryfile_type, nrows=9000000)
First, we replace the '.' in column names with underscores, simply because we are working in Python. Then we drop rows that have NA in any of the OLTV, OCLTV, DTI or CSCORE_B columns. Also, all loans with "current" status should be considered only temporarily OK, since their final outcome is unknown. For that reason, our machine-learning subset contains only loans whose status is "fully paid" or "default".
fannie1.drop(fannie1.columns[:1], axis=1, inplace=True)
fannie1.rename(columns=lambda x: re.sub('[.]','_',x), inplace=True)
fannie1.head()
fannie1_filtered = fannie1.dropna(subset=['OLTV', 'OCLTV', 'DTI', 'CSCORE_B'])
fannie1_known = fannie1_filtered[fannie1_filtered['Zero_Bal_Code'] > 0]
As we already saw in the exploratory data analysis, loan performance should be considered within each geographical region. From a proof-of-concept perspective, we use the state to group our dataset. The continuous variables (e.g. loan amount, loan-to-value ratio and debt-to-income ratio) are normalized using their mean and standard deviation within each state.
In the future, a more refined geo-location grouping will be used, since the dataset does contain the first three digits of the zip code.
All the classes with Extract... names are scikit-learn transformers that extract features from each record. Those features are then combined via scikit-learn's FeatureUnion. Keep in mind that for the feature union to work, the return value of each transformer must be "column-like": the reshape(-1, 1) call at the end of each transformer's transform method is there for that reason (see the toy snippet below).
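To make the shape requirement concrete, here is a toy snippet (not part of the pipeline) showing the difference between a 1-D array and the column-shaped array a FeatureUnion expects:
# Toy illustration only: FeatureUnion horizontally stacks its transformers'
# outputs, so each transformer must return an array of shape (n_samples, 1).
a = np.array([1, 2, 3])          # shape (3,): 1-D, not column-like
b = a.reshape(-1, 1)             # shape (3, 1): column-like
print(a.shape, b.shape)          # (3,) (3, 1)
print(np.hstack([b, b]).shape)   # (3, 2): two single-column features side by side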
state_mean = fannie1_known.groupby('STATE')[['ORIG_AMT', 'OCLTV', 'DTI']].mean()
state_std = fannie1_known.groupby('STATE')[['ORIG_AMT', 'OCLTV', 'DTI']].std()
class ExtractOrigYear(sk.base.BaseEstimator, sk.base.TransformerMixin):
    '''
    Extract the origination year from the ORIG_DTE column (formatted as MM/YYYY).
    '''
    def __init__(self):
        self.int = np.vectorize(int)
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        # Keep only the year part and return it as a column vector.
        return self.int(x['ORIG_DTE'].apply(lambda d: d.split('/')[-1])).reshape(-1, 1)
class extractfeatures(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self, column='OCLTV'):
        '''
        Select a single column by name and return it as a column vector.
        '''
        self.column = column
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x.loc[:, self.column].values.reshape(-1, 1)
class ExtractLoanStatus(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self):
        '''
        Initialize the class for a binary split of the loan status: Healthy or Default.
        '''
        pass
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        '''
        Transform the zero-balance code into a binary status: Healthy (0) or Default (1).
        A Zero_Bal_Code of 1 corresponds to a fully paid (prepaid or matured) loan;
        all other non-zero codes are treated as defaults.
        '''
        return x['Zero_Bal_Code'].apply(lambda code: 0 if code <= 1 else 1)
class ExtractCreditScore(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self, is_take_minimum=True):
        # Store the constructor argument under its own name so the estimator clones cleanly.
        self.is_take_minimum = is_take_minimum
        self.where = np.where
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        # Use the lower of the two borrower credit scores; when CSCORE_C is missing,
        # the comparison is False and CSCORE_B is kept.
        result = self.where(x['CSCORE_B'] - x['CSCORE_C'] > 0, x['CSCORE_C'], x['CSCORE_B'])
        return result.reshape(-1, 1)
class ExtractCategory(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self, colname):
        # Encode a categorical column as integer labels.
        self.colname = colname
        self.transformer = LabelEncoder()
    def fit(self, x, y=None):
        self.transformer.fit(x[self.colname])
        return self
    def transform(self, x):
        return self.transformer.transform(x[self.colname]).reshape(-1, 1)
class ExtractNormalized(sk.base.BaseEstimator, sk.base.TransformerMixin):
    def __init__(self, groupby, target, total_mean=state_mean, total_std=state_std):
        self.groupby = groupby
        self.target = target
        self.total_mean = total_mean
        self.total_std = total_std
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        # Map each row's group (e.g. STATE) to that group's mean and standard deviation,
        # so the z-scored output stays aligned with the original row order.
        group_mean = x[self.groupby].map(self.total_mean[self.target]).values
        group_std = x[self.groupby].map(self.total_std[self.target]).values
        return ((x[self.target].values - group_mean) / group_std).reshape(-1, 1)
We want to start with a very simple linear model to see how well we can do. Note that we are dealing with a heavily imbalanced classification problem. We therefore apply StratifiedShuffleSplit to ensure that the training and test sets contain the same ratio of paid to default cases. Also, the class_weight option is set to 'balanced' to tell the model to re-weight the paid and default classes.
There are other approaches to imbalanced classification, such as undersampling, oversampling and SMOTE (see the illustrative sketch below). We will use these methods in the next version of SmartUnderwriter.
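As a preview, here is a minimal sketch of random undersampling (illustrative only and not used in this version; the variable names are just for this example):
# Illustrative sketch only: balance the classes by randomly undersampling
# the majority (paid) class down to the size of the default class.
labels = ExtractLoanStatus().fit_transform(fannie1_known)
default_idx = labels[labels == 1].index
paid_idx = labels[labels == 0].sample(n=len(default_idx), random_state=0).index
fannie1_balanced = fannie1_known.loc[default_idx.union(paid_idx)]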
features = FeatureUnion([
('Loan_Amount', ExtractNormalized('STATE','ORIG_AMT')),
('credit score', ExtractCreditScore()),
('Loan_to_Value', ExtractNormalized('STATE','OCLTV')),
('Debt_to_income', ExtractNormalized('STATE','DTI')),
('Loan_purpose', ExtractCategory('PURPOSE')),
('Property_Type', ExtractCategory('PROP_TYP')),
('Occupancy_Status', ExtractCategory('OCC_STAT'))
])
status_all = ExtractLoanStatus().fit_transform(fannie1_known)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15)
for train_index, test_index in sss.split(fannie1_known, status_all):
    fannie_train = fannie1_known.iloc[train_index]
    fannie_test = fannie1_known.iloc[test_index]
    status_train = status_all.iloc[train_index]
    status_test = status_all.iloc[test_index]
One of the most powerful features of the scikit-learn library is its grid search functionality. Here, we use a cross-validated logistic regression (LogisticRegressionCV) to search for the regularization parameter that gives the highest AUC score.
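For reference, here is a minimal sketch of the same search written with the generic GridSearchCV (not run here; the pipeline below uses LogisticRegressionCV, which bundles the search into a single estimator):
# Sketch only: an explicit grid search over the regularization strength C,
# scored by ROC AUC, using the same feature union defined above.
logit_grid = GridSearchCV(
    Pipeline([('features', features),
              ('Logistic', LogisticRegression(class_weight='balanced'))]),
    param_grid={'Logistic__C': np.logspace(-4, 4, 10)},
    scoring='roc_auc',
    cv=3)
# logit_grid.fit(fannie_train, status_train); print(logit_grid.best_params_)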
model2 = Pipeline([
('features', features),
('Logistic', LogisticRegressionCV(Cs = 10, scoring = 'roc_auc', class_weight='balanced'))
])
model2.fit(fannie_train, status_train)
print(model2.named_steps['Logistic'].C_)
print(model2.named_steps['Logistic'].coef_)
Feature coefficients for the best logistic model we have:
Loan Amount | Credit Score | Loan-to-Value ratio | Debt-to-Income ratio | Loan Purpose | Property Type | Occupancy Status |
---|---|---|---|---|---|---|
0.00161235 | -0.01686483 | -0.00233408 | 0.00108836 | -0.20423069 | -0.11163132 | -0.47136523 |
Remember that label 1 means default and label 0 means fully paid. Qualitatively, the model gives a reasonable picture of what makes a loan less risky: a lower loan amount, a higher credit score, and so on.
status_pred2 = model2.predict(fannie_test)
print(classification_report(status_test, status_pred2))
print(pd.DataFrame(confusion_matrix(status_test, status_pred2), index=['Actual Healthy', 'Actual Default'],
columns = ['Pred. Healthy', 'Pred. Default']))
print('Area under the curve is',roc_auc_score(status_test, status_pred2))
Not surprisingly, this model gives an 11% precision on the default prediction, which means quite a few good loans have been categorized as potential defaults. In contrast, the recall (the fraction of true defaults that are caught, also known as statistical power) is 69%. You can trade precision against recall by adjusting the decision threshold of the binary classifier. This is the classic precision/recall trade-off, and the plots below give a better picture.
The AUC score of this model is 0.71, which is not bad for a first attempt with a simple linear model.
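For example, one quick (illustrative) way to explore the trade-off is to threshold the decision function directly; the threshold values here are arbitrary:
# Illustrative only: predict() uses a threshold of 0 on the decision function;
# raising it flags fewer loans as default (higher precision, lower recall).
scores = model2.decision_function(fannie_test)
for threshold in (-0.5, 0.0, 0.5):   # arbitrary example thresholds
    pred_at_t = (scores > threshold).astype(int)
    print('threshold =', threshold)
    print(classification_report(status_test, pred_at_t))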
prec, rec, thres1 = precision_recall_curve(status_test, model2.decision_function(fannie_test))
fpr, tpr, thres2 = roc_curve(status_test, model2.decision_function(fannie_test))
plt.plot(prec, rec)
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression Model')
plt.plot([0, 1], [0, 1], 'r--', label='random model')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='upper left')
The next model to try is the Support Vector Machine (SVM). However, SVMs are notoriously slow, especially on a dataset of this size. Luckily, a stochastic gradient descent (SGD) based algorithm can speed things up quite a bit, so the second model uses the SGD classifier with an SVM-like hinge loss function.
The SGD classifier is very sensitive to the scale of the data, so we add a StandardScaler transformer to normalize the whole feature matrix.
features = FeatureUnion([
('Loan_Amount', ExtractNormalized('STATE','ORIG_AMT')),
('credit score', ExtractCreditScore()),
('Loan_to_Value', ExtractNormalized('STATE','OCLTV')),
('Debt_to_income', ExtractNormalized('STATE','DTI')),
('Loan_purpose', ExtractCategory('PURPOSE')),
('Property_Type', ExtractCategory('PROP_TYP')),
('Occupancy_Status', ExtractCategory('OCC_STAT'))
])
model4 = Pipeline([
('features', features),
('scale', StandardScaler()),
('SGD', SGDClassifier(loss = 'hinge', class_weight='balanced'))
])
model4.fit(fannie_train, status_train)
status_pred4 = model4.predict(fannie_test)
roc_auc_score(status_test, model4.decision_function(fannie_test))
pred4 = model4.predict(fannie_test)
print(classification_report(status_test,pred4))
print(pd.DataFrame(confusion_matrix(status_test, pred4), index=['Actual Healthy', 'Actual Default'],
columns = ['Pred. Healthy', 'Pred. Default']))
model4.named_steps['SGD'].coef_
The SGD classifier does give a better AUC score (0.77), but the classification report shows that this model behaves similarly to the logistic regression.
Coefficients in the model:
Loan Amount | Credit Score | Loan-to-Value ratio | Debt-to-Income ratio | Loan Purpose | Property Type | Occupancy Status |
---|---|---|---|---|---|---|
0.04530123 | -0.86460842 | 0.01849037 | -0.04807039 | -0.08237096 | -0.10498967 | -0.14175012 |
It would be good to grid-search the SGDClassifier's hyperparameters, but due to memory limitations no grid search was conducted (a sketch of what one could look like is shown below). In the future, when everything moves to Spark, we will revisit the SGDClassifier.
This model is the one chosen as the final prediction model.
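For reference, a minimal sketch of such a search (illustrative only; the subsample fraction and parameter grid are arbitrary choices to keep memory usage down):
# Sketch only, not actually run here: grid-search the SGD step of model4
# on a random subsample of the training data.
sample = fannie_train.sample(frac=0.1, random_state=0)   # arbitrary 10% subsample
sample_status = status_train.loc[sample.index]
sgd_grid = GridSearchCV(
    model4,
    param_grid={'SGD__alpha': [1e-5, 1e-4, 1e-3],
                'SGD__penalty': ['l2', 'l1']},
    scoring='roc_auc',
    cv=3)
# sgd_grid.fit(sample, sample_status); print(sgd_grid.best_params_)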
Gradient boosting is known to be an effective way to build a low-bias/low-variance model, so we test it here. Initial results show performance similar to logistic regression and the SGD classifier, as measured by AUC score. Unfortunately, no grid search could be done on my 16 GB Digital Ocean box.
features = FeatureUnion([
('Loan_Amount', ExtractNormalized('STATE','ORIG_AMT')),
('credit score', ExtractCreditScore()),
('Loan_to_Value', ExtractNormalized('STATE','OCLTV')),
('Debt_to_income', ExtractNormalized('STATE','DTI')),
('Loan_purpose', ExtractCategory('PURPOSE')),
('Property_Type', ExtractCategory('PROP_TYP')),
('Occupancy_Status', ExtractCategory('OCC_STAT'))
])
model5 = Pipeline([
('features', features),
('GDB', GradientBoostingClassifier(n_estimators=200, learning_rate=0.5))
])
model5.fit(fannie_train, status_train)
status_pred5 = model5.predict(fannie_test)
roc_auc_score(status_test, model5.decision_function(fannie_test))