import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
df = pd.read_csv('churn.txt')#Reading the dataset in a dataframe using Pandas
df.describe()
<tr style="text-align: right;">
<th></th>
<th>Account Length</th>
<th>Area Code</th>
<th>VMail Message</th>
<th>Day Mins</th>
<th>Day Calls</th>
<th>Day Charge</th>
<th>Eve Mins</th>
<th>Eve Calls</th>
<th>Eve Charge</th>
<th>Night Mins</th>
<th>Night Calls</th>
<th>Night Charge</th>
<th>Intl Mins</th>
<th>Intl Calls</th>
<th>Intl Charge</th>
<th>CustServ Calls</th>
</tr>
<tr>
<th>count</th>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
<td>3333.000000</td>
</tr>
<tr>
<th>mean</th>
<td>101.064806</td>
<td>437.182418</td>
<td>8.099010</td>
<td>179.775098</td>
<td>100.435644</td>
<td>30.562307</td>
<td>200.980348</td>
<td>100.114311</td>
<td>17.083540</td>
<td>200.872037</td>
<td>100.107711</td>
<td>9.039325</td>
<td>10.237294</td>
<td>4.479448</td>
<td>2.764581</td>
<td>1.562856</td>
</tr>
<tr>
<th>std</th>
<td>39.822106</td>
<td>42.371290</td>
<td>13.688365</td>
<td>54.467389</td>
<td>20.069084</td>
<td>9.259435</td>
<td>50.713844</td>
<td>19.922625</td>
<td>4.310668</td>
<td>50.573847</td>
<td>19.568609</td>
<td>2.275873</td>
<td>2.791840</td>
<td>2.461214</td>
<td>0.753773</td>
<td>1.315491</td>
</tr>
<tr>
<th>min</th>
<td>1.000000</td>
<td>408.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>23.200000</td>
<td>33.000000</td>
<td>1.040000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>25%</th>
<td>74.000000</td>
<td>408.000000</td>
<td>0.000000</td>
<td>143.700000</td>
<td>87.000000</td>
<td>24.430000</td>
<td>166.600000</td>
<td>87.000000</td>
<td>14.160000</td>
<td>167.000000</td>
<td>87.000000</td>
<td>7.520000</td>
<td>8.500000</td>
<td>3.000000</td>
<td>2.300000</td>
<td>1.000000</td>
</tr>
<tr>
<th>50%</th>
<td>101.000000</td>
<td>415.000000</td>
<td>0.000000</td>
<td>179.400000</td>
<td>101.000000</td>
<td>30.500000</td>
<td>201.400000</td>
<td>100.000000</td>
<td>17.120000</td>
<td>201.200000</td>
<td>100.000000</td>
<td>9.050000</td>
<td>10.300000</td>
<td>4.000000</td>
<td>2.780000</td>
<td>1.000000</td>
</tr>
<tr>
<th>75%</th>
<td>127.000000</td>
<td>510.000000</td>
<td>20.000000</td>
<td>216.400000</td>
<td>114.000000</td>
<td>36.790000</td>
<td>235.300000</td>
<td>114.000000</td>
<td>20.000000</td>
<td>235.300000</td>
<td>113.000000</td>
<td>10.590000</td>
<td>12.100000</td>
<td>6.000000</td>
<td>3.270000</td>
<td>2.000000</td>
</tr>
<tr>
<th>max</th>
<td>243.000000</td>
<td>510.000000</td>
<td>51.000000</td>
<td>350.800000</td>
<td>165.000000</td>
<td>59.640000</td>
<td>363.700000</td>
<td>170.000000</td>
<td>30.910000</td>
<td>395.000000</td>
<td>175.000000</td>
<td>17.770000</td>
<td>20.000000</td>
<td>20.000000</td>
<td>5.400000</td>
<td>9.000000</td>
</tr>
# Count how many rows fall into each value of the 'Churn?' target column.
df['Churn?'].value_counts()
False. 2850
True. 483
Name: Churn?, dtype: int64
# Histogram of the 'Night Calls' column using 50 bins.
df['Night Calls'].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2b715278>
# Box plot of the 'Night Calls' column over all rows.
df.boxplot(column='Night Calls')
<matplotlib.axes._subplots.AxesSubplot at 0x1293d5240>
# Box plots of 'Night Calls', one box per distinct 'VMail Plan' value.
df.boxplot(column='Night Calls', by = 'VMail Plan')
<matplotlib.axes._subplots.AxesSubplot at 0x12942af28>
# Histogram of the 'Night Mins' column using 50 bins.
df['Night Mins'].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2b85aa58>
# Box plot of the 'Night Mins' column over all rows.
df.boxplot(column='Night Mins')
<matplotlib.axes._subplots.AxesSubplot at 0x1a2ba61668>
# Frequency of each churn class, rarest class first.
temp1 = df['Churn?'].value_counts(ascending=True)
# Mean night-time minutes for each churn class. 'Night Mins' is numeric, so
# mapping 'Y'/'N' labels onto it (as this cell used to do) turned every value
# into NaN and produced an empty pivot table.
temp2 = df.pivot_table(values='Night Mins', index=['Churn?'], aggfunc='mean')
print('Frequency Table for Churn?:')
print(temp1)
print('\nMean Night Mins for each Churn? class:')
print(temp2)
Frequency Table for Credit History:
True. 483
False. 2850
Name: Churn?, dtype: int64
Probility of getting loan for each Credit History class:
Empty DataFrame
Columns: []
Index: [False., True.]
import matplotlib.pyplot as plt

# Bar chart of the churn class frequencies held in temp1. The bars are row
# counts, so the y-axis and title are labelled accordingly.
fig = plt.figure(figsize=(8,4))
ax1 = fig.add_subplot(121)
ax1.set_xlabel('Churn?')
ax1.set_ylabel('Count of Customers')
ax1.set_title("Customers by Churn?")
# Draw on ax1 explicitly instead of relying on it being the current axes.
temp1.plot(kind='bar', ax=ax1)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2bad1c18>
# Cross-tabulate the churn outcome (rows) against voice-mail plan membership
# (columns), then draw the counts as a stacked bar chart.
temp3 = pd.crosstab(index=df['Churn?'], columns=df['VMail Plan'])
temp3.plot.bar(stacked=True, color=['red', 'blue'], grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2bd67470>
# Number of missing values in each column.
df.isnull().sum()
State 0
Account Length 0
Area Code 0
Phone 0
Int'l Plan 0
VMail Plan 0
VMail Message 0
Day Mins 0
Day Calls 0
Day Charge 0
Eve Mins 0
Eve Calls 0
Eve Charge 0
Night Mins 0
Night Calls 0
Night Charge 0
Intl Mins 0
Intl Calls 0
Intl Charge 0
CustServ Calls 0
Churn? 0
dtype: int64
# Show the first 10 rows of the dataset.
df.head(10)
<tr style="text-align: right;">
<th></th>
<th>State</th>
<th>Account Length</th>
<th>Area Code</th>
<th>Phone</th>
<th>Int'l Plan</th>
<th>VMail Plan</th>
<th>VMail Message</th>
<th>Day Mins</th>
<th>Day Calls</th>
<th>Day Charge</th>
<th>...</th>
<th>Eve Calls</th>
<th>Eve Charge</th>
<th>Night Mins</th>
<th>Night Calls</th>
<th>Night Charge</th>
<th>Intl Mins</th>
<th>Intl Calls</th>
<th>Intl Charge</th>
<th>CustServ Calls</th>
<th>Churn?</th>
</tr>
<tr>
<th>0</th>
<td>KS</td>
<td>128</td>
<td>415</td>
<td>382-4657</td>
<td>no</td>
<td>yes</td>
<td>25</td>
<td>265.1</td>
<td>110</td>
<td>45.07</td>
<td>...</td>
<td>99</td>
<td>16.78</td>
<td>244.7</td>
<td>91</td>
<td>11.01</td>
<td>10.0</td>
<td>3</td>
<td>2.70</td>
<td>1</td>
<td>False.</td>
</tr>
<tr>
<th>1</th>
<td>OH</td>
<td>107</td>
<td>415</td>
<td>371-7191</td>
<td>no</td>
<td>yes</td>
<td>26</td>
<td>161.6</td>
<td>123</td>
<td>27.47</td>
<td>...</td>
<td>103</td>
<td>16.62</td>
<td>254.4</td>
<td>103</td>
<td>11.45</td>
<td>13.7</td>
<td>3</td>
<td>3.70</td>
<td>1</td>
<td>False.</td>
</tr>
<tr>
<th>2</th>
<td>NJ</td>
<td>137</td>
<td>415</td>
<td>358-1921</td>
<td>no</td>
<td>no</td>
<td>0</td>
<td>243.4</td>
<td>114</td>
<td>41.38</td>
<td>...</td>
<td>110</td>
<td>10.30</td>
<td>162.6</td>
<td>104</td>
<td>7.32</td>
<td>12.2</td>
<td>5</td>
<td>3.29</td>
<td>0</td>
<td>False.</td>
</tr>
<tr>
<th>3</th>
<td>OH</td>
<td>84</td>
<td>408</td>
<td>375-9999</td>
<td>yes</td>
<td>no</td>
<td>0</td>
<td>299.4</td>
<td>71</td>
<td>50.90</td>
<td>...</td>
<td>88</td>
<td>5.26</td>
<td>196.9</td>
<td>89</td>
<td>8.86</td>
<td>6.6</td>
<td>7</td>
<td>1.78</td>
<td>2</td>
<td>False.</td>
</tr>
<tr>
<th>4</th>
<td>OK</td>
<td>75</td>
<td>415</td>
<td>330-6626</td>
<td>yes</td>
<td>no</td>
<td>0</td>
<td>166.7</td>
<td>113</td>
<td>28.34</td>
<td>...</td>
<td>122</td>
<td>12.61</td>
<td>186.9</td>
<td>121</td>
<td>8.41</td>
<td>10.1</td>
<td>3</td>
<td>2.73</td>
<td>3</td>
<td>False.</td>
</tr>
<tr>
<th>5</th>
<td>AL</td>
<td>118</td>
<td>510</td>
<td>391-8027</td>
<td>yes</td>
<td>no</td>
<td>0</td>
<td>223.4</td>
<td>98</td>
<td>37.98</td>
<td>...</td>
<td>101</td>
<td>18.75</td>
<td>203.9</td>
<td>118</td>
<td>9.18</td>
<td>6.3</td>
<td>6</td>
<td>1.70</td>
<td>0</td>
<td>False.</td>
</tr>
<tr>
<th>6</th>
<td>MA</td>
<td>121</td>
<td>510</td>
<td>355-9993</td>
<td>no</td>
<td>yes</td>
<td>24</td>
<td>218.2</td>
<td>88</td>
<td>37.09</td>
<td>...</td>
<td>108</td>
<td>29.62</td>
<td>212.6</td>
<td>118</td>
<td>9.57</td>
<td>7.5</td>
<td>7</td>
<td>2.03</td>
<td>3</td>
<td>False.</td>
</tr>
<tr>
<th>7</th>
<td>MO</td>
<td>147</td>
<td>415</td>
<td>329-9001</td>
<td>yes</td>
<td>no</td>
<td>0</td>
<td>157.0</td>
<td>79</td>
<td>26.69</td>
<td>...</td>
<td>94</td>
<td>8.76</td>
<td>211.8</td>
<td>96</td>
<td>9.53</td>
<td>7.1</td>
<td>6</td>
<td>1.92</td>
<td>0</td>
<td>False.</td>
</tr>
<tr>
<th>8</th>
<td>LA</td>
<td>117</td>
<td>408</td>
<td>335-4719</td>
<td>no</td>
<td>no</td>
<td>0</td>
<td>184.5</td>
<td>97</td>
<td>31.37</td>
<td>...</td>
<td>80</td>
<td>29.89</td>
<td>215.8</td>
<td>90</td>
<td>9.71</td>
<td>8.7</td>
<td>4</td>
<td>2.35</td>
<td>1</td>
<td>False.</td>
</tr>
<tr>
<th>9</th>
<td>WV</td>
<td>141</td>
<td>415</td>
<td>330-8173</td>
<td>yes</td>
<td>yes</td>
<td>37</td>
<td>258.6</td>
<td>84</td>
<td>43.96</td>
<td>...</td>
<td>111</td>
<td>18.87</td>
<td>326.4</td>
<td>97</td>
<td>14.69</td>
<td>11.2</td>
<td>5</td>
<td>3.02</td>
<td>0</td>
<td>False.</td>
</tr>
10 rows × 21 columns
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced in place by integer codes.
var_mod = [
    'State',
    "Int'l Plan",
    'VMail Plan',
    'Churn?',
]
# One encoder is re-fitted for every column, so after the loop `le` only
# holds the mapping of the last column processed.
le = LabelEncoder()
for i in var_mod:
    df[i] = le.fit_transform(df[i])
# Show the resulting column types.
df.dtypes
State int64
Account Length int64
Area Code int64
Phone object
Int'l Plan int64
VMail Plan int64
VMail Message int64
Day Mins float64
Day Calls int64
Day Charge float64
Eve Mins float64
Eve Calls int64
Eve Charge float64
Night Mins float64
Night Calls int64
Night Charge float64
Intl Mins float64
Intl Calls int64
Intl Charge float64
CustServ Calls int64
Churn? int64
dtype: object
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives
# in sklearn.model_selection.
from sklearn.model_selection import KFold #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics


#Generic function for making a classification model and assessing performance:
def classification_model(model, data, predictors, outcome):
    """Fit ``model`` and print its training accuracy and 5-fold CV score.

    Args:
        model: Estimator exposing ``fit(X, y)``, ``predict(X)`` and
            ``score(X, y)``.
        data: pandas DataFrame holding both the predictor and outcome columns.
        predictors: List of column names in ``data`` used as features.
        outcome: Name of the column in ``data`` used as the target.

    Returns:
        None. Two lines are printed ("Accuracy" on the full data and the mean
        "Cross-Validation Score"), and ``model`` is left fitted on all rows
        of ``data`` so it can be used by the caller afterwards.
    """
    features = data[predictors]
    target = data[outcome]

    # Fit on everything and measure accuracy on the same rows; this is the
    # training accuracy, not an estimate of out-of-sample performance.
    model.fit(features, target)
    predictions = model.predict(features)
    accuracy = metrics.accuracy_score(target, predictions)
    print ("Accuracy : %s" % "{0:.3%}".format(accuracy))

    # 5-fold cross-validation over consecutive, unshuffled folds.
    kf = KFold(n_splits=5)
    scores = []
    for train, test in kf.split(features):
        # Refit on the training rows of this fold only.
        model.fit(features.iloc[train, :], target.iloc[train])
        # Record the held-out score of this fold.
        scores.append(model.score(features.iloc[test, :], target.iloc[test]))
    print ("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(scores)))

    # The loop leaves the model fitted on the last fold's training rows, so
    # fit once more on all rows before handing it back to the caller.
    model.fit(features, target)
# Baseline: logistic regression using the daytime charge as the only predictor.
outcome_var = 'Churn?'
predictor_var = ['Day Charge']
model = LogisticRegression()
classification_model(model, df, predictor_var, outcome_var)
Accuracy : 85.509%
Cross-Validation Score : 85.508%
# Try a different combination of variables; `model` and `outcome_var` are
# reused unchanged from the earlier cells.
predictor_var = [
    'Day Charge',
    'Intl Charge',
    'Night Charge',
]
classification_model(model, df, predictor_var, outcome_var)
Accuracy : 85.569%
Cross-Validation Score : 85.568%
# Same three charge predictors, this time with a decision tree.
predictor_var = [
    'Day Charge',
    'Intl Charge',
    'Night Charge',
]
model = DecisionTreeClassifier()
classification_model(model, df, predictor_var, outcome_var)
Accuracy : 100.000%
Cross-Validation Score : 77.858%
# Same three charge predictors, this time with a 100-tree random forest.
predictor_var = [
    'Day Charge',
    'Intl Charge',
    'Night Charge',
]
model = RandomForestClassifier(n_estimators=100)
classification_model(model, df, predictor_var, outcome_var)
Accuracy : 100.000%
Cross-Validation Score : 85.868%
# Pair each predictor name with the fitted model's importance score and list
# them from most to least important.
featimp = (
    pd.Series(model.feature_importances_, index=predictor_var)
    .sort_values(ascending=False)
)
print(featimp)
Day Charge 0.455067
Night Charge 0.309567
Intl Charge 0.235366
dtype: float64
nice bsb