import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
df = pd.read_csv('churn.txt')#Reading the dataset in a dataframe using Pandas
df.describe()
<tr style="text-align: right;">
  <th></th>
  <th>Account Length</th>
  <th>Area Code</th>
  <th>VMail Message</th>
  <th>Day Mins</th>
  <th>Day Calls</th>
  <th>Day Charge</th>
  <th>Eve Mins</th>
  <th>Eve Calls</th>
  <th>Eve Charge</th>
  <th>Night Mins</th>
  <th>Night Calls</th>
  <th>Night Charge</th>
  <th>Intl Mins</th>
  <th>Intl Calls</th>
  <th>Intl Charge</th>
  <th>CustServ Calls</th>
</tr>
<tr>
  <th>count</th>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
  <td>3333.000000</td>
</tr>
<tr>
  <th>mean</th>
  <td>101.064806</td>
  <td>437.182418</td>
  <td>8.099010</td>
  <td>179.775098</td>
  <td>100.435644</td>
  <td>30.562307</td>
  <td>200.980348</td>
  <td>100.114311</td>
  <td>17.083540</td>
  <td>200.872037</td>
  <td>100.107711</td>
  <td>9.039325</td>
  <td>10.237294</td>
  <td>4.479448</td>
  <td>2.764581</td>
  <td>1.562856</td>
</tr>
<tr>
  <th>std</th>
  <td>39.822106</td>
  <td>42.371290</td>
  <td>13.688365</td>
  <td>54.467389</td>
  <td>20.069084</td>
  <td>9.259435</td>
  <td>50.713844</td>
  <td>19.922625</td>
  <td>4.310668</td>
  <td>50.573847</td>
  <td>19.568609</td>
  <td>2.275873</td>
  <td>2.791840</td>
  <td>2.461214</td>
  <td>0.753773</td>
  <td>1.315491</td>
</tr>
<tr>
  <th>min</th>
  <td>1.000000</td>
  <td>408.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>23.200000</td>
  <td>33.000000</td>
  <td>1.040000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
  <td>0.000000</td>
</tr>
<tr>
  <th>25%</th>
  <td>74.000000</td>
  <td>408.000000</td>
  <td>0.000000</td>
  <td>143.700000</td>
  <td>87.000000</td>
  <td>24.430000</td>
  <td>166.600000</td>
  <td>87.000000</td>
  <td>14.160000</td>
  <td>167.000000</td>
  <td>87.000000</td>
  <td>7.520000</td>
  <td>8.500000</td>
  <td>3.000000</td>
  <td>2.300000</td>
  <td>1.000000</td>
</tr>
<tr>
  <th>50%</th>
  <td>101.000000</td>
  <td>415.000000</td>
  <td>0.000000</td>
  <td>179.400000</td>
  <td>101.000000</td>
  <td>30.500000</td>
  <td>201.400000</td>
  <td>100.000000</td>
  <td>17.120000</td>
  <td>201.200000</td>
  <td>100.000000</td>
  <td>9.050000</td>
  <td>10.300000</td>
  <td>4.000000</td>
  <td>2.780000</td>
  <td>1.000000</td>
</tr>
<tr>
  <th>75%</th>
  <td>127.000000</td>
  <td>510.000000</td>
  <td>20.000000</td>
  <td>216.400000</td>
  <td>114.000000</td>
  <td>36.790000</td>
  <td>235.300000</td>
  <td>114.000000</td>
  <td>20.000000</td>
  <td>235.300000</td>
  <td>113.000000</td>
  <td>10.590000</td>
  <td>12.100000</td>
  <td>6.000000</td>
  <td>3.270000</td>
  <td>2.000000</td>
</tr>
<tr>
  <th>max</th>
  <td>243.000000</td>
  <td>510.000000</td>
  <td>51.000000</td>
  <td>350.800000</td>
  <td>165.000000</td>
  <td>59.640000</td>
  <td>363.700000</td>
  <td>170.000000</td>
  <td>30.910000</td>
  <td>395.000000</td>
  <td>175.000000</td>
  <td>17.770000</td>
  <td>20.000000</td>
  <td>20.000000</td>
  <td>5.400000</td>
  <td>9.000000</td>
</tr>
# Count how many rows fall into each 'Churn?' class.
df['Churn?'].value_counts()
False.    2850
True.      483
Name: Churn?, dtype: int64
# Histogram of 'Night Calls' split into 50 bins.
df['Night Calls'].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2b715278>
png

png

# Box plot of 'Night Calls' over all rows.
df.boxplot(column='Night Calls')
<matplotlib.axes._subplots.AxesSubplot at 0x1293d5240>
png

png

# Box plot of 'Night Calls', one box per 'VMail Plan' value.
df.boxplot(column='Night Calls', by = 'VMail Plan')
<matplotlib.axes._subplots.AxesSubplot at 0x12942af28>
png

png

# Histogram of 'Night Mins' split into 50 bins.
df['Night Mins'].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2b85aa58>
png

png

# Box plot of 'Night Mins' over all rows.
df.boxplot(column='Night Mins')
<matplotlib.axes._subplots.AxesSubplot at 0x1a2ba61668>
png

png

# Number of rows per 'Churn?' class, smallest class first.
temp1 = df['Churn?'].value_counts(ascending=True)
# 'Night Mins' is numeric, so average it directly per churn class; pushing it
# through a {'Y':1,'N':0} lookup turns every value into NaN and leaves the
# pivot table empty.
temp2 = df.pivot_table(values='Night Mins',index=['Churn?'],aggfunc='mean')
print ('Frequency Table for Churn?:')
print (temp1)

print ('\nMean Night Mins for each Churn? class:')
print (temp2)
Frequency Table for Credit History:
True.      483
False.    2850
Name: Churn?, dtype: int64

Probility of getting loan for each Credit History class:
Empty DataFrame
Columns: []
Index: [False., True.]
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,4))
ax1 = fig.add_subplot(121)
# temp1 holds row counts per 'Churn?' class, so label the axes accordingly.
ax1.set_xlabel('Churn?')
ax1.set_ylabel('Count of Customers')
ax1.set_title("Customers by Churn?")
# Draw on ax1 explicitly instead of relying on it being the current axes.
temp1.plot(kind='bar', ax=ax1)

<matplotlib.axes._subplots.AxesSubplot at 0x1a2bad1c18>
png

png

# Cross-tabulate 'Churn?' (rows) against 'VMail Plan' (columns) ...
temp3 = pd.crosstab(df['Churn?'], df['VMail Plan'])
# ... and draw the counts as a stacked bar chart, one bar per 'Churn?' value.
temp3.plot(kind='bar', stacked=True, color=['red','blue'], grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0x1a2bd67470>
png

png

# Number of missing values in each column (vectorised; no per-column Python loop).
df.isnull().sum()
State             0
Account Length    0
Area Code         0
Phone             0
Int'l Plan        0
VMail Plan        0
VMail Message     0
Day Mins          0
Day Calls         0
Day Charge        0
Eve Mins          0
Eve Calls         0
Eve Charge        0
Night Mins        0
Night Calls       0
Night Charge      0
Intl Mins         0
Intl Calls        0
Intl Charge       0
CustServ Calls    0
Churn?            0
dtype: int64
# Preview the first 10 rows.
df.head(10)
<tr style="text-align: right;">
  <th></th>
  <th>State</th>
  <th>Account Length</th>
  <th>Area Code</th>
  <th>Phone</th>
  <th>Int'l Plan</th>
  <th>VMail Plan</th>
  <th>VMail Message</th>
  <th>Day Mins</th>
  <th>Day Calls</th>
  <th>Day Charge</th>
  <th>...</th>
  <th>Eve Calls</th>
  <th>Eve Charge</th>
  <th>Night Mins</th>
  <th>Night Calls</th>
  <th>Night Charge</th>
  <th>Intl Mins</th>
  <th>Intl Calls</th>
  <th>Intl Charge</th>
  <th>CustServ Calls</th>
  <th>Churn?</th>
</tr>
<tr>
  <th>0</th>
  <td>KS</td>
  <td>128</td>
  <td>415</td>
  <td>382-4657</td>
  <td>no</td>
  <td>yes</td>
  <td>25</td>
  <td>265.1</td>
  <td>110</td>
  <td>45.07</td>
  <td>...</td>
  <td>99</td>
  <td>16.78</td>
  <td>244.7</td>
  <td>91</td>
  <td>11.01</td>
  <td>10.0</td>
  <td>3</td>
  <td>2.70</td>
  <td>1</td>
  <td>False.</td>
</tr>
<tr>
  <th>1</th>
  <td>OH</td>
  <td>107</td>
  <td>415</td>
  <td>371-7191</td>
  <td>no</td>
  <td>yes</td>
  <td>26</td>
  <td>161.6</td>
  <td>123</td>
  <td>27.47</td>
  <td>...</td>
  <td>103</td>
  <td>16.62</td>
  <td>254.4</td>
  <td>103</td>
  <td>11.45</td>
  <td>13.7</td>
  <td>3</td>
  <td>3.70</td>
  <td>1</td>
  <td>False.</td>
</tr>
<tr>
  <th>2</th>
  <td>NJ</td>
  <td>137</td>
  <td>415</td>
  <td>358-1921</td>
  <td>no</td>
  <td>no</td>
  <td>0</td>
  <td>243.4</td>
  <td>114</td>
  <td>41.38</td>
  <td>...</td>
  <td>110</td>
  <td>10.30</td>
  <td>162.6</td>
  <td>104</td>
  <td>7.32</td>
  <td>12.2</td>
  <td>5</td>
  <td>3.29</td>
  <td>0</td>
  <td>False.</td>
</tr>
<tr>
  <th>3</th>
  <td>OH</td>
  <td>84</td>
  <td>408</td>
  <td>375-9999</td>
  <td>yes</td>
  <td>no</td>
  <td>0</td>
  <td>299.4</td>
  <td>71</td>
  <td>50.90</td>
  <td>...</td>
  <td>88</td>
  <td>5.26</td>
  <td>196.9</td>
  <td>89</td>
  <td>8.86</td>
  <td>6.6</td>
  <td>7</td>
  <td>1.78</td>
  <td>2</td>
  <td>False.</td>
</tr>
<tr>
  <th>4</th>
  <td>OK</td>
  <td>75</td>
  <td>415</td>
  <td>330-6626</td>
  <td>yes</td>
  <td>no</td>
  <td>0</td>
  <td>166.7</td>
  <td>113</td>
  <td>28.34</td>
  <td>...</td>
  <td>122</td>
  <td>12.61</td>
  <td>186.9</td>
  <td>121</td>
  <td>8.41</td>
  <td>10.1</td>
  <td>3</td>
  <td>2.73</td>
  <td>3</td>
  <td>False.</td>
</tr>
<tr>
  <th>5</th>
  <td>AL</td>
  <td>118</td>
  <td>510</td>
  <td>391-8027</td>
  <td>yes</td>
  <td>no</td>
  <td>0</td>
  <td>223.4</td>
  <td>98</td>
  <td>37.98</td>
  <td>...</td>
  <td>101</td>
  <td>18.75</td>
  <td>203.9</td>
  <td>118</td>
  <td>9.18</td>
  <td>6.3</td>
  <td>6</td>
  <td>1.70</td>
  <td>0</td>
  <td>False.</td>
</tr>
<tr>
  <th>6</th>
  <td>MA</td>
  <td>121</td>
  <td>510</td>
  <td>355-9993</td>
  <td>no</td>
  <td>yes</td>
  <td>24</td>
  <td>218.2</td>
  <td>88</td>
  <td>37.09</td>
  <td>...</td>
  <td>108</td>
  <td>29.62</td>
  <td>212.6</td>
  <td>118</td>
  <td>9.57</td>
  <td>7.5</td>
  <td>7</td>
  <td>2.03</td>
  <td>3</td>
  <td>False.</td>
</tr>
<tr>
  <th>7</th>
  <td>MO</td>
  <td>147</td>
  <td>415</td>
  <td>329-9001</td>
  <td>yes</td>
  <td>no</td>
  <td>0</td>
  <td>157.0</td>
  <td>79</td>
  <td>26.69</td>
  <td>...</td>
  <td>94</td>
  <td>8.76</td>
  <td>211.8</td>
  <td>96</td>
  <td>9.53</td>
  <td>7.1</td>
  <td>6</td>
  <td>1.92</td>
  <td>0</td>
  <td>False.</td>
</tr>
<tr>
  <th>8</th>
  <td>LA</td>
  <td>117</td>
  <td>408</td>
  <td>335-4719</td>
  <td>no</td>
  <td>no</td>
  <td>0</td>
  <td>184.5</td>
  <td>97</td>
  <td>31.37</td>
  <td>...</td>
  <td>80</td>
  <td>29.89</td>
  <td>215.8</td>
  <td>90</td>
  <td>9.71</td>
  <td>8.7</td>
  <td>4</td>
  <td>2.35</td>
  <td>1</td>
  <td>False.</td>
</tr>
<tr>
  <th>9</th>
  <td>WV</td>
  <td>141</td>
  <td>415</td>
  <td>330-8173</td>
  <td>yes</td>
  <td>yes</td>
  <td>37</td>
  <td>258.6</td>
  <td>84</td>
  <td>43.96</td>
  <td>...</td>
  <td>111</td>
  <td>18.87</td>
  <td>326.4</td>
  <td>97</td>
  <td>14.69</td>
  <td>11.2</td>
  <td>5</td>
  <td>3.02</td>
  <td>0</td>
  <td>False.</td>
</tr>

10 rows × 21 columns

from sklearn.preprocessing import LabelEncoder
# Columns to replace with integer codes.
var_mod = ['State',"Int'l Plan",'VMail Plan','Churn?']
le = LabelEncoder()
for i in var_mod:
    # The one encoder is refit on every column, so after the loop `le` only
    # holds the mapping for the last column in var_mod.
    df[i] = le.fit_transform(df[i])
# Show the column types after encoding.
df.dtypes 
State               int64
Account Length      int64
Area Code           int64
Phone              object
Int'l Plan          int64
VMail Plan          int64
VMail Message       int64
Day Mins          float64
Day Calls           int64
Day Charge        float64
Eve Mins          float64
Eve Calls           int64
Eve Charge        float64
Night Mins        float64
Night Calls         int64
Night Charge      float64
Intl Mins         float64
Intl Calls          int64
Intl Charge       float64
CustServ Calls      int64
Churn?              int64
dtype: object
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

#Generic function for making a classification model and assessing performance:
def classification_model(model, data, predictors, outcome):
  """Fit ``model`` and print its training accuracy and 5-fold CV score.

  Args:
    model: Estimator exposing ``fit``, ``predict`` and ``score``.
    data: DataFrame containing both the predictor and the outcome columns.
    predictors: List of column names used as features.
    outcome: Name of the target column.

  Returns:
    None. Both scores are printed, and ``model`` is left fitted on all of
    ``data`` so it can be used by the caller afterwards.
  """
  features = data[predictors]
  target = data[outcome]

  #Fit the model:
  model.fit(features, target)

  #Make predictions on training set:
  predictions = model.predict(features)

  #Print accuracy (measured on the data the model was trained on)
  accuracy = metrics.accuracy_score(target, predictions)
  print ("Accuracy : %s" % "{0:.3%}".format(accuracy))

  #Perform k-fold cross-validation with 5 folds (consecutive, unshuffled)
  kf = KFold(n_splits=5)
  scores = []
  for train, test in kf.split(features):
    # Train on this fold's training rows only.
    model.fit(features.iloc[train,:], target.iloc[train])

    #Record the held-out score from each cross-validation run
    scores.append(model.score(features.iloc[test,:], target.iloc[test]))

  print ("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(scores)))

  #Fit the model again so that it can be referred to outside the function:
  model.fit(features, target)
# Target column, and a logistic regression using a single predictor.
outcome_var = 'Churn?'
model = LogisticRegression()
predictor_var = ['Day Charge']
classification_model(model, df,predictor_var,outcome_var)
Accuracy : 85.509%
Cross-Validation Score : 85.508%
#We can try different combinations of variables (reusing the logistic regression model):
predictor_var = ['Day Charge','Intl Charge','Night Charge']
classification_model(model, df,predictor_var,outcome_var)
Accuracy : 85.569%
Cross-Validation Score : 85.568%
# Same three predictors, now with a decision tree using default settings.
model = DecisionTreeClassifier()
predictor_var = ['Day Charge','Intl Charge','Night Charge']
classification_model(model, df,predictor_var,outcome_var)
Accuracy : 100.000%
Cross-Validation Score : 77.858%
# Same three predictors, now with a random forest of 100 trees.
model = RandomForestClassifier(n_estimators=100)
predictor_var = ['Day Charge','Intl Charge','Night Charge']
classification_model(model, df,predictor_var,outcome_var)
Accuracy : 100.000%
Cross-Validation Score : 85.868%
#Create a series with feature importances, sorted from highest to lowest:
featimp = pd.Series(model.feature_importances_, index=predictor_var).sort_values(ascending=False)
print (featimp)
Day Charge      0.455067
Night Charge    0.309567
Intl Charge     0.235366
dtype: float64

nice bsb