library(ggplot2)
library(ggthemes)
library(dplyr)
library(corrgram)
library(corrplot)
library(caTools)
library(Amelia)
library(ISLR)
library(class)
library(rpart)
library(rpart.plot)
library(randomForest)
library(e1071)
library(cluster)
library(tm)
library(twitteR)
library(RColorBrewer)
library(wordcloud)
library(MASS)
library(neuralnet)

Linear Regression

df<-read.csv("student-mat.csv", sep=";")
head(df)
##   school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob
## 1     GP   F  18       U     GT3       A    4    4  at_home  teacher
## 2     GP   F  17       U     GT3       T    1    1  at_home    other
## 3     GP   F  15       U     LE3       T    1    1  at_home    other
## 4     GP   F  15       U     GT3       T    4    2   health services
## 5     GP   F  16       U     GT3       T    3    3    other    other
## 6     GP   M  16       U     LE3       T    4    3 services    other
##       reason guardian traveltime studytime failures schoolsup famsup paid
## 1     course   mother          2         2        0       yes     no   no
## 2     course   father          1         2        0        no    yes   no
## 3      other   mother          1         2        3       yes     no  yes
## 4       home   mother          1         3        0        no    yes  yes
## 5       home   father          1         2        0        no    yes  yes
## 6 reputation   mother          1         2        0        no    yes  yes
##   activities nursery higher internet romantic famrel freetime goout Dalc
## 1         no     yes    yes       no       no      4        3     4    1
## 2         no      no    yes      yes       no      5        3     3    1
## 3         no     yes    yes      yes       no      4        3     2    2
## 4        yes     yes    yes      yes      yes      3        2     2    1
## 5         no     yes    yes       no       no      4        3     2    1
## 6        yes     yes    yes      yes       no      5        4     2    1
##   Walc health absences G1 G2 G3
## 1    1      3        6  5  6  6
## 2    1      3        4  5  5  6
## 3    3      3       10  7  8 10
## 4    1      5        2 15 14 15
## 5    2      5        4  6 10 10
## 6    2      5       10 15 15 15
summary(df)
##  school   sex          age       address famsize   Pstatus      Medu      
##  GP:349   F:208   Min.   :15.0   R: 88   GT3:281   A: 41   Min.   :0.000  
##  MS: 46   M:187   1st Qu.:16.0   U:307   LE3:114   T:354   1st Qu.:2.000  
##                   Median :17.0                             Median :3.000  
##                   Mean   :16.7                             Mean   :2.749  
##                   3rd Qu.:18.0                             3rd Qu.:4.000  
##                   Max.   :22.0                             Max.   :4.000  
##       Fedu             Mjob           Fjob            reason   
##  Min.   :0.000   at_home : 59   at_home : 20   course    :145  
##  1st Qu.:2.000   health  : 34   health  : 18   home      :109  
##  Median :2.000   other   :141   other   :217   other     : 36  
##  Mean   :2.522   services:103   services:111   reputation:105  
##  3rd Qu.:3.000   teacher : 58   teacher : 29                   
##  Max.   :4.000                                                 
##    guardian     traveltime      studytime        failures      schoolsup
##  father: 90   Min.   :1.000   Min.   :1.000   Min.   :0.0000   no :344  
##  mother:273   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   yes: 51  
##  other : 32   Median :1.000   Median :2.000   Median :0.0000            
##               Mean   :1.448   Mean   :2.035   Mean   :0.3342            
##               3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000            
##               Max.   :4.000   Max.   :4.000   Max.   :3.0000            
##  famsup     paid     activities nursery   higher    internet  romantic 
##  no :153   no :214   no :194    no : 81   no : 20   no : 66   no :263  
##  yes:242   yes:181   yes:201    yes:314   yes:375   yes:329   yes:132  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##      famrel         freetime         goout            Dalc      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :4.000   Median :3.000   Median :3.000   Median :1.000  
##  Mean   :3.944   Mean   :3.235   Mean   :3.109   Mean   :1.481  
##  3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##       Walc           health         absences            G1       
##  Min.   :1.000   Min.   :1.000   Min.   : 0.000   Min.   : 3.00  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.: 0.000   1st Qu.: 8.00  
##  Median :2.000   Median :4.000   Median : 4.000   Median :11.00  
##  Mean   :2.291   Mean   :3.554   Mean   : 5.709   Mean   :10.91  
##  3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:13.00  
##  Max.   :5.000   Max.   :5.000   Max.   :75.000   Max.   :19.00  
##        G2              G3       
##  Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 9.00   1st Qu.: 8.00  
##  Median :11.00   Median :11.00  
##  Mean   :10.71   Mean   :10.42  
##  3rd Qu.:13.00   3rd Qu.:14.00  
##  Max.   :19.00   Max.   :20.00
#Check for NA
any(is.na(df))
## [1] FALSE
#Structure of Data
str(df)
## 'data.frame':    395 obs. of  33 variables:
##  $ school    : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex       : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob      : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason    : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian  : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet  : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ G1        : int  5 5 7 15 6 15 12 6 16 14 ...
##  $ G2        : int  6 5 8 14 10 15 12 5 18 15 ...
##  $ G3        : int  6 6 10 15 10 15 11 6 19 15 ...
#Numeric only
num.cols<-sapply(df, is.numeric)

#filter the data and calculate the correlation matrix
corr.data<-cor(df[,num.cols])
corrplot(corr.data, method='color')

#Corrgram can deal with factros too
corrgram(df)

corrgram(df, order=TRUE)

corrgram(df, order=TRUE, upper.pane=panel.pie)

#A ggplot
ggplot(df, aes(x=G3))+geom_histogram(bins=20, alpha=0.5, fill="green")

split the Data into Train and Test set

set.seed(101)
#Split up Sample
sample<-sample.split(df$G3,SplitRatio = 0.7)
#70% of the data
train<-subset(df, sample==TRUE)

#30% of the data
test<-subset(df, sample==FALSE)

#Train & build the model
model<-lm(G3~., data=train)

summary(model)
## 
## Call:
## lm(formula = G3 ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4250 -0.6478  0.2844  1.0442  4.9840 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.70763    2.69488   1.376  0.17019    
## schoolMS          0.66981    0.47436   1.412  0.15926    
## sexM              0.25730    0.29257   0.879  0.38006    
## age              -0.36163    0.12949  -2.793  0.00566 ** 
## addressU          0.08123    0.35652   0.228  0.81996    
## famsizeLE3        0.12222    0.28709   0.426  0.67070    
## PstatusT          0.06807    0.43032   0.158  0.87444    
## Medu              0.11100    0.18757   0.592  0.55455    
## Fedu             -0.16373    0.15928  -1.028  0.30503    
## Mjobhealth       -0.63993    0.65314  -0.980  0.32820    
## Mjobother        -0.15730    0.42323  -0.372  0.71048    
## Mjobservices     -0.15872    0.46682  -0.340  0.73415    
## Mjobteacher      -0.04930    0.62335  -0.079  0.93702    
## Fjobhealth        0.17565    0.83034   0.212  0.83265    
## Fjobother        -0.29559    0.56012  -0.528  0.59818    
## Fjobservices     -0.76964    0.59476  -1.294  0.19692    
## Fjobteacher      -0.27009    0.73824  -0.366  0.71480    
## reasonhome       -0.41126    0.31857  -1.291  0.19799    
## reasonother       0.06767    0.45323   0.149  0.88144    
## reasonreputation  0.13478    0.34735   0.388  0.69834    
## guardianmother   -0.05442    0.31663  -0.172  0.86369    
## guardianother     0.01588    0.58375   0.027  0.97832    
## traveltime       -0.02353    0.19540  -0.120  0.90427    
## studytime        -0.04294    0.16910  -0.254  0.79979    
## failures         -0.17219    0.19668  -0.875  0.38220    
## schoolsupyes      0.20742    0.42358   0.490  0.62481    
## famsupyes        -0.05329    0.27753  -0.192  0.84789    
## paidyes           0.31311    0.28284   1.107  0.26941    
## activitiesyes    -0.26104    0.26687  -0.978  0.32901    
## nurseryyes       -0.05345    0.31236  -0.171  0.86428    
## higheryes        -0.94298    0.74005  -1.274  0.20385    
## internetyes      -0.15834    0.37029  -0.428  0.66932    
## romanticyes      -0.30048    0.28115  -1.069  0.28627    
## famrel            0.36601    0.14609   2.505  0.01291 *  
## freetime          0.08386    0.14247   0.589  0.55668    
## goout            -0.12457    0.13306  -0.936  0.35015    
## Dalc             -0.16995    0.20659  -0.823  0.41153    
## Walc              0.21053    0.14963   1.407  0.16074    
## health            0.07805    0.09341   0.836  0.40426    
## absences          0.09547    0.02382   4.008 8.24e-05 ***
## G1                0.14259    0.07892   1.807  0.07206 .  
## G2                0.98859    0.06929  14.267  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.962 on 235 degrees of freedom
## Multiple R-squared:  0.8456, Adjusted R-squared:  0.8187 
## F-statistic: 31.39 on 41 and 235 DF,  p-value: < 2.2e-16
#Predictions
G3.predictions<-predict(model,test)

results<-cbind(G3.predictions, test$G3)
colnames(results)<-c('predicted', 'actual')
results<-as.data.frame(results)

#Take care of negative values
to_zero<- function(x){
  if (x<0) {
    return(0)
  } else {return(x)}
  }
    
results$predicted<-sapply(results$predicted, to_zero)

mse<-mean((results$predicted-results$actual)^2)

rmse<-sqrt(mse)

mse
## [1] 3.991675
rmse
## [1] 1.997918

Bike Project from Kaggle

bike<-read.csv('bikeshare.csv')
head(bike)
##              datetime season holiday workingday weather temp  atemp
## 1 2011-01-01 00:00:00      1       0          0       1 9.84 14.395
## 2 2011-01-01 01:00:00      1       0          0       1 9.02 13.635
## 3 2011-01-01 02:00:00      1       0          0       1 9.02 13.635
## 4 2011-01-01 03:00:00      1       0          0       1 9.84 14.395
## 5 2011-01-01 04:00:00      1       0          0       1 9.84 14.395
## 6 2011-01-01 05:00:00      1       0          0       2 9.84 12.880
##   humidity windspeed casual registered count
## 1       81    0.0000      3         13    16
## 2       80    0.0000      8         32    40
## 3       80    0.0000      5         27    32
## 4       75    0.0000      3         10    13
## 5       75    0.0000      0          1     1
## 6       75    6.0032      0          1     1
##EDA


ggplot(bike, aes(temp, count))+geom_point(alpha=0.3, aes(color=temp))+ggtitle("Temperature vs Count")

bike$datetime<-as.POSIXct(bike$datetime)

ggplot(bike, aes(datetime, count))+geom_point(aes(color=temp), alpha=0.5)+ggtitle("Date vs Count and Temperature") ###+scale_color_continuous(low='grey', high='black')

ggplot(bike, aes(factor(season), count))+geom_boxplot((aes(color=factor(season))))+theme_bw()

bike$years<-format(bike$datetime, "%Y")

ggplot(bike, aes(factor(season), count))+geom_boxplot((aes(color=factor(season))))+theme_bw()+facet_grid(.~years)+ggtitle("By Season and Year")

bike$hour<-sapply(bike$datetime, function(x) {format(x,"%H")})


#Scatterplot

ggplot(filter(bike, workingday==1), aes(hour, count))+geom_point()+ggtitle("By Hour")

ggplot(filter(bike, workingday==1), aes(hour, count))+geom_point(aes(color=temp))+ggtitle("By Hour and Temperature")+scale_color_gradientn(colours=c('dark blue' , 'blue', 'light blue', 'yellow', 'orange', 'red'))

ggplot(filter(bike, workingday==1), aes(hour, count))+geom_point(position=position_jitter(w=1, h=0), aes(color=temp))+ggtitle("By Hour and Temperature")+scale_color_gradientn(colours=c('dark blue' , 'blue', 'light blue', 'yellow', 'orange', 'red'))

## build model

bike$hour<-as.numeric(bike$hour)
model<-lm(count~. -casual-registered-datetime-atemp, bike)

summary(model)
## 
## Call:
## lm(formula = count ~ . - casual - registered - datetime - atemp, 
##     data = bike)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -320.51  -94.50  -28.06   61.52  641.15 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.91084    8.30459  -0.712   0.4766    
## season       21.96683    1.30074  16.888   <2e-16 ***
## holiday     -13.35251    8.44476  -1.581   0.1139    
## workingday   -0.44450    3.02068  -0.147   0.8830    
## weather      -5.59844    2.40016  -2.333   0.0197 *  
## temp          6.68805    0.18413  36.321   <2e-16 ***
## humidity     -1.96842    0.08762 -22.464   <2e-16 ***
## windspeed     0.43625    0.17921   2.434   0.0149 *  
## years2012    82.76732    2.74059  30.201   <2e-16 ***
## hour          7.83550    0.20846  37.588   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 142 on 10876 degrees of freedom
## Multiple R-squared:  0.3859, Adjusted R-squared:  0.3854 
## F-statistic: 759.3 on 9 and 10876 DF,  p-value: < 2.2e-16

Logistic Regression

df.train<-read.csv('titanic_train.csv')
print(head(df.train))
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q
print(str(df.train))
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## NULL
missmap(df.train, main='Missing Map', col=c('yellow', 'black'), legend = FALSE)

ggplot(df.train, aes(Survived))+geom_bar()

ggplot(df.train, aes(Pclass))+geom_bar(aes(fill=factor(Pclass)))

ggplot(df.train, aes(Sex))+geom_bar(aes(fill=Sex))

ggplot(df.train, aes(Age))+geom_histogram(alpha=0.5, fill='blue', bins=20)

ggplot(df.train, aes(SibSp))+geom_bar()

ggplot(df.train, aes(Fare))+geom_histogram(fill='green', color='black', alpha=0.5)

ggplot(df.train, aes(Pclass, Age))+geom_boxplot(aes(group=Pclass, fill=factor(Pclass)), alpha=0.2)+scale_y_continuous(breaks=seq(0, 80, by=2))

impute_age<-function(age, class){
  out<-age
  for (i in 1:length(age)) {
    if(is.na(age[i])) {
      if (class[i]==1) {
        out[i]<-37
      } else if (class[i]==2) {
        out[i]<-29
      } else {
        out[i]<-24
      }
    } else{
      out[i]<-age[i]
    }
  }
  return(out)
}

fixed.ages<-impute_age(df.train$Age, df.train$Pclass)
df.train$Age<-fixed.ages
#missmap(df.train, main='Missing Map', col=c('yellow', 'black'), legend = FALSE)

df.train<-dplyr::select(df.train, -PassengerId, -Name, -Ticket, -Cabin)

df.train$Survived<-factor(df.train$Survived)
df.train$Pclass<-factor(df.train$Pclass)
df.train$Parch<-factor(df.train$Parch)
df.train$SibSp<-factor(df.train$SibSp)

log.model<-glm(Survived~., family=binomial(link = 'logit'), data=df.train)
summary(log.model)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"), 
##     data = df.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8158  -0.6134  -0.4138   0.5808   2.4896  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.845e+01  1.660e+03   0.011 0.991134    
## Pclass2     -1.079e+00  3.092e-01  -3.490 0.000484 ***
## Pclass3     -2.191e+00  3.161e-01  -6.930 4.20e-12 ***
## Sexmale     -2.677e+00  2.040e-01 -13.123  < 2e-16 ***
## Age         -3.971e-02  8.758e-03  -4.534 5.79e-06 ***
## SibSp1       8.135e-02  2.245e-01   0.362 0.717133    
## SibSp2      -2.897e-01  5.368e-01  -0.540 0.589361    
## SibSp3      -2.241e+00  7.202e-01  -3.111 0.001862 ** 
## SibSp4      -1.675e+00  7.620e-01  -2.198 0.027954 *  
## SibSp5      -1.595e+01  9.588e+02  -0.017 0.986731    
## SibSp8      -1.607e+01  7.578e+02  -0.021 0.983077    
## Parch1       3.741e-01  2.895e-01   1.292 0.196213    
## Parch2       3.862e-02  3.824e-01   0.101 0.919560    
## Parch3       3.655e-01  1.056e+00   0.346 0.729318    
## Parch4      -1.586e+01  1.055e+03  -0.015 0.988007    
## Parch5      -1.152e+00  1.172e+00  -0.983 0.325771    
## Parch6      -1.643e+01  2.400e+03  -0.007 0.994536    
## Fare         2.109e-03  2.490e-03   0.847 0.397036    
## EmbarkedC   -1.458e+01  1.660e+03  -0.009 0.992995    
## EmbarkedQ   -1.456e+01  1.660e+03  -0.009 0.993001    
## EmbarkedS   -1.486e+01  1.660e+03  -0.009 0.992857    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.66  on 890  degrees of freedom
## Residual deviance:  763.41  on 870  degrees of freedom
## AIC: 805.41
## 
## Number of Fisher Scoring iterations: 15
fitted.prob<-predict(log.model, data=df.train, type='response')
fitted.results<-ifelse(fitted.prob>0.5,1,0)
misClassErr0r<-mean(fitted.results!=df.train$Survived)
misClassErr0r
## [1] 0.1818182
#confusion matrix
table(df.train$Survived, fitted.prob>0.5 )
##    
##     FALSE TRUE
##   0   482   67
##   1    95  247
##Adult Example
adult<-read.csv('adult_sal.csv')
head(adult)
##   X age    type_employer fnlwgt education education_num            marital
## 1 1  39        State-gov  77516 Bachelors            13      Never-married
## 2 2  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
## 3 3  38          Private 215646   HS-grad             9           Divorced
## 4 4  53          Private 234721      11th             7 Married-civ-spouse
## 5 5  28          Private 338409 Bachelors            13 Married-civ-spouse
## 6 6  37          Private 284582   Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital_gain capital_loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hr_per_week       country income
## 1          40 United-States  <=50K
## 2          13 United-States  <=50K
## 3          40 United-States  <=50K
## 4          40 United-States  <=50K
## 5          40          Cuba  <=50K
## 6          40 United-States  <=50K
##Data Cleaning/ feature engineering
#Combine employer type
unemp<-function(job){
  job<-as.character(job)
  if(job=='Never-worked'|job=='Without-pay') {
    return('Unemployed')
  } else {
    return(job)
  }
}

####
####
adult$type_employer<-sapply(adult$type_employer, unemp)
table(adult$type_employer)
## 
##                ?      Federal-gov        Local-gov          Private 
##             1836              960             2093            22696 
##     Self-emp-inc Self-emp-not-inc        State-gov       Unemployed 
##             1116             2541             1298               21
group_emp<-function(job){
  job<-as.character(job)
  if(job=='Local-gov'|job=='State-gov') {
    return('SL-gov')
  } else if(job=='Self-emp-inc'|job=='Self-emp-not-inc') {
    return('self-emp')
  } else {
    return(job)
  }
}

###
adult$type_employer<-sapply(adult$type_employer,group_emp)

KNN

##we use the class library
str(Caravan)
## 'data.frame':    5822 obs. of  86 variables:
##  $ MOSTYPE : num  33 37 37 9 40 23 39 33 33 11 ...
##  $ MAANTHUI: num  1 1 1 1 1 1 2 1 1 2 ...
##  $ MGEMOMV : num  3 2 2 3 4 2 3 2 2 3 ...
##  $ MGEMLEEF: num  2 2 2 3 2 1 2 3 4 3 ...
##  $ MOSHOOFD: num  8 8 8 3 10 5 9 8 8 3 ...
##  $ MGODRK  : num  0 1 0 2 1 0 2 0 0 3 ...
##  $ MGODPR  : num  5 4 4 3 4 5 2 7 1 5 ...
##  $ MGODOV  : num  1 1 2 2 1 0 0 0 3 0 ...
##  $ MGODGE  : num  3 4 4 4 4 5 5 2 6 2 ...
##  $ MRELGE  : num  7 6 3 5 7 0 7 7 6 7 ...
##  $ MRELSA  : num  0 2 2 2 1 6 2 2 0 0 ...
##  $ MRELOV  : num  2 2 4 2 2 3 0 0 3 2 ...
##  $ MFALLEEN: num  1 0 4 2 2 3 0 0 3 2 ...
##  $ MFGEKIND: num  2 4 4 3 4 5 3 5 3 2 ...
##  $ MFWEKIND: num  6 5 2 4 4 2 6 4 3 6 ...
##  $ MOPLHOOG: num  1 0 0 3 5 0 0 0 0 0 ...
##  $ MOPLMIDD: num  2 5 5 4 4 5 4 3 1 4 ...
##  $ MOPLLAAG: num  7 4 4 2 0 4 5 6 8 5 ...
##  $ MBERHOOG: num  1 0 0 4 0 2 0 2 1 2 ...
##  $ MBERZELF: num  0 0 0 0 5 0 0 0 1 0 ...
##  $ MBERBOER: num  1 0 0 0 4 0 0 0 0 0 ...
##  $ MBERMIDD: num  2 5 7 3 0 4 4 2 1 3 ...
##  $ MBERARBG: num  5 0 0 1 0 2 1 5 8 3 ...
##  $ MBERARBO: num  2 4 2 2 0 2 5 2 1 3 ...
##  $ MSKA    : num  1 0 0 3 9 2 0 2 1 1 ...
##  $ MSKB1   : num  1 2 5 2 0 2 1 1 1 2 ...
##  $ MSKB2   : num  2 3 0 1 0 2 4 2 0 1 ...
##  $ MSKC    : num  6 5 4 4 0 4 5 5 8 4 ...
##  $ MSKD    : num  1 0 0 0 0 2 0 2 1 2 ...
##  $ MHHUUR  : num  1 2 7 5 4 9 6 0 9 0 ...
##  $ MHKOOP  : num  8 7 2 4 5 0 3 9 0 9 ...
##  $ MAUT1   : num  8 7 7 9 6 5 8 4 5 6 ...
##  $ MAUT2   : num  0 1 0 0 2 3 0 4 2 1 ...
##  $ MAUT0   : num  1 2 2 0 1 3 1 2 3 2 ...
##  $ MZFONDS : num  8 6 9 7 5 9 9 6 7 6 ...
##  $ MZPART  : num  1 3 0 2 4 0 0 3 2 3 ...
##  $ MINKM30 : num  0 2 4 1 0 5 4 2 7 2 ...
##  $ MINK3045: num  4 0 5 5 0 2 3 5 2 3 ...
##  $ MINK4575: num  5 5 0 3 9 3 3 3 1 3 ...
##  $ MINK7512: num  0 2 0 0 0 0 0 0 0 1 ...
##  $ MINK123M: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MINKGEM : num  4 5 3 4 6 3 3 3 2 4 ...
##  $ MKOOPKLA: num  3 4 4 4 3 3 5 3 3 7 ...
##  $ PWAPART : num  0 2 2 0 0 0 0 0 0 2 ...
##  $ PWABEDR : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWALAND : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPERSAUT: num  6 0 6 6 0 6 6 0 5 0 ...
##  $ PBESAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PMOTSCO : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PVRAAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PAANHANG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTRACTOR: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWERKT  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBROM   : num  0 0 0 0 0 0 0 3 0 0 ...
##  $ PLEVEN  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPERSONG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PGEZONG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWAOREG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBRAND  : num  5 2 2 2 6 0 0 0 0 3 ...
##  $ PZEILPL : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPLEZIER: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PFIETS  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PINBOED : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBYSTAND: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWAPART : num  0 2 1 0 0 0 0 0 0 1 ...
##  $ AWABEDR : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWALAND : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ APERSAUT: num  1 0 1 1 0 1 1 0 1 0 ...
##  $ ABESAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AMOTSCO : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AVRAAUT : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AAANHANG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ATRACTOR: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWERKT  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABROM   : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ ALEVEN  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ APERSONG: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AGEZONG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWAOREG : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABRAND  : num  1 1 1 1 1 0 0 0 0 1 ...
##  $ AZEILPL : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ APLEZIER: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AFIETS  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AINBOED : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABYSTAND: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purchase: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
any(is.na(Caravan))
## [1] FALSE
purchase<-Caravan[,86]

standardized.Caravan<-scale(Caravan[,-86])

#test
test.index<-1:1000
test.data<-standardized.Caravan[test.index,]
test.purchase<-purchase[test.index]

#train
train.data<-standardized.Caravan[-test.index,]
train.purchase<-purchase[-test.index]


##KNN Model

predicted.purchase<-knn(train.data, test.data, train.purchase, k=1 )

misClassErr0r<-mean(test.purchase!=predicted.purchase)
misClassErr0r
## [1] 0.116
###Choosing a K-VALUE
predicted.purchase<-knn(train.data, test.data, train.purchase, k=3 )

misClassErr0r<-mean(test.purchase!=predicted.purchase)
misClassErr0r
## [1] 0.074
predicted.purchase<-knn(train.data, test.data, train.purchase, k=5 )

misClassErr0r<-mean(test.purchase!=predicted.purchase)
misClassErr0r
## [1] 0.066
predicted.purchase<-NULL
error.rate<-NULL

for (i in 1:20) {
  predicted.purchase<-knn(train.data, test.data, train.purchase, k=i )
  error.rate[i]<-mean(test.purchase!=predicted.purchase)
}

error.rate
##  [1] 0.114 0.113 0.074 0.071 0.066 0.066 0.062 0.060 0.058 0.059 0.059
## [12] 0.059 0.059 0.059 0.059 0.059 0.059 0.059 0.059 0.059
##Visualize k elbow method
k.values<-1:20
error.df<-data.frame(error.rate, k.values)
ggplot(error.df, aes(k.values, error.rate))+geom_point()+geom_line(lty='dotted', color='red')

##KNN on iris dataset
stand_features<-scale(iris[, 1:4])
final_data<-cbind(stand_features, iris[5])

### Train Test Split
set.seed((101))

sample<-sample.split(final_data$Species, SplitRati=0.7)
train<-subset(final_data, sample==T)
test<-subset(final_data, sample==F)

predicted_species<-knn(train[1:4], test[1:4], train$Species, k=1)
predicted_species
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor versicolor virginica  versicolor versicolor versicolor
## [25] versicolor versicolor virginica  versicolor versicolor versicolor
## [31] virginica  virginica  virginica  virginica  virginica  virginica 
## [37] virginica  virginica  virginica  virginica  virginica  virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica
mean(test$Species!=predicted_species)
## [1] 0.04444444
predicted.species<-NULL
error.rate<-NULL

for (i in 1:10) {
  predicted.species<-knn(train[1:4], test[1:4], train$Species, k=i )
  error.rate[i]<-mean(test$Species!=predicted.species)
}

error.rate
##  [1] 0.04444444 0.06666667 0.02222222 0.02222222 0.02222222 0.04444444
##  [7] 0.04444444 0.04444444 0.04444444 0.02222222
k.values<-1:10
error.df<-data.frame(error.rate, k.values)
ggplot(error.df, aes(k.values, error.rate))+geom_point()+geom_line(lty='dotted', color='red')

Decision Tree and Random Forest

##library(rpart); library(rpart.plot)
str(kyphosis)
## 'data.frame':    81 obs. of  4 variables:
##  $ Kyphosis: Factor w/ 2 levels "absent","present": 1 1 2 1 1 1 1 1 1 2 ...
##  $ Age     : int  71 158 128 2 1 1 61 37 113 59 ...
##  $ Number  : int  3 3 4 5 4 2 2 3 2 6 ...
##  $ Start   : int  5 14 5 1 15 16 17 16 16 12 ...
head(kyphosis)
##   Kyphosis Age Number Start
## 1   absent  71      3     5
## 2   absent 158      3    14
## 3  present 128      4     5
## 4   absent   2      5     1
## 5   absent   1      4    15
## 6   absent   1      2    16
tree<-rpart(Kyphosis~., method='class', data=kyphosis)

#printcp(tree)
#plotcp(tree)
#rsq.rpart(tree)
#print(tree)
#summary(tree)
#plot(tree)
#text(tree)
#post(tree)

printcp(tree)
## 
## Classification tree:
## rpart(formula = Kyphosis ~ ., data = kyphosis, method = "class")
## 
## Variables actually used in tree construction:
## [1] Age   Start
## 
## Root node error: 17/81 = 0.20988
## 
## n= 81 
## 
##         CP nsplit rel error xerror    xstd
## 1 0.176471      0   1.00000 1.0000 0.21559
## 2 0.019608      1   0.82353 1.0588 0.22010
## 3 0.010000      4   0.76471 1.0588 0.22010
plot(tree, uniform=T, main='Kyphosis Tree')
text(tree, use.n=T, all=T)

prp(tree)

###Random Forest
rf.model<-randomForest(Kyphosis~., data=kyphosis)
print(rf.model)
## 
## Call:
##  randomForest(formula = Kyphosis ~ ., data = kyphosis) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 20.99%
## Confusion matrix:
##         absent present class.error
## absent      60       4   0.0625000
## present     13       4   0.7647059
head(rf.model$predicted)
##       1       2       3       4       5       6 
## present  absent present  absent  absent  absent 
## Levels: absent present
rf.model$ntree
## [1] 500
##Project in College data
df<-College
ggplot(df, aes(Room.Board, Grad.Rate))+geom_point(aes(color=Private), size=2, alpha=0.4)

ggplot(df, aes(F.Undergrad))+geom_histogram(aes(fill=Private), color='black', bins=50)+theme_bw()

ggplot(df, aes(Grad.Rate))+geom_histogram(aes(fill=Private), color='black', bins=50)+theme_bw()

df['Cazenovia College', 'Grad.Rate']<-100
sample<-sample.split(df$Private, SplitRatio = 0.7)
train<-subset(df, sample==T)
test<-subset(df, sample==F)

tree<-rpart(Private~.,  method='class', data=train)
tree_preds<-predict(tree, test)
tree_preds_ii<-predict(tree, test, type='class')
head(tree_preds)
##                                               No        Yes
## Adrian College                       0.006389776 0.99361022
## Albion College                       0.006389776 0.99361022
## Alma College                         0.006389776 0.99361022
## Arizona State University Main campus 0.962616822 0.03738318
## Augustana College                    0.006389776 0.99361022
## Averett College                      0.006389776 0.99361022
head(tree_preds_ii)
##                       Adrian College                       Albion College 
##                                  Yes                                  Yes 
##                         Alma College Arizona State University Main campus 
##                                  Yes                                   No 
##                    Augustana College                      Averett College 
##                                  Yes                                  Yes 
## Levels: No Yes
prp(tree)

##Random Forest
rf.model<-randomForest(Private~., data=train, importance=TRUE)
rf.model$confusion
##      No Yes class.error
## No  128  20  0.13513514
## Yes   8 388  0.02020202
rf.model$importance
##                        No          Yes MeanDecreaseAccuracy
## Apps         0.0226552152 1.515865e-02         0.0171944485
## Accept       0.0270621202 1.460876e-02         0.0177891556
## Enroll       0.0500211210 3.658328e-02         0.0402308707
## Top10perc    0.0109724105 5.355321e-03         0.0068637579
## Top25perc    0.0066073620 3.408238e-03         0.0042275130
## F.Undergrad  0.1455889026 6.798197e-02         0.0885207222
## P.Undergrad  0.0480892495 5.540569e-03         0.0171310932
## Outstate     0.1409716025 5.407739e-02         0.0775064866
## Room.Board   0.0172985234 1.875991e-02         0.0182662819
## Books       -0.0007335231 8.187665e-05        -0.0001416182
## Personal     0.0035158166 1.096868e-03         0.0017141120
## PhD          0.0113309079 7.659406e-03         0.0086020779
## Terminal     0.0076668023 5.099660e-03         0.0057711953
## S.F.Ratio    0.0535742738 9.558044e-03         0.0215089669
## perc.alumni  0.0207132876 3.019087e-03         0.0078639402
## Expend       0.0228578681 1.125143e-02         0.0143516966
## Grad.Rate    0.0160787996 3.998308e-03         0.0072360887
##             MeanDecreaseGini
## Apps                9.646598
## Accept             11.528359
## Enroll             27.187132
## Top10perc           4.953006
## Top25perc           4.164820
## F.Undergrad        44.613182
## P.Undergrad        13.643002
## Outstate           36.334580
## Room.Board         12.629637
## Books               1.484925
## Personal            3.456073
## PhD                 4.358535
## Terminal            3.138481
## S.F.Ratio          19.325281
## perc.alumni         4.676900
## Expend              9.016619
## Grad.Rate           5.329973
rf_preds<-predict(rf.model, test)
table(rf_preds, test$Private)
##         
## rf_preds  No Yes
##      No   53   8
##      Yes  11 161

SVM

model<-svm(Species~., data=iris)
summary(model)
## 
## Call:
## svm(formula = Species ~ ., data = iris)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.25 
## 
## Number of Support Vectors:  51
## 
##  ( 8 22 21 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  setosa versicolor virginica
pred_values<-predict(model, iris[1:4])
table(pred_values, iris$Species)
##             
## pred_values  setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48
###Tune Results
tune_results<-tune(svm, train.x = iris[1:4], train.y=iris[,5], kernel='radial', ranges=list(cost=c(0.1,1,10), gamma=c(0.5,1,2)))
summary(tune_results)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.5
## 
## - best performance: 0.04 
## 
## - Detailed performance results:
##   cost gamma      error dispersion
## 1  0.1   0.5 0.06000000 0.04919099
## 2  1.0   0.5 0.04000000 0.04661373
## 3 10.0   0.5 0.04666667 0.05488484
## 4  0.1   1.0 0.06666667 0.06285394
## 5  1.0   1.0 0.05333333 0.05258738
## 6 10.0   1.0 0.06000000 0.05837300
## 7  0.1   2.0 0.09333333 0.06440612
## 8  1.0   2.0 0.04666667 0.05488484
## 9 10.0   2.0 0.05333333 0.06126244
tuned_svm<-svm(Species~., data=iris, kernel='radial', cost=1.5, gamma=0.1)
summary(tuned_svm)
## 
## Call:
## svm(formula = Species ~ ., data = iris, kernel = "radial", cost = 1.5, 
##     gamma = 0.1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1.5 
##       gamma:  0.1 
## 
## Number of Support Vectors:  50
## 
##  ( 4 23 23 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  setosa versicolor virginica
pred_values<-predict(tuned_svm, iris[1:4])
table(pred_values, iris$Species)
##             
## pred_values  setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         1
##   virginica       0          2        49
### SVM Project
loans<-read.csv('loan_data.csv')
str(loans)
## 'data.frame':    9578 obs. of  14 variables:
##  $ credit.policy    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ purpose          : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
##  $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
##  $ installment      : num  829 228 367 162 103 ...
##  $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
##  $ dti              : num  19.5 14.3 11.6 8.1 15 ...
##  $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
##  $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
##  $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
##  $ revol.util       : num  52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
##  $ inq.last.6mths   : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ delinq.2yrs      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pub.rec          : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ not.fully.paid   : int  0 0 0 0 0 0 1 1 0 0 ...
##Convert to factors

loans$credit.policy<-factor(loans$credit.policy)
loans$inq.last.6mths<-factor(loans$inq.last.6mths)
loans$delinq.2yrs<-factor(loans$delinq.2yrs)
loans$pub.rec<-factor(loans$pub.rec)
loans$not.fully.paid<-factor(loans$not.fully.paid)

pl<-ggplot(loans, aes(fico))+geom_histogram(aes(fill=not.fully.paid), color='black', bins = 40, alpha=0.5)+theme_bw()
pl

pl+scale_fill_manual(values=c('green', 'red'))

ggplot(loans, aes(x=factor(purpose)))+geom_bar(aes(fill=not.fully.paid), position='dodge')

ggplot(loans, aes(int.rate, fico))+geom_point(aes(color=not.fully.paid), alpha=0.4)

###TRAIN TEST SPLIT

sample<-sample.split(loans$not.fully.paid, 0.7)
train<-subset(loans, sample==T)
test<-subset(loans, sample==F)

model<-svm(not.fully.paid~., data=train)

summary(model)
## 
## Call:
## svm(formula = not.fully.paid ~ ., data = train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01724138 
## 
## Number of Support Vectors:  2822
## 
##  ( 1749 1073 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
predicted_values<-predict(model, test[1:13])
table(predicted_values, test$not.fully.paid)
##                 
## predicted_values    0    1
##                0 2413  460
##                1    0    0
tuned.results<-tune(svm, train.x=not.fully.paid~., data=train, kernel='radial', ranges=list(cost=c(100,200), gamma=c(0.1)))
summary(tuned.results)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   100   0.1
## 
## - best performance: 0.2147648 
## 
## - Detailed performance results:
##   cost gamma     error dispersion
## 1  100   0.1 0.2147648 0.01688813
## 2  200   0.1 0.2266982 0.01884881
tuned_model<-svm(not.fully.paid~., data=train, cost=100, gamma=0.1)
tuned.predictions<-predict(tuned_model, test[1:13])
table(tuned.predictions, test$not.fully.paid)
##                  
## tuned.predictions    0    1
##                 0 2198  384
##                 1  215   76

K Means Clustering

ggplot(iris, aes(Petal.Length, Petal.Width, color=Species))+geom_point(size=4)

iriscluster<-kmeans(iris[,1:4], 3, nstart = 20)
print(iriscluster)
## K-means clustering with 3 clusters of sizes 38, 50, 62
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     6.850000    3.073684     5.742105    2.071053
## 2     5.006000    3.428000     1.462000    0.246000
## 3     5.901613    2.748387     4.393548    1.433871
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1
## [106] 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1
## [141] 1 1 3 1 1 1 3 1 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 23.87947 15.15100 39.82097
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
table(iriscluster$cluster, iris$Species)
##    
##     setosa versicolor virginica
##   1      0          2        36
##   2     50          0         0
##   3      0         48        14
clusplot(iris, iriscluster$cluster, color=T, shade=T, labels = 0, lines=0)

##Project in K means
df1<-read.csv('winequality-red.csv', sep=';')
df2<-read.csv('winequality-white.csv', sep=';')
head(df1)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5
df1$label<-c('red')
df2$label<-c('white')
wine<-rbind(df1,df2)

ggplot(wine, aes(residual.sugar))+geom_histogram(aes(fill=label), color='black', bins=50)+scale_fill_manual(values=c('dark red', 'white'))

ggplot(wine, aes(citric.acid))+geom_histogram(aes(fill=label), color='black', bins=50)+scale_fill_manual(values=c('dark red', 'white'))

ggplot(wine, aes(alcohol))+geom_histogram(aes(fill=label), color='black', bins=50)+scale_fill_manual(values=c('dark red', 'white'))

ggplot(wine, aes(citric.acid, residual.sugar))+geom_point(aes(color=label), alpha=0.2)+scale_color_manual(values=c('dark red', 'white'))+theme_dark()

ggplot(wine, aes(volatile.acidity, residual.sugar))+geom_point(aes(color=label), alpha=0.2)+scale_color_manual(values=c('dark red', 'white'))+theme_dark()

clusdata<-wine[,1:12]
winecluster<-kmeans(clusdata, 2)
winecluster$centers
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1      6.904698        0.2871364   0.3398094       7.259286 0.0486092
## 2      7.619044        0.4079451   0.2911080       3.082690 0.0656846
##   free.sulfur.dioxide total.sulfur.dioxide   density       pH sulphates
## 1            39.82503            155.90101 0.9947956 3.190308 0.5000354
## 2            18.43735             63.54832 0.9945680 3.255147 0.5718655
##    alcohol  quality
## 1 10.25832 5.825436
## 2 10.79529 5.809204
table(wine$labe, winecluster$cluster)
##        
##            1    2
##   red     84 1515
##   white 3588 1310

NLP

#Connect to twitter
ckey=c('WJz1DjXp8ZYwcBMy4cbiTS9mX')
skey=c('27sGwCB9qtEgnSaLv4Z5hZKjuX9PyUkr774CMVx016cmHOMrlM')
token=c('313313903-nZbOUAuHg4B8HNhuQBPPnXJ1oxHisezKT8g9OtvS')
sectoken=c('oVMp0Xn1yIr1nirqHKD18omFAHELIe9ocv3Jw9WWwfGCp')
setup_twitter_oauth(ckey, skey, token, sectoken)
## [1] "Using direct authentication"
#Search for soccer
soccer_tweeks<-searchTwitter('soccer', n=1000, lang='en')
soccer_text<-sapply(soccer_tweeks, function(x) x$getText())

#Clean Data
soccer_text<-iconv(soccer_text, 'UTF-8', 'ASCII')

soccer_corpus<-Corpus(VectorSource(soccer_text))

#Document Term Matrix
term_doc_metrix<-TermDocumentMatrix(soccer_corpus,
                                      control = list(removePunctuation = TRUE,
                                                     stopwords = c("soccer","http", stopwords("english")),
                                                     removeNumbers = TRUE,tolower = TRUE))

#AS MATRIX

term.doc.matrix <- as.matrix(term_doc_metrix)
#Get words counts
word.freqs <- sort(rowSums(term.doc.matrix), decreasing=TRUE) 
dm <- data.frame(word=names(word.freqs), freq=word.freqs)

wordcloud(dm$word, dm$freq, random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#Search for python
soccer_tweeks<-searchTwitter('python', n=1000, lang='en')
soccer_text<-sapply(soccer_tweeks, function(x) x$getText())

#Clean Data
soccer_text<-iconv(soccer_text, 'UTF-8', 'ASCII')

soccer_corpus<-Corpus(VectorSource(soccer_text))

#Document Term Matrix
term_doc_metrix<-TermDocumentMatrix(soccer_corpus,
                                      control = list(removePunctuation = TRUE,
                                                     stopwords = c("python","http", stopwords("english")),
                                                     removeNumbers = TRUE,tolower = TRUE))

#AS MATRIX

term.doc.matrix <- as.matrix(term_doc_metrix)
#Get words counts
word.freqs <- sort(rowSums(term.doc.matrix), decreasing=TRUE) 
dm <- data.frame(word=names(word.freqs), freq=word.freqs)

wordcloud(dm$word, dm$freq, random.order=FALSE, colors=brewer.pal(8, "Dark2"))

Neural Networks

head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
data<-Boston

#Normalize our data
maxs<-apply(data, 2, max)
mins<-apply(data, 2, min)

scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))
head(scaled)
##           crim   zn      indus chas       nox        rm       age
## 1 0.0000000000 0.18 0.06781525    0 0.3148148 0.5775053 0.6416066
## 2 0.0002359225 0.00 0.24230205    0 0.1728395 0.5479977 0.7826982
## 3 0.0002356977 0.00 0.24230205    0 0.1728395 0.6943859 0.5993821
## 4 0.0002927957 0.00 0.06304985    0 0.1502058 0.6585553 0.4418126
## 5 0.0007050701 0.00 0.06304985    0 0.1502058 0.6871048 0.5283213
## 6 0.0002644715 0.00 0.06304985    0 0.1502058 0.5497222 0.5746653
##         dis        rad        tax   ptratio     black      lstat      medv
## 1 0.2692031 0.00000000 0.20801527 0.2872340 1.0000000 0.08967991 0.4222222
## 2 0.3489620 0.04347826 0.10496183 0.5531915 1.0000000 0.20447020 0.3688889
## 3 0.3489620 0.04347826 0.10496183 0.5531915 0.9897373 0.06346578 0.6600000
## 4 0.4485446 0.08695652 0.06679389 0.6489362 0.9942761 0.03338852 0.6311111
## 5 0.4485446 0.08695652 0.06679389 0.6489362 1.0000000 0.09933775 0.6933333
## 6 0.4485446 0.08695652 0.06679389 0.6489362 0.9929901 0.09602649 0.5266667
##Train and Test Sets

split = sample.split(scaled$medv, SplitRatio = 0.70)

train = subset(scaled, split == TRUE)
test = subset(scaled, split == FALSE)

#For some odd reasons, the neuralnet() function won't accept a formula in the form: y~. that we are used to using. Instead you have to call all the columns added together. Here is some #convience code to help quickly create that formula:

# Get column names
n <- names(train)

# Paste together
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))

##Becasue it is continuous and not classification we set linear.output=TRUE
nn <- neuralnet(f,data=train,hidden=c(5,3),linear.output=TRUE)
plot(nn)

###Predictions

# Compute Predictions off Test Set
predicted.nn.values <- compute(nn,test[1:13])

# Convert back to non-scaled predictions
true.predictions <- predicted.nn.values$net.result*(max(data$medv)-min(data$medv))+min(data$medv)

# Convert the test data
test.r <- (test$medv)*(max(data$medv)-min(data$medv))+min(data$medv)



# Check the Mean Squared Error
MSE.nn <- sum((test.r - true.predictions)^2)/nrow(test)
MSE.nn
## [1] 13.37068543
#Visualize Error
error.df <- data.frame(test.r,true.predictions)

ggplot(error.df,aes(x=test.r,y=true.predictions)) + geom_point() + stat_smooth()


########
###Neural Netwrok Project
df <- read.csv('bank_note_data.csv')
head(df)
##   Image.Var Image.Skew Image.Curt  Entropy Class
## 1   3.62160     8.6661    -2.8073 -0.44699     0
## 2   4.54590     8.1674    -2.4586 -1.46210     0
## 3   3.86600    -2.6383     1.9242  0.10645     0
## 4   3.45660     9.5228    -4.0112 -3.59440     0
## 5   0.32924    -4.4552     4.5718 -0.98880     0
## 6   4.36840     9.6718    -3.9606 -3.16250     0
#TRAIN AND TEST SPLIT
set.seed(101)
split = sample.split(df$Class, SplitRatio = 0.70)

train = subset(df, split == TRUE)
test = subset(df, split == FALSE)

nn <- neuralnet(Class ~ Image.Var + Image.Skew + Image.Curt + Entropy,data=train,hidden=10,linear.output=FALSE)
predicted.nn.values <- compute(nn,test[,1:4])
head(predicted.nn.values$net.result)
##                 [,1]
## 3  0.000015994811962
## 11 0.000037138515521
## 12 0.000006944038291
## 13 0.000001799145605
## 14 0.000003469964563
## 17 0.000002957116883
#Apply the round function to the predicted values so you only 0s and 1s as your predicted classes.
predictions <- sapply(predicted.nn.values$net.result,round)

head(predictions)
## [1] 0 0 0 0 0 0
table(predictions,test$Class)
##            
## predictions   0   1
##           0 229   0
##           1   0 183
##You should have noticed that you did very well! Almost suspiciously well! Let's check our results against a randomForest model!
##Comparing Models

df$Class <- factor(df$Class)

set.seed(101)
split = sample.split(df$Class, SplitRatio = 0.70)

train = subset(df, split == TRUE)
test = subset(df, split == FALSE)

model <- randomForest(Class ~ Image.Var + Image.Skew + Image.Curt + Entropy,data=train)
rf.pred <- predict(model,test)
table(rf.pred,test$Class)
##        
## rf.pred   0   1
##       0 227   1
##       1   2 182
#Use predict() to get the predicted values from your rf model.

rf.pred <- predict(model,test)