library(ggplot2)
library(ggthemes)
library(dplyr)
library(corrgram)
library(corrplot)
library(caTools)
library(Amelia)
library(ISLR)
library(class)
library(rpart)
library(rpart.plot)
library(randomForest)
library(e1071)
library(cluster)
library(tm)
library(twitteR)
library(RColorBrewer)
library(wordcloud)
library(MASS)
library(neuralnet)
Linear Regression
df<-read.csv("student-mat.csv", sep=";")
head(df)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 18 U GT3 A 4 4 at_home teacher
## 2 GP F 17 U GT3 T 1 1 at_home other
## 3 GP F 15 U LE3 T 1 1 at_home other
## 4 GP F 15 U GT3 T 4 2 health services
## 5 GP F 16 U GT3 T 3 3 other other
## 6 GP M 16 U LE3 T 4 3 services other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 2 0 yes no no
## 2 course father 1 2 0 no yes no
## 3 other mother 1 2 3 yes no yes
## 4 home mother 1 3 0 no yes yes
## 5 home father 1 2 0 no yes yes
## 6 reputation mother 1 2 0 no yes yes
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no yes yes no no 4 3 4 1
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 2 2
## 4 yes yes yes yes yes 3 2 2 1
## 5 no yes yes no no 4 3 2 1
## 6 yes yes yes yes no 5 4 2 1
## Walc health absences G1 G2 G3
## 1 1 3 6 5 6 6
## 2 1 3 4 5 5 6
## 3 3 3 10 7 8 10
## 4 1 5 2 15 14 15
## 5 2 5 4 6 10 10
## 6 2 5 10 15 15 15
summary(df)
## school sex age address famsize Pstatus Medu
## GP:349 F:208 Min. :15.0 R: 88 GT3:281 A: 41 Min. :0.000
## MS: 46 M:187 1st Qu.:16.0 U:307 LE3:114 T:354 1st Qu.:2.000
## Median :17.0 Median :3.000
## Mean :16.7 Mean :2.749
## 3rd Qu.:18.0 3rd Qu.:4.000
## Max. :22.0 Max. :4.000
## Fedu Mjob Fjob reason
## Min. :0.000 at_home : 59 at_home : 20 course :145
## 1st Qu.:2.000 health : 34 health : 18 home :109
## Median :2.000 other :141 other :217 other : 36
## Mean :2.522 services:103 services:111 reputation:105
## 3rd Qu.:3.000 teacher : 58 teacher : 29
## Max. :4.000
## guardian traveltime studytime failures schoolsup
## father: 90 Min. :1.000 Min. :1.000 Min. :0.0000 no :344
## mother:273 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 yes: 51
## other : 32 Median :1.000 Median :2.000 Median :0.0000
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery higher internet romantic
## no :153 no :214 no :194 no : 81 no : 20 no : 66 no :263
## yes:242 yes:181 yes:201 yes:314 yes:375 yes:329 yes:132
##
##
##
##
## famrel freetime goout Dalc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000
## Median :4.000 Median :3.000 Median :3.000 Median :1.000
## Mean :3.944 Mean :3.235 Mean :3.109 Mean :1.481
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Walc health absences G1
## Min. :1.000 Min. :1.000 Min. : 0.000 Min. : 3.00
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00
## Median :2.000 Median :4.000 Median : 4.000 Median :11.00
## Mean :2.291 Mean :3.554 Mean : 5.709 Mean :10.91
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00
## Max. :5.000 Max. :5.000 Max. :75.000 Max. :19.00
## G2 G3
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 9.00 1st Qu.: 8.00
## Median :11.00 Median :11.00
## Mean :10.71 Mean :10.42
## 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :19.00 Max. :20.00
#Check for NA
any(is.na(df))
## [1] FALSE
#Structure of Data
str(df)
## 'data.frame': 395 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : int 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : int 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : int 6 6 10 15 10 15 11 6 19 15 ...
#Numeric only
num.cols<-sapply(df, is.numeric)
#filter the data and calculate the correlation matrix
corr.data<-cor(df[,num.cols])
corrplot(corr.data, method='color')

#Corrgram can deal with factros too
corrgram(df)

corrgram(df, order=TRUE)

corrgram(df, order=TRUE, upper.pane=panel.pie)

#A ggplot
ggplot(df, aes(x=G3))+geom_histogram(bins=20, alpha=0.5, fill="green")

split the Data into Train and Test set
set.seed(101)
#Split up Sample
sample<-sample.split(df$G3,SplitRatio = 0.7)
#70% of the data
train<-subset(df, sample==TRUE)
#30% of the data
test<-subset(df, sample==FALSE)
#Train & build the model
model<-lm(G3~., data=train)
summary(model)
##
## Call:
## lm(formula = G3 ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4250 -0.6478 0.2844 1.0442 4.9840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.70763 2.69488 1.376 0.17019
## schoolMS 0.66981 0.47436 1.412 0.15926
## sexM 0.25730 0.29257 0.879 0.38006
## age -0.36163 0.12949 -2.793 0.00566 **
## addressU 0.08123 0.35652 0.228 0.81996
## famsizeLE3 0.12222 0.28709 0.426 0.67070
## PstatusT 0.06807 0.43032 0.158 0.87444
## Medu 0.11100 0.18757 0.592 0.55455
## Fedu -0.16373 0.15928 -1.028 0.30503
## Mjobhealth -0.63993 0.65314 -0.980 0.32820
## Mjobother -0.15730 0.42323 -0.372 0.71048
## Mjobservices -0.15872 0.46682 -0.340 0.73415
## Mjobteacher -0.04930 0.62335 -0.079 0.93702
## Fjobhealth 0.17565 0.83034 0.212 0.83265
## Fjobother -0.29559 0.56012 -0.528 0.59818
## Fjobservices -0.76964 0.59476 -1.294 0.19692
## Fjobteacher -0.27009 0.73824 -0.366 0.71480
## reasonhome -0.41126 0.31857 -1.291 0.19799
## reasonother 0.06767 0.45323 0.149 0.88144
## reasonreputation 0.13478 0.34735 0.388 0.69834
## guardianmother -0.05442 0.31663 -0.172 0.86369
## guardianother 0.01588 0.58375 0.027 0.97832
## traveltime -0.02353 0.19540 -0.120 0.90427
## studytime -0.04294 0.16910 -0.254 0.79979
## failures -0.17219 0.19668 -0.875 0.38220
## schoolsupyes 0.20742 0.42358 0.490 0.62481
## famsupyes -0.05329 0.27753 -0.192 0.84789
## paidyes 0.31311 0.28284 1.107 0.26941
## activitiesyes -0.26104 0.26687 -0.978 0.32901
## nurseryyes -0.05345 0.31236 -0.171 0.86428
## higheryes -0.94298 0.74005 -1.274 0.20385
## internetyes -0.15834 0.37029 -0.428 0.66932
## romanticyes -0.30048 0.28115 -1.069 0.28627
## famrel 0.36601 0.14609 2.505 0.01291 *
## freetime 0.08386 0.14247 0.589 0.55668
## goout -0.12457 0.13306 -0.936 0.35015
## Dalc -0.16995 0.20659 -0.823 0.41153
## Walc 0.21053 0.14963 1.407 0.16074
## health 0.07805 0.09341 0.836 0.40426
## absences 0.09547 0.02382 4.008 8.24e-05 ***
## G1 0.14259 0.07892 1.807 0.07206 .
## G2 0.98859 0.06929 14.267 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.962 on 235 degrees of freedom
## Multiple R-squared: 0.8456, Adjusted R-squared: 0.8187
## F-statistic: 31.39 on 41 and 235 DF, p-value: < 2.2e-16
#Predictions
G3.predictions<-predict(model,test)
results<-cbind(G3.predictions, test$G3)
colnames(results)<-c('predicted', 'actual')
results<-as.data.frame(results)
#Take care of negative values
to_zero<- function(x){
if (x<0) {
return(0)
} else {return(x)}
}
results$predicted<-sapply(results$predicted, to_zero)
mse<-mean((results$predicted-results$actual)^2)
rmse<-sqrt(mse)
mse
## [1] 3.991675
rmse
## [1] 1.997918
Bike Project from Kaggle
bike<-read.csv('bikeshare.csv')
head(bike)
## datetime season holiday workingday weather temp atemp
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.395
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.635
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.635
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.395
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.395
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.880
## humidity windspeed casual registered count
## 1 81 0.0000 3 13 16
## 2 80 0.0000 8 32 40
## 3 80 0.0000 5 27 32
## 4 75 0.0000 3 10 13
## 5 75 0.0000 0 1 1
## 6 75 6.0032 0 1 1
##EDA
ggplot(bike, aes(temp, count))+geom_point(alpha=0.3, aes(color=temp))+ggtitle("Temperature vs Count")

bike$datetime<-as.POSIXct(bike$datetime)
ggplot(bike, aes(datetime, count))+geom_point(aes(color=temp), alpha=0.5)+ggtitle("Date vs Count and Temperature") ###+scale_color_continuous(low='grey', high='black')

ggplot(bike, aes(factor(season), count))+geom_boxplot((aes(color=factor(season))))+theme_bw()

bike$years<-format(bike$datetime, "%Y")
ggplot(bike, aes(factor(season), count))+geom_boxplot((aes(color=factor(season))))+theme_bw()+facet_grid(.~years)+ggtitle("By Season and Year")

bike$hour<-sapply(bike$datetime, function(x) {format(x,"%H")})
#Scatterplot
ggplot(filter(bike, workingday==1), aes(hour, count))+geom_point()+ggtitle("By Hour")

ggplot(filter(bike, workingday==1), aes(hour, count))+geom_point(aes(color=temp))+ggtitle("By Hour and Temperature")+scale_color_gradientn(colours=c('dark blue' , 'blue', 'light blue', 'yellow', 'orange', 'red'))

ggplot(filter(bike, workingday==1), aes(hour, count))+geom_point(position=position_jitter(w=1, h=0), aes(color=temp))+ggtitle("By Hour and Temperature")+scale_color_gradientn(colours=c('dark blue' , 'blue', 'light blue', 'yellow', 'orange', 'red'))

## build model
bike$hour<-as.numeric(bike$hour)
model<-lm(count~. -casual-registered-datetime-atemp, bike)
summary(model)
##
## Call:
## lm(formula = count ~ . - casual - registered - datetime - atemp,
## data = bike)
##
## Residuals:
## Min 1Q Median 3Q Max
## -320.51 -94.50 -28.06 61.52 641.15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.91084 8.30459 -0.712 0.4766
## season 21.96683 1.30074 16.888 <2e-16 ***
## holiday -13.35251 8.44476 -1.581 0.1139
## workingday -0.44450 3.02068 -0.147 0.8830
## weather -5.59844 2.40016 -2.333 0.0197 *
## temp 6.68805 0.18413 36.321 <2e-16 ***
## humidity -1.96842 0.08762 -22.464 <2e-16 ***
## windspeed 0.43625 0.17921 2.434 0.0149 *
## years2012 82.76732 2.74059 30.201 <2e-16 ***
## hour 7.83550 0.20846 37.588 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 142 on 10876 degrees of freedom
## Multiple R-squared: 0.3859, Adjusted R-squared: 0.3854
## F-statistic: 759.3 on 9 and 10876 DF, p-value: < 2.2e-16
Logistic Regression
df.train<-read.csv('titanic_train.csv')
print(head(df.train))
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
print(str(df.train))
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## NULL
missmap(df.train, main='Missing Map', col=c('yellow', 'black'), legend = FALSE)

ggplot(df.train, aes(Survived))+geom_bar()

ggplot(df.train, aes(Pclass))+geom_bar(aes(fill=factor(Pclass)))

ggplot(df.train, aes(Sex))+geom_bar(aes(fill=Sex))

ggplot(df.train, aes(Age))+geom_histogram(alpha=0.5, fill='blue', bins=20)

ggplot(df.train, aes(SibSp))+geom_bar()

ggplot(df.train, aes(Fare))+geom_histogram(fill='green', color='black', alpha=0.5)

ggplot(df.train, aes(Pclass, Age))+geom_boxplot(aes(group=Pclass, fill=factor(Pclass)), alpha=0.2)+scale_y_continuous(breaks=seq(0, 80, by=2))

impute_age<-function(age, class){
out<-age
for (i in 1:length(age)) {
if(is.na(age[i])) {
if (class[i]==1) {
out[i]<-37
} else if (class[i]==2) {
out[i]<-29
} else {
out[i]<-24
}
} else{
out[i]<-age[i]
}
}
return(out)
}
fixed.ages<-impute_age(df.train$Age, df.train$Pclass)
df.train$Age<-fixed.ages
#missmap(df.train, main='Missing Map', col=c('yellow', 'black'), legend = FALSE)
df.train<-dplyr::select(df.train, -PassengerId, -Name, -Ticket, -Cabin)
df.train$Survived<-factor(df.train$Survived)
df.train$Pclass<-factor(df.train$Pclass)
df.train$Parch<-factor(df.train$Parch)
df.train$SibSp<-factor(df.train$SibSp)
log.model<-glm(Survived~., family=binomial(link = 'logit'), data=df.train)
summary(log.model)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"),
## data = df.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8158 -0.6134 -0.4138 0.5808 2.4896
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.845e+01 1.660e+03 0.011 0.991134
## Pclass2 -1.079e+00 3.092e-01 -3.490 0.000484 ***
## Pclass3 -2.191e+00 3.161e-01 -6.930 4.20e-12 ***
## Sexmale -2.677e+00 2.040e-01 -13.123 < 2e-16 ***
## Age -3.971e-02 8.758e-03 -4.534 5.79e-06 ***
## SibSp1 8.135e-02 2.245e-01 0.362 0.717133
## SibSp2 -2.897e-01 5.368e-01 -0.540 0.589361
## SibSp3 -2.241e+00 7.202e-01 -3.111 0.001862 **
## SibSp4 -1.675e+00 7.620e-01 -2.198 0.027954 *
## SibSp5 -1.595e+01 9.588e+02 -0.017 0.986731
## SibSp8 -1.607e+01 7.578e+02 -0.021 0.983077
## Parch1 3.741e-01 2.895e-01 1.292 0.196213
## Parch2 3.862e-02 3.824e-01 0.101 0.919560
## Parch3 3.655e-01 1.056e+00 0.346 0.729318
## Parch4 -1.586e+01 1.055e+03 -0.015 0.988007
## Parch5 -1.152e+00 1.172e+00 -0.983 0.325771
## Parch6 -1.643e+01 2.400e+03 -0.007 0.994536
## Fare 2.109e-03 2.490e-03 0.847 0.397036
## EmbarkedC -1.458e+01 1.660e+03 -0.009 0.992995
## EmbarkedQ -1.456e+01 1.660e+03 -0.009 0.993001
## EmbarkedS -1.486e+01 1.660e+03 -0.009 0.992857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 763.41 on 870 degrees of freedom
## AIC: 805.41
##
## Number of Fisher Scoring iterations: 15
fitted.prob<-predict(log.model, data=df.train, type='response')
fitted.results<-ifelse(fitted.prob>0.5,1,0)
misClassErr0r<-mean(fitted.results!=df.train$Survived)
misClassErr0r
## [1] 0.1818182
#confusion matrix
table(df.train$Survived, fitted.prob>0.5 )
##
## FALSE TRUE
## 0 482 67
## 1 95 247
##Adult Example
adult<-read.csv('adult_sal.csv')
head(adult)
## X age type_employer fnlwgt education education_num marital
## 1 1 39 State-gov 77516 Bachelors 13 Never-married
## 2 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 3 38 Private 215646 HS-grad 9 Divorced
## 4 4 53 Private 234721 11th 7 Married-civ-spouse
## 5 5 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 6 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital_gain capital_loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hr_per_week country income
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
##Data Cleaning/ feature engineering
#Combine employer type
unemp<-function(job){
job<-as.character(job)
if(job=='Never-worked'|job=='Without-pay') {
return('Unemployed')
} else {
return(job)
}
}
####
####
adult$type_employer<-sapply(adult$type_employer, unemp)
table(adult$type_employer)
##
## ? Federal-gov Local-gov Private
## 1836 960 2093 22696
## Self-emp-inc Self-emp-not-inc State-gov Unemployed
## 1116 2541 1298 21
group_emp<-function(job){
job<-as.character(job)
if(job=='Local-gov'|job=='State-gov') {
return('SL-gov')
} else if(job=='Self-emp-inc'|job=='Self-emp-not-inc') {
return('self-emp')
} else {
return(job)
}
}
###
adult$type_employer<-sapply(adult$type_employer,group_emp)
KNN
##we use the class library
str(Caravan)
## 'data.frame': 5822 obs. of 86 variables:
## $ MOSTYPE : num 33 37 37 9 40 23 39 33 33 11 ...
## $ MAANTHUI: num 1 1 1 1 1 1 2 1 1 2 ...
## $ MGEMOMV : num 3 2 2 3 4 2 3 2 2 3 ...
## $ MGEMLEEF: num 2 2 2 3 2 1 2 3 4 3 ...
## $ MOSHOOFD: num 8 8 8 3 10 5 9 8 8 3 ...
## $ MGODRK : num 0 1 0 2 1 0 2 0 0 3 ...
## $ MGODPR : num 5 4 4 3 4 5 2 7 1 5 ...
## $ MGODOV : num 1 1 2 2 1 0 0 0 3 0 ...
## $ MGODGE : num 3 4 4 4 4 5 5 2 6 2 ...
## $ MRELGE : num 7 6 3 5 7 0 7 7 6 7 ...
## $ MRELSA : num 0 2 2 2 1 6 2 2 0 0 ...
## $ MRELOV : num 2 2 4 2 2 3 0 0 3 2 ...
## $ MFALLEEN: num 1 0 4 2 2 3 0 0 3 2 ...
## $ MFGEKIND: num 2 4 4 3 4 5 3 5 3 2 ...
## $ MFWEKIND: num 6 5 2 4 4 2 6 4 3 6 ...
## $ MOPLHOOG: num 1 0 0 3 5 0 0 0 0 0 ...
## $ MOPLMIDD: num 2 5 5 4 4 5 4 3 1 4 ...
## $ MOPLLAAG: num 7 4 4 2 0 4 5 6 8 5 ...
## $ MBERHOOG: num 1 0 0 4 0 2 0 2 1 2 ...
## $ MBERZELF: num 0 0 0 0 5 0 0 0 1 0 ...
## $ MBERBOER: num 1 0 0 0 4 0 0 0 0 0 ...
## $ MBERMIDD: num 2 5 7 3 0 4 4 2 1 3 ...
## $ MBERARBG: num 5 0 0 1 0 2 1 5 8 3 ...
## $ MBERARBO: num 2 4 2 2 0 2 5 2 1 3 ...
## $ MSKA : num 1 0 0 3 9 2 0 2 1 1 ...
## $ MSKB1 : num 1 2 5 2 0 2 1 1 1 2 ...
## $ MSKB2 : num 2 3 0 1 0 2 4 2 0 1 ...
## $ MSKC : num 6 5 4 4 0 4 5 5 8 4 ...
## $ MSKD : num 1 0 0 0 0 2 0 2 1 2 ...
## $ MHHUUR : num 1 2 7 5 4 9 6 0 9 0 ...
## $ MHKOOP : num 8 7 2 4 5 0 3 9 0 9 ...
## $ MAUT1 : num 8 7 7 9 6 5 8 4 5 6 ...
## $ MAUT2 : num 0 1 0 0 2 3 0 4 2 1 ...
## $ MAUT0 : num 1 2 2 0 1 3 1 2 3 2 ...
## $ MZFONDS : num 8 6 9 7 5 9 9 6 7 6 ...
## $ MZPART : num 1 3 0 2 4 0 0 3 2 3 ...
## $ MINKM30 : num 0 2 4 1 0 5 4 2 7 2 ...
## $ MINK3045: num 4 0 5 5 0 2 3 5 2 3 ...
## $ MINK4575: num 5 5 0 3 9 3 3 3 1 3 ...
## $ MINK7512: num 0 2 0 0 0 0 0 0 0 1 ...
## $ MINK123M: num 0 0 0 0 0 0 0 0 0 0 ...
## $ MINKGEM : num 4 5 3 4 6 3 3 3 2 4 ...
## $ MKOOPKLA: num 3 4 4 4 3 3 5 3 3 7 ...
## $ PWAPART : num 0 2 2 0 0 0 0 0 0 2 ...
## $ PWABEDR : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PWALAND : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PPERSAUT: num 6 0 6 6 0 6 6 0 5 0 ...
## $ PBESAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PMOTSCO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PVRAAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PAANHANG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PTRACTOR: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PWERKT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PBROM : num 0 0 0 0 0 0 0 3 0 0 ...
## $ PLEVEN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PPERSONG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PGEZONG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PWAOREG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PBRAND : num 5 2 2 2 6 0 0 0 0 3 ...
## $ PZEILPL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PPLEZIER: num 0 0 0 0 0 0 0 0 0 0 ...
## $ PFIETS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PINBOED : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PBYSTAND: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWAPART : num 0 2 1 0 0 0 0 0 0 1 ...
## $ AWABEDR : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWALAND : num 0 0 0 0 0 0 0 0 0 0 ...
## $ APERSAUT: num 1 0 1 1 0 1 1 0 1 0 ...
## $ ABESAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AMOTSCO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AVRAAUT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AAANHANG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ ATRACTOR: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWERKT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ABROM : num 0 0 0 0 0 0 0 1 0 0 ...
## $ ALEVEN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ APERSONG: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AGEZONG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AWAOREG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ABRAND : num 1 1 1 1 1 0 0 0 0 1 ...
## $ AZEILPL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ APLEZIER: num 0 0 0 0 0 0 0 0 0 0 ...
## $ AFIETS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AINBOED : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ABYSTAND: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Purchase: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
any(is.na(Caravan))
## [1] FALSE
purchase<-Caravan[,86]
standardized.Caravan<-scale(Caravan[,-86])
#test
test.index<-1:1000
test.data<-standardized.Caravan[test.index,]
test.purchase<-purchase[test.index]
#train
train.data<-standardized.Caravan[-test.index,]
train.purchase<-purchase[-test.index]
##KNN Model
predicted.purchase<-knn(train.data, test.data, train.purchase, k=1 )
misClassErr0r<-mean(test.purchase!=predicted.purchase)
misClassErr0r
## [1] 0.116
###Choosing a K-VALUE
predicted.purchase<-knn(train.data, test.data, train.purchase, k=3 )
misClassErr0r<-mean(test.purchase!=predicted.purchase)
misClassErr0r
## [1] 0.074
predicted.purchase<-knn(train.data, test.data, train.purchase, k=5 )
misClassErr0r<-mean(test.purchase!=predicted.purchase)
misClassErr0r
## [1] 0.066
predicted.purchase<-NULL
error.rate<-NULL
for (i in 1:20) {
predicted.purchase<-knn(train.data, test.data, train.purchase, k=i )
error.rate[i]<-mean(test.purchase!=predicted.purchase)
}
error.rate
## [1] 0.114 0.113 0.074 0.071 0.066 0.066 0.062 0.060 0.058 0.059 0.059
## [12] 0.059 0.059 0.059 0.059 0.059 0.059 0.059 0.059 0.059
##Visualize k elbow method
k.values<-1:20
error.df<-data.frame(error.rate, k.values)
ggplot(error.df, aes(k.values, error.rate))+geom_point()+geom_line(lty='dotted', color='red')

##KNN on iris dataset
stand_features<-scale(iris[, 1:4])
final_data<-cbind(stand_features, iris[5])
### Train Test Split
set.seed((101))
sample<-sample.split(final_data$Species, SplitRati=0.7)
train<-subset(final_data, sample==T)
test<-subset(final_data, sample==F)
predicted_species<-knn(train[1:4], test[1:4], train$Species, k=1)
predicted_species
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor virginica versicolor versicolor versicolor
## [25] versicolor versicolor virginica versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
mean(test$Species!=predicted_species)
## [1] 0.04444444
predicted.species<-NULL
error.rate<-NULL
for (i in 1:10) {
predicted.species<-knn(train[1:4], test[1:4], train$Species, k=i )
error.rate[i]<-mean(test$Species!=predicted.species)
}
error.rate
## [1] 0.04444444 0.06666667 0.02222222 0.02222222 0.02222222 0.04444444
## [7] 0.04444444 0.04444444 0.04444444 0.02222222
k.values<-1:10
error.df<-data.frame(error.rate, k.values)
ggplot(error.df, aes(k.values, error.rate))+geom_point()+geom_line(lty='dotted', color='red')

Decision Tree and Random Forest
##library(rpart); library(rpart.plot)
str(kyphosis)
## 'data.frame': 81 obs. of 4 variables:
## $ Kyphosis: Factor w/ 2 levels "absent","present": 1 1 2 1 1 1 1 1 1 2 ...
## $ Age : int 71 158 128 2 1 1 61 37 113 59 ...
## $ Number : int 3 3 4 5 4 2 2 3 2 6 ...
## $ Start : int 5 14 5 1 15 16 17 16 16 12 ...
head(kyphosis)
## Kyphosis Age Number Start
## 1 absent 71 3 5
## 2 absent 158 3 14
## 3 present 128 4 5
## 4 absent 2 5 1
## 5 absent 1 4 15
## 6 absent 1 2 16
tree<-rpart(Kyphosis~., method='class', data=kyphosis)
#printcp(tree)
#plotcp(tree)
#rsq.rpart(tree)
#print(tree)
#summary(tree)
#plot(tree)
#text(tree)
#post(tree)
printcp(tree)
##
## Classification tree:
## rpart(formula = Kyphosis ~ ., data = kyphosis, method = "class")
##
## Variables actually used in tree construction:
## [1] Age Start
##
## Root node error: 17/81 = 0.20988
##
## n= 81
##
## CP nsplit rel error xerror xstd
## 1 0.176471 0 1.00000 1.0000 0.21559
## 2 0.019608 1 0.82353 1.0588 0.22010
## 3 0.010000 4 0.76471 1.0588 0.22010
plot(tree, uniform=T, main='Kyphosis Tree')
text(tree, use.n=T, all=T)

prp(tree)

###Random Forest
rf.model<-randomForest(Kyphosis~., data=kyphosis)
print(rf.model)
##
## Call:
## randomForest(formula = Kyphosis ~ ., data = kyphosis)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 20.99%
## Confusion matrix:
## absent present class.error
## absent 60 4 0.0625000
## present 13 4 0.7647059
head(rf.model$predicted)
## 1 2 3 4 5 6
## present absent present absent absent absent
## Levels: absent present
rf.model$ntree
## [1] 500
##Project in College data
df<-College
ggplot(df, aes(Room.Board, Grad.Rate))+geom_point(aes(color=Private), size=2, alpha=0.4)

ggplot(df, aes(F.Undergrad))+geom_histogram(aes(fill=Private), color='black', bins=50)+theme_bw()

ggplot(df, aes(Grad.Rate))+geom_histogram(aes(fill=Private), color='black', bins=50)+theme_bw()

df['Cazenovia College', 'Grad.Rate']<-100
sample<-sample.split(df$Private, SplitRatio = 0.7)
train<-subset(df, sample==T)
test<-subset(df, sample==F)
tree<-rpart(Private~., method='class', data=train)
tree_preds<-predict(tree, test)
tree_preds_ii<-predict(tree, test, type='class')
head(tree_preds)
## No Yes
## Adrian College 0.006389776 0.99361022
## Albion College 0.006389776 0.99361022
## Alma College 0.006389776 0.99361022
## Arizona State University Main campus 0.962616822 0.03738318
## Augustana College 0.006389776 0.99361022
## Averett College 0.006389776 0.99361022
head(tree_preds_ii)
## Adrian College Albion College
## Yes Yes
## Alma College Arizona State University Main campus
## Yes No
## Augustana College Averett College
## Yes Yes
## Levels: No Yes
prp(tree)

##Random Forest
rf.model<-randomForest(Private~., data=train, importance=TRUE)
rf.model$confusion
## No Yes class.error
## No 128 20 0.13513514
## Yes 8 388 0.02020202
rf.model$importance
## No Yes MeanDecreaseAccuracy
## Apps 0.0226552152 1.515865e-02 0.0171944485
## Accept 0.0270621202 1.460876e-02 0.0177891556
## Enroll 0.0500211210 3.658328e-02 0.0402308707
## Top10perc 0.0109724105 5.355321e-03 0.0068637579
## Top25perc 0.0066073620 3.408238e-03 0.0042275130
## F.Undergrad 0.1455889026 6.798197e-02 0.0885207222
## P.Undergrad 0.0480892495 5.540569e-03 0.0171310932
## Outstate 0.1409716025 5.407739e-02 0.0775064866
## Room.Board 0.0172985234 1.875991e-02 0.0182662819
## Books -0.0007335231 8.187665e-05 -0.0001416182
## Personal 0.0035158166 1.096868e-03 0.0017141120
## PhD 0.0113309079 7.659406e-03 0.0086020779
## Terminal 0.0076668023 5.099660e-03 0.0057711953
## S.F.Ratio 0.0535742738 9.558044e-03 0.0215089669
## perc.alumni 0.0207132876 3.019087e-03 0.0078639402
## Expend 0.0228578681 1.125143e-02 0.0143516966
## Grad.Rate 0.0160787996 3.998308e-03 0.0072360887
## MeanDecreaseGini
## Apps 9.646598
## Accept 11.528359
## Enroll 27.187132
## Top10perc 4.953006
## Top25perc 4.164820
## F.Undergrad 44.613182
## P.Undergrad 13.643002
## Outstate 36.334580
## Room.Board 12.629637
## Books 1.484925
## Personal 3.456073
## PhD 4.358535
## Terminal 3.138481
## S.F.Ratio 19.325281
## perc.alumni 4.676900
## Expend 9.016619
## Grad.Rate 5.329973
rf_preds<-predict(rf.model, test)
table(rf_preds, test$Private)
##
## rf_preds No Yes
## No 53 8
## Yes 11 161
SVM
model<-svm(Species~., data=iris)
summary(model)
##
## Call:
## svm(formula = Species ~ ., data = iris)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 51
##
## ( 8 22 21 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
pred_values<-predict(model, iris[1:4])
table(pred_values, iris$Species)
##
## pred_values setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
###Tune Results
tune_results<-tune(svm, train.x = iris[1:4], train.y=iris[,5], kernel='radial', ranges=list(cost=c(0.1,1,10), gamma=c(0.5,1,2)))
summary(tune_results)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 0.5
##
## - best performance: 0.04
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.5 0.06000000 0.04919099
## 2 1.0 0.5 0.04000000 0.04661373
## 3 10.0 0.5 0.04666667 0.05488484
## 4 0.1 1.0 0.06666667 0.06285394
## 5 1.0 1.0 0.05333333 0.05258738
## 6 10.0 1.0 0.06000000 0.05837300
## 7 0.1 2.0 0.09333333 0.06440612
## 8 1.0 2.0 0.04666667 0.05488484
## 9 10.0 2.0 0.05333333 0.06126244
tuned_svm<-svm(Species~., data=iris, kernel='radial', cost=1.5, gamma=0.1)
summary(tuned_svm)
##
## Call:
## svm(formula = Species ~ ., data = iris, kernel = "radial", cost = 1.5,
## gamma = 0.1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1.5
## gamma: 0.1
##
## Number of Support Vectors: 50
##
## ( 4 23 23 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
pred_values<-predict(tuned_svm, iris[1:4])
table(pred_values, iris$Species)
##
## pred_values setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 1
## virginica 0 2 49
### SVM Project
loans<-read.csv('loan_data.csv')
str(loans)
## 'data.frame': 9578 obs. of 14 variables:
## $ credit.policy : int 1 1 1 1 1 1 1 1 1 1 ...
## $ purpose : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : num 19.5 14.3 11.6 8.1 15 ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : num 52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : int 0 0 0 0 1 0 0 0 0 0 ...
## $ pub.rec : int 0 0 0 0 0 0 1 0 0 0 ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
##Convert to factors
loans$credit.policy<-factor(loans$credit.policy)
loans$inq.last.6mths<-factor(loans$inq.last.6mths)
loans$delinq.2yrs<-factor(loans$delinq.2yrs)
loans$pub.rec<-factor(loans$pub.rec)
loans$not.fully.paid<-factor(loans$not.fully.paid)
pl<-ggplot(loans, aes(fico))+geom_histogram(aes(fill=not.fully.paid), color='black', bins = 40, alpha=0.5)+theme_bw()
pl

pl+scale_fill_manual(values=c('green', 'red'))

ggplot(loans, aes(x=factor(purpose)))+geom_bar(aes(fill=not.fully.paid), position='dodge')

ggplot(loans, aes(int.rate, fico))+geom_point(aes(color=not.fully.paid), alpha=0.4)

###TRAIN TEST SPLIT
sample<-sample.split(loans$not.fully.paid, 0.7)
train<-subset(loans, sample==T)
test<-subset(loans, sample==F)
model<-svm(not.fully.paid~., data=train)
summary(model)
##
## Call:
## svm(formula = not.fully.paid ~ ., data = train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01724138
##
## Number of Support Vectors: 2822
##
## ( 1749 1073 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
predicted_values<-predict(model, test[1:13])
table(predicted_values, test$not.fully.paid)
##
## predicted_values 0 1
## 0 2413 460
## 1 0 0
tuned.results<-tune(svm, train.x=not.fully.paid~., data=train, kernel='radial', ranges=list(cost=c(100,200), gamma=c(0.1)))
summary(tuned.results)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 100 0.1
##
## - best performance: 0.2147648
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 100 0.1 0.2147648 0.01688813
## 2 200 0.1 0.2266982 0.01884881
tuned_model<-svm(not.fully.paid~., data=train, cost=100, gamma=0.1)
tuned.predictions<-predict(tuned_model, test[1:13])
table(tuned.predictions, test$not.fully.paid)
##
## tuned.predictions 0 1
## 0 2198 384
## 1 215 76
K Means Clustering
ggplot(iris, aes(Petal.Length, Petal.Width, color=Species))+geom_point(size=4)

iriscluster<-kmeans(iris[,1:4], 3, nstart = 20)
print(iriscluster)
## K-means clustering with 3 clusters of sizes 38, 50, 62
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 6.850000 3.073684 5.742105 2.071053
## 2 5.006000 3.428000 1.462000 0.246000
## 3 5.901613 2.748387 4.393548 1.433871
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1
## [106] 1 3 1 1 1 1 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1
## [141] 1 1 3 1 1 1 3 1 1 3
##
## Within cluster sum of squares by cluster:
## [1] 23.87947 15.15100 39.82097
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(iriscluster$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 0 2 36
## 2 50 0 0
## 3 0 48 14
clusplot(iris, iriscluster$cluster, color=T, shade=T, labels = 0, lines=0)

##Project in K means
df1<-read.csv('winequality-red.csv', sep=';')
df2<-read.csv('winequality-white.csv', sep=';')
head(df1)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
df1$label<-c('red')
df2$label<-c('white')
wine<-rbind(df1,df2)
ggplot(wine, aes(residual.sugar))+geom_histogram(aes(fill=label), color='black', bins=50)+scale_fill_manual(values=c('dark red', 'white'))

ggplot(wine, aes(citric.acid))+geom_histogram(aes(fill=label), color='black', bins=50)+scale_fill_manual(values=c('dark red', 'white'))

ggplot(wine, aes(alcohol))+geom_histogram(aes(fill=label), color='black', bins=50)+scale_fill_manual(values=c('dark red', 'white'))

ggplot(wine, aes(citric.acid, residual.sugar))+geom_point(aes(color=label), alpha=0.2)+scale_color_manual(values=c('dark red', 'white'))+theme_dark()

ggplot(wine, aes(volatile.acidity, residual.sugar))+geom_point(aes(color=label), alpha=0.2)+scale_color_manual(values=c('dark red', 'white'))+theme_dark()

clusdata<-wine[,1:12]
winecluster<-kmeans(clusdata, 2)
winecluster$centers
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 6.904698 0.2871364 0.3398094 7.259286 0.0486092
## 2 7.619044 0.4079451 0.2911080 3.082690 0.0656846
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 1 39.82503 155.90101 0.9947956 3.190308 0.5000354
## 2 18.43735 63.54832 0.9945680 3.255147 0.5718655
## alcohol quality
## 1 10.25832 5.825436
## 2 10.79529 5.809204
table(wine$labe, winecluster$cluster)
##
## 1 2
## red 84 1515
## white 3588 1310
NLP
#Connect to twitter
ckey=c('WJz1DjXp8ZYwcBMy4cbiTS9mX')
skey=c('27sGwCB9qtEgnSaLv4Z5hZKjuX9PyUkr774CMVx016cmHOMrlM')
token=c('313313903-nZbOUAuHg4B8HNhuQBPPnXJ1oxHisezKT8g9OtvS')
sectoken=c('oVMp0Xn1yIr1nirqHKD18omFAHELIe9ocv3Jw9WWwfGCp')
setup_twitter_oauth(ckey, skey, token, sectoken)
## [1] "Using direct authentication"
#Search for soccer
soccer_tweeks<-searchTwitter('soccer', n=1000, lang='en')
soccer_text<-sapply(soccer_tweeks, function(x) x$getText())
#Clean Data
soccer_text<-iconv(soccer_text, 'UTF-8', 'ASCII')
soccer_corpus<-Corpus(VectorSource(soccer_text))
#Document Term Matrix
term_doc_metrix<-TermDocumentMatrix(soccer_corpus,
control = list(removePunctuation = TRUE,
stopwords = c("soccer","http", stopwords("english")),
removeNumbers = TRUE,tolower = TRUE))
#AS MATRIX
term.doc.matrix <- as.matrix(term_doc_metrix)
#Get words counts
word.freqs <- sort(rowSums(term.doc.matrix), decreasing=TRUE)
dm <- data.frame(word=names(word.freqs), freq=word.freqs)
wordcloud(dm$word, dm$freq, random.order=FALSE, colors=brewer.pal(8, "Dark2"))

#Search for python
soccer_tweeks<-searchTwitter('python', n=1000, lang='en')
soccer_text<-sapply(soccer_tweeks, function(x) x$getText())
#Clean Data
soccer_text<-iconv(soccer_text, 'UTF-8', 'ASCII')
soccer_corpus<-Corpus(VectorSource(soccer_text))
#Document Term Matrix
term_doc_metrix<-TermDocumentMatrix(soccer_corpus,
control = list(removePunctuation = TRUE,
stopwords = c("python","http", stopwords("english")),
removeNumbers = TRUE,tolower = TRUE))
#AS MATRIX
term.doc.matrix <- as.matrix(term_doc_metrix)
#Get words counts
word.freqs <- sort(rowSums(term.doc.matrix), decreasing=TRUE)
dm <- data.frame(word=names(word.freqs), freq=word.freqs)
wordcloud(dm$word, dm$freq, random.order=FALSE, colors=brewer.pal(8, "Dark2"))

Neural Networks
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
data<-Boston
#Normalize our data
maxs<-apply(data, 2, max)
mins<-apply(data, 2, min)
scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))
head(scaled)
## crim zn indus chas nox rm age
## 1 0.0000000000 0.18 0.06781525 0 0.3148148 0.5775053 0.6416066
## 2 0.0002359225 0.00 0.24230205 0 0.1728395 0.5479977 0.7826982
## 3 0.0002356977 0.00 0.24230205 0 0.1728395 0.6943859 0.5993821
## 4 0.0002927957 0.00 0.06304985 0 0.1502058 0.6585553 0.4418126
## 5 0.0007050701 0.00 0.06304985 0 0.1502058 0.6871048 0.5283213
## 6 0.0002644715 0.00 0.06304985 0 0.1502058 0.5497222 0.5746653
## dis rad tax ptratio black lstat medv
## 1 0.2692031 0.00000000 0.20801527 0.2872340 1.0000000 0.08967991 0.4222222
## 2 0.3489620 0.04347826 0.10496183 0.5531915 1.0000000 0.20447020 0.3688889
## 3 0.3489620 0.04347826 0.10496183 0.5531915 0.9897373 0.06346578 0.6600000
## 4 0.4485446 0.08695652 0.06679389 0.6489362 0.9942761 0.03338852 0.6311111
## 5 0.4485446 0.08695652 0.06679389 0.6489362 1.0000000 0.09933775 0.6933333
## 6 0.4485446 0.08695652 0.06679389 0.6489362 0.9929901 0.09602649 0.5266667
##Train and Test Sets
split = sample.split(scaled$medv, SplitRatio = 0.70)
train = subset(scaled, split == TRUE)
test = subset(scaled, split == FALSE)
#For some odd reasons, the neuralnet() function won't accept a formula in the form: y~. that we are used to using. Instead you have to call all the columns added together. Here is some #convience code to help quickly create that formula:
# Get column names
n <- names(train)
# Paste together
f <- as.formula(paste("medv ~", paste(n[!n %in% "medv"], collapse = " + ")))
##Becasue it is continuous and not classification we set linear.output=TRUE
nn <- neuralnet(f,data=train,hidden=c(5,3),linear.output=TRUE)
plot(nn)
###Predictions
# Compute Predictions off Test Set
predicted.nn.values <- compute(nn,test[1:13])
# Convert back to non-scaled predictions
true.predictions <- predicted.nn.values$net.result*(max(data$medv)-min(data$medv))+min(data$medv)
# Convert the test data
test.r <- (test$medv)*(max(data$medv)-min(data$medv))+min(data$medv)
# Check the Mean Squared Error
MSE.nn <- sum((test.r - true.predictions)^2)/nrow(test)
MSE.nn
## [1] 13.37068543
#Visualize Error
error.df <- data.frame(test.r,true.predictions)
ggplot(error.df,aes(x=test.r,y=true.predictions)) + geom_point() + stat_smooth()
########
###Neural Netwrok Project
df <- read.csv('bank_note_data.csv')
head(df)
## Image.Var Image.Skew Image.Curt Entropy Class
## 1 3.62160 8.6661 -2.8073 -0.44699 0
## 2 4.54590 8.1674 -2.4586 -1.46210 0
## 3 3.86600 -2.6383 1.9242 0.10645 0
## 4 3.45660 9.5228 -4.0112 -3.59440 0
## 5 0.32924 -4.4552 4.5718 -0.98880 0
## 6 4.36840 9.6718 -3.9606 -3.16250 0
#TRAIN AND TEST SPLIT
set.seed(101)
split = sample.split(df$Class, SplitRatio = 0.70)
train = subset(df, split == TRUE)
test = subset(df, split == FALSE)
nn <- neuralnet(Class ~ Image.Var + Image.Skew + Image.Curt + Entropy,data=train,hidden=10,linear.output=FALSE)
predicted.nn.values <- compute(nn,test[,1:4])
head(predicted.nn.values$net.result)
## [,1]
## 3 0.000015994811962
## 11 0.000037138515521
## 12 0.000006944038291
## 13 0.000001799145605
## 14 0.000003469964563
## 17 0.000002957116883
#Apply the round function to the predicted values so you only 0s and 1s as your predicted classes.
predictions <- sapply(predicted.nn.values$net.result,round)
head(predictions)
## [1] 0 0 0 0 0 0
table(predictions,test$Class)
##
## predictions 0 1
## 0 229 0
## 1 0 183
##You should have noticed that you did very well! Almost suspiciously well! Let's check our results against a randomForest model!
##Comparing Models
df$Class <- factor(df$Class)
set.seed(101)
split = sample.split(df$Class, SplitRatio = 0.70)
train = subset(df, split == TRUE)
test = subset(df, split == FALSE)
model <- randomForest(Class ~ Image.Var + Image.Skew + Image.Curt + Entropy,data=train)
rf.pred <- predict(model,test)
table(rf.pred,test$Class)
##
## rf.pred 0 1
## 0 227 1
## 1 2 182
#Use predict() to get the predicted values from your rf model.
rf.pred <- predict(model,test)