library(ISLR)
library(tidyr)
library(fastDummies)
library(boot)
setwd("/Users/jianingjin/Desktop/IEMS_304/lab2/")
load("p4_AdData.RData")
#Q1
#original data processing
head(dataTrainAll)
#step 1: convert to a data frame
DataTrainingSet = as.data.frame(dataTrainAll)
#step 2: select features
DataTrainingSet = DataTrainingSet[c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form', 'Ad_Width', 'Ad_Height', 'Floor_Price', 'Click')] 
#step 3: convert categorical variables to dummy variables
DataTrainingSet = dummy_cols(DataTrainingSet, c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form'), remove_first_dummy = TRUE) #k - 1 dummies
#delete original categorical data
delete = c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form')
DataTrainingSet <- DataTrainingSet[,!(names(DataTrainingSet) %in% delete)]
head(DataTrainingSet)
#step 4: standardize the numerical features as z-scores [(x_i - x.bar) / sd]
process = scale(DataTrainingSet[, c("Ad_Width", "Ad_Height", "Floor_Price")])
head(process)
##        Ad_Width  Ad_Height Floor_Price
## [1,]  0.9635290  0.3312922  -0.3132385
## [2,]  1.9635194  0.3312922   0.2422080
## [3,] -0.9776289  0.3312922   1.0753778
## [4,] -0.9776289 -1.0242751  -0.5909618
## [5,] -0.6099853  0.3312922  -0.5909618
## [6,] -0.6099853 -1.0242751  -0.3132385
#step 5: assemble the final training set
#convert the response Click to a 0/1 factor
Click = as.factor(ifelse(DataTrainingSet$Click > 0, 1, 0))
#bind the factor response and the scaled numeric columns,
#then drop the unscaled numeric columns and the original Click (columns 5-8 of dt2)
dt1 = cbind(Click, DataTrainingSet)
dt2 = cbind(process, dt1)
dt3 = dt2[ , -(5:8)]
#final dataset
head(dt3)
#part a
#fit the model with all features
glm1 = glm(Click ~., data =  dt3, family = "binomial")
summary(glm1)
## 
## Call:
## glm(formula = Click ~ ., family = "binomial", data = dt3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1699  -0.0605  -0.0108  -0.0037   4.5123  
## 
## Coefficients: (2 not defined because of singularities)
##                                            Estimate Std. Error z value Pr(>|z|)
## (Intercept)                               -8.898243   0.483507 -18.404  < 2e-16
## Ad_Width                                   3.412639   0.195381  17.467  < 2e-16
## Ad_Height                                  1.871926   0.098969  18.914  < 2e-16
## Floor_Price                               -0.007723   0.071290  -0.108  0.91373
## Region_3                                   0.412784   0.314360   1.313  0.18915
## Region_6                                   0.486677   0.219396   2.218  0.02654
## City_2                                     0.606609   0.275518   2.202  0.02769
## City_3                                    -0.495053   1.074131  -0.461  0.64488
## City_4                                     0.245179   0.341285   0.718  0.47251
## City_5                                           NA         NA      NA       NA
## City_6                                           NA         NA      NA       NA
## AdX_2                                      0.000774   0.172727   0.004  0.99642
## AdX_3                                     -0.048319   0.186479  -0.259  0.79555
## Domain_5KFUl5p0Gxsvgmd4wspENpn            -0.508970   0.795444  -0.640  0.52227
## Domain_trqRTu5Jg9q9wMKYvmpENpn             0.127281   0.201414   0.632  0.52743
## Domain_trqRTudNXqN8ggc4JKTI                0.331080   0.184370   1.796  0.07254
## `Domain_trqRTuT-GNTYJNKbuKz`              -1.616900   0.493389  -3.277  0.00105
## Key_Page_9f4e2f16b6873a7eb504df6f61b24044 -0.356007   0.185065  -1.924  0.05439
## Key_Page_df6f61b2409f4e2f16b6873a7eb50444  0.201390   0.172360   1.168  0.24264
## Ad_Vis_1                                   0.229510   0.193073   1.189  0.23455
## Ad_Vis_2                                   0.168321   0.164231   1.025  0.30541
## Ad_Form_1                                 -0.113022   0.156915  -0.720  0.47136
##                                              
## (Intercept)                               ***
## Ad_Width                                  ***
## Ad_Height                                 ***
## Floor_Price                                  
## Region_3                                     
## Region_6                                  *  
## City_2                                    *  
## City_3                                       
## City_4                                       
## City_5                                       
## City_6                                       
## AdX_2                                        
## AdX_3                                        
## Domain_5KFUl5p0Gxsvgmd4wspENpn               
## Domain_trqRTu5Jg9q9wMKYvmpENpn               
## Domain_trqRTudNXqN8ggc4JKTI               .  
## `Domain_trqRTuT-GNTYJNKbuKz`              ** 
## Key_Page_9f4e2f16b6873a7eb504df6f61b24044 .  
## Key_Page_df6f61b2409f4e2f16b6873a7eb50444    
## Ad_Vis_1                                     
## Ad_Vis_2                                     
## Ad_Form_1                                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3178.8  on 9999  degrees of freedom
## Residual deviance: 1314.1  on 9980  degrees of freedom
## AIC: 1354.1
## 
## Number of Fisher Scoring iterations: 10
#part b
#Ad_Width and Ad_Height are the most significant predictors: they have the smallest p-values (< 2e-16), so we reject the null hypothesis that their coefficients are zero. They also have the largest coefficients on the standardized scale, 3.41 and 1.87 respectively.
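#as a quick sketch (not part of the original analysis), this claim can be checked by sorting the coefficient table from summary(glm1) by p-value:
coefs = summary(glm1)$coefficients
head(coefs[order(coefs[, "Pr(>|z|)"]), ], 3)  #Ad_Width and Ad_Height sit at the top alongside the intercept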
#part c
set.seed(1)
#1-feature model
glm.1f = glm(Click~Ad_Height, data = dt3, family = binomial)
cv.1f = cv.glm(dt3, glm.1f, K = 10)
error1 = cv.1f$delta
error1
## [1] 0.03263211 0.03263121
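#note that cv.glm's default cost is the average squared error between the 0/1 response and the predicted probabilities, not the misclassification rate; a sketch of a 0-1 loss cost (following the example in ?cv.glm) would be:
cost01 = function(r, pi = 0) mean(abs(r - pi) > 0.5)  #r = observed 0/1, pi = predicted probability
cv.glm(dt3, glm.1f, cost = cost01, K = 10)$delta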
#5-feature model
glm.5f = glm(Click~Ad_Width + Ad_Height + `Domain_trqRTuT-GNTYJNKbuKz`  + Region_6 + City_2, data = dt3, family = binomial)
cv.5f = cv.glm(dt3, glm.5f, K = 10)
error2 = cv.5f$delta
error2
## [1] 0.02009752 0.02009307
#10-feature model
glm.10f = glm(Click~Ad_Width + Ad_Height + `Domain_trqRTuT-GNTYJNKbuKz`  + Region_6 + City_2 + Key_Page_9f4e2f16b6873a7eb504df6f61b24044 + Domain_trqRTudNXqN8ggc4JKTI + Region_3 + Key_Page_df6f61b2409f4e2f16b6873a7eb50444 + Ad_Vis_1, data = dt3, family = binomial)
cv.10f = cv.glm(dt3, glm.10f, K = 10)
error3 = cv.10f$delta
error3
## [1] 0.01992954 0.01991467
#15-feature model
glm.15f = glm(Click~Ad_Width + Ad_Height + `Domain_trqRTuT-GNTYJNKbuKz`  + Region_6 + City_2 + Key_Page_9f4e2f16b6873a7eb504df6f61b24044 + Domain_trqRTudNXqN8ggc4JKTI + Region_3 + Key_Page_df6f61b2409f4e2f16b6873a7eb50444 + Ad_Vis_1 + Ad_Vis_2 + Ad_Form_1 + City_4 + Domain_5KFUl5p0Gxsvgmd4wspENpn + Domain_trqRTu5Jg9q9wMKYvmpENpn, data = dt3, family = binomial)
cv.15f = cv.glm(dt3, glm.15f, K = 10)
error4 = cv.15f$delta
error4
## [1] 0.02018861 0.02015957
#21-feature model (all features)
cv.21f = cv.glm(dt3, glm1, K = 10)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#(the identical warning was emitted repeatedly during cross-validation; glm1 is rank-deficient because City_5 and City_6 are aliased)
error5 = cv.21f$delta
#plot estimated testing error vs. number of features
x = c(1, 5, 10, 15, 21)
unadjusted.testing.error = c(error1[1], error2[1], error3[1], error4[1], error5[1])
plot(x, unadjusted.testing.error, type = "b", col = "black", xlab = "Number of Features", ylab = "Estimated Testing Error", main = "Unadjusted Estimated Testing Error")

adjusted.testing.error = c(error1[2], error2[2], error3[2], error4[2], error5[2])
plot(x, adjusted.testing.error, type = "b", col = "orange", xlab = "Number of Features", ylab = "Estimated Testing Error", main = "Adjusted Estimated Testing Error")

#Note: on my first run I modified the original dataset in place (without renaming it), so some of its original columns were permanently altered. The results reported above come from that first, correct run; rerunning the code on the altered object may therefore not work cleanly, even though the reported output is right.
#WARNING: rename (work on a copy) when modifying an original dataset!
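#a minimal sketch of the safer workflow (dataTrainRaw and dataTrainWork are hypothetical names):
dataTrainRaw = dataTrainAll                    #keep an untouched copy of the loaded object
dataTrainWork = as.data.frame(dataTrainRaw)    #do all edits on the working copy only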
#part d
#step 1: convert to a data frame
dataTest = data.frame(dataTest)
#step 2: select features
dataTest = dataTest[c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form', 'Ad_Width', 'Ad_Height', 'Floor_Price')] 
#step 3: convert categorical variables to dummy variables
dataTest = dummy_cols(dataTest, c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form'), remove_first_dummy = TRUE) #k - 1 dummies
#delete original categorical data
delete = c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form')
dataTest = dataTest[,!(names(dataTest) %in% delete)]
head(dataTest)
#step 4: standardize the numerical features as z-scores [(x_i - x.bar) / sd]
process.test = scale(dataTest[, c("Ad_Width", "Ad_Height", "Floor_Price")])
head(process.test)
##        Ad_Width  Ad_Height Floor_Price
## [1,]  0.9850930 -1.0200269  0.31708644
## [2,] -0.6028415 -1.0200269 -0.02784768
## [3,] -0.6028415  0.3723350 -0.31529278
## [4,] -0.7883479  0.3723350  2.27171311
## [5,] -0.7883479  0.3723350 -0.60273788
## [6,] -0.6028415  0.6334028 -0.02784768
#bind the scaled columns and drop the unscaled originals
dt.bind1 = cbind(process.test, dataTest)
head(dt.bind1)
testing.data = dt.bind1[ , -(4:6)]
#final dataset
head(testing.data)
#True Clicks (in dataTestRes)
TrueClicks = ifelse(dataTestRes$Click > 0, 1, 0)
test.predict10 = predict(glm.10f, testing.data, type = "response")
test.predict10.result = ifelse(test.predict10 > 0.5, 1, 0)
table(test.predict10.result, TrueClicks)
##                      TrueClicks
## test.predict10.result    0    1
##                     0 9651  230
##                     1   27   92
#misclassification rate
(27+230)/(9651+230+27+92)
## [1] 0.0257
#false positive rate
27/(27+9651)
## [1] 0.002789833
#false negative rate
230/(230+92)
## [1] 0.7142857
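#the same rates can be read directly off the confusion matrix (a small sketch):
cm = table(test.predict10.result, TrueClicks)
c(misclass = (cm[1, 2] + cm[2, 1]) / sum(cm),
  false.pos = cm[2, 1] / sum(cm[, 1]),
  false.neg = cm[1, 2] / sum(cm[, 2]))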
#result: the false negative rate is very high (about 71%), so the classification threshold should probably be lowered below 0.5.
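#a quick sketch of a lower cutoff (0.1 is an arbitrary illustrative choice, not a tuned value):
test.predict10.low = ifelse(test.predict10 > 0.1, 1, 0)
table(test.predict10.low, TrueClicks)  #fewer false negatives at the cost of more false positives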
#Q2
#part a
#Standardize numerical data
trainingset = scale(dataTrainAll[, c("AdX", "iPinYou_Bid", "Comp_Bid")])
head(trainingset)
##          AdX iPinYou_Bid   Comp_Bid
## 1  0.0480746  -1.0502765 -1.0245268
## 2  1.3198895  -0.9118871 -0.8622491
## 3  1.3198895  -0.7043028 -0.2942775
## 4  0.0480746  -0.6351081 -0.4565551
## 5 -1.2237403   2.7554343  3.3569689
## 6  0.0480746  -0.8426923 -1.0245268
#construct linear model
trainingset = as.data.frame(trainingset)
lm1 = lm(Comp_Bid~AdX+iPinYou_Bid, data = trainingset)
summary(lm1)
## 
## Call:
## lm(formula = Comp_Bid ~ AdX + iPinYou_Bid, data = trainingset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.79794 -0.15590  0.04985  0.28925  2.26254 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.819e-15  5.285e-03    0.00        1    
## AdX         -1.664e-01  5.705e-03  -29.18   <2e-16 ***
## iPinYou_Bid  7.722e-01  5.705e-03  135.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5285 on 9997 degrees of freedom
## Multiple R-squared:  0.7208, Adjusted R-squared:  0.7207 
## F-statistic: 1.29e+04 on 2 and 9997 DF,  p-value: < 2.2e-16
#part b
#iPinYou_Bid appears more important than AdX: it has the larger standardized coefficient (0.77 vs. -0.17 in absolute value) and the larger absolute t-value (135.4 vs. 29.2). Both features are nonetheless significant enough to reject the null hypothesis that their coefficients are zero.
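#a small sketch backing this up: confidence intervals for the standardized coefficients
confint(lm1)  #iPinYou_Bid's interval excludes 0 and is much larger in magnitude than AdX's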
#part c
#refit the same model as a glm (gaussian family by default) so that cv.glm can be used
glm.linear = glm(Comp_Bid~AdX+iPinYou_Bid, data = trainingset)
cv.lm1 = cv.glm(trainingset, glm.linear, K = 10)
cv.lm1$delta
## [1] 0.2796781 0.2796540
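#since Comp_Bid is standardized (sample variance 1), 1 minus the CV MSE gives a rough out-of-sample R^2
1 - cv.lm1$delta[1]  #about 0.72, consistent with the in-sample R-squared above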