library(ISLR)
library(tidyr)
library(fastDummies)
library(boot)
setwd("/Users/jianingjin/Desktop/IEMS_304/lab2/")
load("p4_AdData.RData")
#Q1
#original data processing
head(dataTrainAll)
#step1: transfer to dataframe
DataTrainingSet = as.data.frame(dataTrainAll)
#step 2: select features
DataTrainingSet = DataTrainingSet[c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form', 'Ad_Width', 'Ad_Height', 'Floor_Price', 'Click')]
#step 3: transfer categorical data to dummy variables
DataTrainingSet = dummy_cols(DataTrainingSet, c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form'), remove_first_dummy = TRUE) #k - 1 dummies
#delete original categorical data
delete = c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form')
DataTrainingSet <- DataTrainingSet[,!(names(DataTrainingSet) %in% delete)]
head(DataTrainingSet)
#step 4: standardize numerical data: transfer to z-score [(xi-x.bar)/sd]
process = scale(DataTrainingSet[, c("Ad_Width", "Ad_Height", "Floor_Price")])
head(process)
## Ad_Width Ad_Height Floor_Price
## [1,] 0.9635290 0.3312922 -0.3132385
## [2,] 1.9635194 0.3312922 0.2422080
## [3,] -0.9776289 0.3312922 1.0753778
## [4,] -0.9776289 -1.0242751 -0.5909618
## [5,] -0.6099853 0.3312922 -0.5909618
## [6,] -0.6099853 -1.0242751 -0.3132385
#delete & bind data
#turn factor; convert everything to factor first
Click = as.factor(
ifelse(DataTrainingSet$Click > 0, 1, 0))
dt1 = cbind(Click, DataTrainingSet)
dt2 = cbind(process, dt1)
dt3 = dt2[ , -(5:8)]
#final dataset
head(dt3)
#part a
#fit the model with all features
glm1 = glm(Click ~., data = dt3, family = "binomial")
summary(glm1)
##
## Call:
## glm(formula = Click ~ ., family = "binomial", data = dt3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1699 -0.0605 -0.0108 -0.0037 4.5123
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.898243 0.483507 -18.404 < 2e-16
## Ad_Width 3.412639 0.195381 17.467 < 2e-16
## Ad_Height 1.871926 0.098969 18.914 < 2e-16
## Floor_Price -0.007723 0.071290 -0.108 0.91373
## Region_3 0.412784 0.314360 1.313 0.18915
## Region_6 0.486677 0.219396 2.218 0.02654
## City_2 0.606609 0.275518 2.202 0.02769
## City_3 -0.495053 1.074131 -0.461 0.64488
## City_4 0.245179 0.341285 0.718 0.47251
## City_5 NA NA NA NA
## City_6 NA NA NA NA
## AdX_2 0.000774 0.172727 0.004 0.99642
## AdX_3 -0.048319 0.186479 -0.259 0.79555
## Domain_5KFUl5p0Gxsvgmd4wspENpn -0.508970 0.795444 -0.640 0.52227
## Domain_trqRTu5Jg9q9wMKYvmpENpn 0.127281 0.201414 0.632 0.52743
## Domain_trqRTudNXqN8ggc4JKTI 0.331080 0.184370 1.796 0.07254
## `Domain_trqRTuT-GNTYJNKbuKz` -1.616900 0.493389 -3.277 0.00105
## Key_Page_9f4e2f16b6873a7eb504df6f61b24044 -0.356007 0.185065 -1.924 0.05439
## Key_Page_df6f61b2409f4e2f16b6873a7eb50444 0.201390 0.172360 1.168 0.24264
## Ad_Vis_1 0.229510 0.193073 1.189 0.23455
## Ad_Vis_2 0.168321 0.164231 1.025 0.30541
## Ad_Form_1 -0.113022 0.156915 -0.720 0.47136
##
## (Intercept) ***
## Ad_Width ***
## Ad_Height ***
## Floor_Price
## Region_3
## Region_6 *
## City_2 *
## City_3
## City_4
## City_5
## City_6
## AdX_2
## AdX_3
## Domain_5KFUl5p0Gxsvgmd4wspENpn
## Domain_trqRTu5Jg9q9wMKYvmpENpn
## Domain_trqRTudNXqN8ggc4JKTI .
## `Domain_trqRTuT-GNTYJNKbuKz` **
## Key_Page_9f4e2f16b6873a7eb504df6f61b24044 .
## Key_Page_df6f61b2409f4e2f16b6873a7eb50444
## Ad_Vis_1
## Ad_Vis_2
## Ad_Form_1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3178.8 on 9999 degrees of freedom
## Residual deviance: 1314.1 on 9980 degrees of freedom
## AIC: 1354.1
##
## Number of Fisher Scoring iterations: 10
#part b
#according to the results, we could see that Ad_Width and Ad_Height play the most significant role with the least p-value. Therefore, we could reject the null hypothesis that they are not correlated. Also, they all have large coefficients: 3.41 and 1.87 respectively.
#part c
set.seed(1)
#1-feature model
glm.1f = glm(Click~Ad_Height, data = dt3, family = binomial)
cv.1f = cv.glm(dt3, glm.1f, K = 10)
error1 = cv.1f$delta
error1
## [1] 0.03263211 0.03263121
#5-feature model
glm.5f = glm(Click~Ad_Width + Ad_Height + `Domain_trqRTuT-GNTYJNKbuKz` + Region_6 + City_2, data = dt3, family = binomial)
cv.5f = cv.glm(dt3, glm.5f, K = 10)
error2 = cv.5f$delta
error2
## [1] 0.02009752 0.02009307
#10 feature model
glm.10f = glm(Click~Ad_Width + Ad_Height + `Domain_trqRTuT-GNTYJNKbuKz` + Region_6 + City_2 + Key_Page_9f4e2f16b6873a7eb504df6f61b24044 + Domain_trqRTudNXqN8ggc4JKTI + Region_3 + Key_Page_df6f61b2409f4e2f16b6873a7eb50444 + Ad_Vis_1, data = dt3, family = binomial)
cv.10f = cv.glm(dt3, glm.10f, K = 10)
error3 = cv.10f$delta
error3
## [1] 0.01992954 0.01991467
#15 feature model
glm.15f = glm(Click~Ad_Width + Ad_Height + `Domain_trqRTuT-GNTYJNKbuKz` + Region_6 + City_2 + Key_Page_9f4e2f16b6873a7eb504df6f61b24044 + Domain_trqRTudNXqN8ggc4JKTI + Region_3 + Key_Page_df6f61b2409f4e2f16b6873a7eb50444 + Ad_Vis_1 + Ad_Vis_2 + Ad_Form_1 + City_4 + Domain_5KFUl5p0Gxsvgmd4wspENpn + Domain_trqRTu5Jg9q9wMKYvmpENpn, data = dt3, family = binomial)
cv.15f = cv.glm(dt3, glm.15f, K = 10)
error4 = cv.15f$delta
error4
## [1] 0.02018861 0.02015957
#plot estimated testing error
cv.21f = cv.glm(dt3, glm1, K = 10)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
error5 = cv.21f$delta
x = c(1, 5, 10, 15, 21)
unadjuested.testing.error = c(error1[1], error2[1], error3[1], error4[1], error5[1])
plot(x, unadjuested.testing.error, type = "b", col = "black", xlab = "Number of Features", ylab = "Estimated Testing Error", main = "Unadjuested Estimated Testing Error")

adjuested.testing.error = c(error1[2], error2[2], error3[2], error4[2], error5[2])
plot(x, adjuested.testing.error, type = "b", col = "orange", xlab = "Number of Features", ylab = "Estimated Testing Error", main = "Adjuested Estimated Testing Error")

#because in the first try, I alter the data using the original dataset (without rename), therefore, when I change the dataset name and start my second try, some original features in the original dataset are changed forever. But the first try is not going wrong. Therefore, the following codes just can not work well but the outcome is right.
#WARNING! Rename when changing original dataset!!!
#part d
#step 1: transfer to dataframe
dataTest = data.frame(dataTest)
#step 2: select features
dataTest = dataTest[c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form', 'Ad_Width', 'Ad_Height', 'Floor_Price')]
#step 3: transfer categorical data to dummy variables
dataTest = dummy_cols(dataTest, c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form'), remove_first_dummy = TRUE) #k - 1 dummies
#delete original categorical data
delete = c('Region', 'City', 'AdX', 'Domain', 'Key_Page', 'Ad_Vis', 'Ad_Form')
dataTest = dataTest[,!(names(dataTest) %in% delete)]
head(dataTest)
#step 4: standardize numerical data: transfer to z-score [(xi-x.bar)/sd]
process.test = scale(dataTest[, c("Ad_Width", "Ad_Height", "Floor_Price")])
head(process.test)
## Ad_Width Ad_Height Floor_Price
## [1,] 0.9850930 -1.0200269 0.31708644
## [2,] -0.6028415 -1.0200269 -0.02784768
## [3,] -0.6028415 0.3723350 -0.31529278
## [4,] -0.7883479 0.3723350 2.27171311
## [5,] -0.7883479 0.3723350 -0.60273788
## [6,] -0.6028415 0.6334028 -0.02784768
#delete & bind data
dt.bind1 = cbind(process.test, dataTest)
head(dt.bind1)
testing.data = dt.bind1[ , -(4:6)]
#final dataset
head(testing.data)
#True Clicks (in dataTestRes)
TrueClicks = ifelse(dataTestRes$Click > 0, 1, 0)
test.predict10 = predict(glm.10f, testing.data, type = "response")
test.predict10.result = ifelse(test.predict10 > 0.5, 1, 0)
table(test.predict10.result, TrueClicks)
## TrueClicks
## test.predict10.result 0 1
## 0 9651 230
## 1 27 92
#misclassification rate
(27+230)/(9651+230+27+92)
## [1] 0.0257
#false positive
27/(27+9651)
## [1] 0.002789833
#false negative
230/(230+92)
## [1] 0.7142857
#result: the false negative rate is very high and the threshold should be lower.
#Q2
#part a
#Standardize numerical data
trainingset = scale(dataTrainAll[, c("AdX", "iPinYou_Bid", "Comp_Bid")])
head(trainingset)
## AdX iPinYou_Bid Comp_Bid
## 1 0.0480746 -1.0502765 -1.0245268
## 2 1.3198895 -0.9118871 -0.8622491
## 3 1.3198895 -0.7043028 -0.2942775
## 4 0.0480746 -0.6351081 -0.4565551
## 5 -1.2237403 2.7554343 3.3569689
## 6 0.0480746 -0.8426923 -1.0245268
#construct linear model
trainingset = as.data.frame(trainingset)
lm1 = lm(Comp_Bid~AdX+iPinYou_Bid, data = trainingset)
summary(lm1)
##
## Call:
## lm(formula = Comp_Bid ~ AdX + iPinYou_Bid, data = trainingset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.79794 -0.15590 0.04985 0.28925 2.26254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.819e-15 5.285e-03 0.00 1
## AdX -1.664e-01 5.705e-03 -29.18 <2e-16 ***
## iPinYou_Bid 7.722e-01 5.705e-03 135.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5285 on 9997 degrees of freedom
## Multiple R-squared: 0.7208, Adjusted R-squared: 0.7207
## F-statistic: 1.29e+04 on 2 and 9997 DF, p-value: < 2.2e-16
#part b
#iPinYou_Bid seems to be more important than AdX, because it's larger coefficient and absolute t-value. But both two features and significant enough to reject the null hypothesis.
#part c
glm.linear = glm(Comp_Bid~AdX+iPinYou_Bid, data = trainingset)
cv.lm1 = cv.glm(trainingset, glm.linear, K = 10)
cv.lm1$delta
## [1] 0.2796781 0.2796540