load.libraries <- c('tidyverse', 'forcats', 'corrplot', 'caret', 'Metrics', 'randomForest', 'xgboost', 'glmnet', 'car')
# note: car is only needed for the Prestige demo dataset
install.lib <- load.libraries[!load.libraries %in% rownames(installed.packages())]
for(libs in install.lib) install.packages(libs, dependencies = TRUE)
sapply(load.libraries, library, character.only = TRUE)
knitr::opts_chunk$set(echo = TRUE)
# Arithmetic operations: R is a calculator
1 + 1
## [1] 2
10**2
## [1] 100
TRUE + TRUE
## [1] 2
Gotcha: R is not Python. The + operator only works on numeric (and logical) values, so string "addition" fails:
# "1" + "1" # Error: non-numeric argument to binary operator
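To concatenate strings in R, use paste() or paste0() instead (a quick illustration):
paste0("1", "1")
## [1] "11"
paste("one", "two", sep = " + ")
## [1] "one + two"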
Variables:
x <- 1 # the R "way" to do it ...
y = 2
print(x)
## [1] 1
print(y)
## [1] 2
Data types: vectors
a <- c(1,2,5.3,6,-2,4) # numeric vector
b <- c("one","two","three") # character vector
c <- c(TRUE,TRUE,TRUE,FALSE,TRUE,FALSE) #logical vector
print(paste(c(class(a), class(b), class(c))))
## [1] "numeric" "character" "logical"
Getting help and the combine function:
# getting help
?c
x <- c(1,2,3)
class(x)
## [1] "numeric"
y <- c(x, "2")
class(y) # c() coerces all elements to the most flexible type - here, character
## [1] "character"
Load the data. For the Kaggle housing data we would read it in from csv; here we use the Prestige dataset from the car package as a stand-in.
# read.csv("file")
myprestige <- Prestige
myprestige$job <- row.names(myprestige)
?Prestige
myprestige
head(myprestige)
Use the summary() and str() functions to explore the data.
summary(myprestige)
str(myprestige)
table(myprestige$type)
myprestige %>% ggplot(aes(x = income)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
myprestige %>% ggplot(aes(x = income)) + geom_histogram() + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
myprestige %>% ggplot(aes(x = income)) + geom_histogram(bins = 100) + theme_minimal()
myprestige %>% ggplot(aes(x = income)) + geom_histogram(bins = 100, aes(y = ..density..)) + theme_minimal() + geom_density(col = "red")
myprestige %>% ggplot(aes(x = income)) + geom_histogram(bins = 100, aes(y = ..density..)) + theme_minimal() + geom_density(col = "red") + stat_function(fun = dnorm, color = "blue", args = list(mean = mean(myprestige$income), sd = sd(myprestige$income)))
1. Plot a quantile-quantile plot (QQ plot) to “assess” normality.
Note: this plot compares the data we have (Sample Quantiles) against a theoretical sample from a normal distribution (Theoretical Quantiles). Each point (x, y) pairs a quantile of the theoretical distribution (x-coordinate) with the same quantile of our data (y-coordinate). The points therefore trace a parametric curve whose parameter is the quantile level; if the data are approximately normal, the points fall close to a straight line.
qqnorm(myprestige$income)
qqline(myprestige$income, col = "blue")
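Under the hood, qqnorm() is essentially doing the following (a minimal sketch, using ppoints() to pick the quantile levels):
probs <- ppoints(length(myprestige$income)) # evenly spaced quantile levels in (0, 1)
plot(qnorm(probs), sort(myprestige$income), xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")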
A standard way of transforming data so that it is better approximated by a normal distribution is the log-transform:
myprestige <- myprestige %>%
mutate(incomeLog = log(income + 1)) %>%
mutate(income = NULL)
# plot
myprestige %>% ggplot(aes(x = incomeLog)) + geom_histogram(bins = 100, aes(y = ..density..)) + theme_minimal() + geom_density(col = "red") + stat_function(fun = dnorm, color = "blue", args = list(mean = mean(myprestige$incomeLog), sd = sd(myprestige$incomeLog)))
qqnorm(myprestige$incomeLog)
qqline(myprestige$incomeLog, col = "blue")
What happens if we only use complete data? How much data is missing?
Topics used here (but not explored): subsetting data frames; the apply family.
dim(myprestige)
## [1] 102 7
dim(myprestige[complete.cases(myprestige), ])
## [1] 98 7
na.counts <- colSums(sapply(myprestige, is.na))
na.counts[na.counts > 0]
## type
## 4
We need to combine the datasets for imputation, so that we don’t have NAs in the test data as well!
Combine the testing and training data.
# not applicable to myprestige, but we need to demonstrate rbind() here
myprestige2 <- rbind(myprestige, myprestige)
How do we impute the missing data?
Explore the data using the table() function (variable by variable).
table(myprestige2$type, useNA = "always")
##
## bc prof wc <NA>
## 88 62 46 8
Read the metadata file: many of the NAs should be recoded as None, since those features are simply absent from the house.
Recode the NA values that should be None using mutate() and fct_explicit_na().
myprestige2 <- myprestige2 %>% mutate(type = fct_explicit_na(type, na_level = "Ot"))
For GarageYrBlt, set NA values to zero using replace_na().
# no demo here; an illustrative call (if there were still an income column with NAs):
myprestige2 <- myprestige2 %>% replace_na(list(income = 0))
For LotFrontage, set it to the median for the neighborhood using group_by() and mutate(); the imputation itself is sketched after the medians below.
# as a hint - use group_by() and mutate()
# will also need ifelse() function
myprestige2 %>% group_by(type) %>% summarise(incomeM = median(incomeLog))
## # A tibble: 4 x 2
## type incomeM
## <fct> <dbl>
## 1 bc 8.56
## 2 prof 9.09
## 3 wc 8.46
## 4 Ot 7.51
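The medians above are the values we would impute. A minimal sketch of the group-wise imputation on the Prestige analog (myprestige2imp is a scratch name, and incomeLog has no NAs here, so this is a no-op; for the housing data, substitute Neighborhood and LotFrontage):
myprestige2imp <- myprestige2 %>%
  group_by(type) %>%
  mutate(incomeLog = ifelse(is.na(incomeLog), median(incomeLog, na.rm = TRUE), incomeLog)) %>%
  ungroup()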
Split back into training (trainHC) and test (testHC) sets (because the Kaggle training set had prices and the test set didn't).
# no demo here
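# A minimal sketch of the split, assuming the training rows were stacked first by
# rbind(); original.train and combined are hypothetical names from the housing template:
# n.train <- nrow(original.train) # rows contributed by the training set
# trainHC <- combined[seq_len(n.train), ]
# testHC  <- combined[-seq_len(n.train), ]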
# no demo here, just ggplot pipes
# then make a dummy variable
myprestige2$nonsense <- myprestige2$women + myprestige2$education # a made-up derived feature (stands in for TotalSqFt)
Identify and remove outliers with high total square footage but a low price.
myprestige2 %>% arrange(desc(education)) %>% head()
## education women prestige census type job incomeLog
## 1 15.97 19.59 84.6 2711 prof university.teachers 9.431963
## 2 15.97 19.59 84.6 2711 prof university.teachers 9.431963
## 3 15.96 10.56 87.2 3111 prof physicians 10.138915
## 4 15.96 10.56 87.2 3111 prof physicians 10.138915
## 5 15.94 4.32 66.7 3115 prof veterinarians 9.585965
## 6 15.94 4.32 66.7 3115 prof veterinarians 9.585965
## nonsense
## 1 35.56
## 2 35.56
## 3 26.52
## 4 26.52
## 5 20.26
## 6 20.26
myprestige2 %>% arrange(desc(education)) %>% head() %>% select(education, incomeLog, women)
## education incomeLog women
## 1 15.97 9.431963 19.59
## 2 15.97 9.431963 19.59
## 3 15.96 10.138915 10.56
## 4 15.96 10.138915 10.56
## 5 15.94 9.585965 4.32
## 6 15.94 9.585965 4.32
myprestige2 %>% arrange(desc(education)) %>% head() %>% filter(education >= 15.96)
## education women prestige census type job incomeLog
## 1 15.97 19.59 84.6 2711 prof university.teachers 9.431963
## 2 15.97 19.59 84.6 2711 prof university.teachers 9.431963
## 3 15.96 10.56 87.2 3111 prof physicians 10.138915
## 4 15.96 10.56 87.2 3111 prof physicians 10.138915
## nonsense
## 1 35.56
## 2 35.56
## 3 26.52
## 4 26.52
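The removal itself is a single filter() call; a sketch on the Prestige analog (myprestige2noOut is a scratch name, and the thresholds are illustrative, not from the original analysis - pick cutoffs from the plots above):
myprestige2noOut <- myprestige2 %>% filter(!(education >= 15.96 & incomeLog < 9.5))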
Does having more bedrooms increase sale price?
myprestige2 %>% ggplot(aes(x=as.factor(type), y = incomeLog)) + geom_boxplot() + theme_minimal()
Visualise both number of bedrooms (as a factor) and TotalSqFt as a scatterplot to see if a trend is visible.
myprestige2 %>% ggplot(aes(x=women, y = incomeLog, colour = as.factor(type))) + geom_point() + theme_minimal() + guides(col=guide_legend(title="JobType"))
Are newer or more recently renovated properties more expensive?
# no code
Let's convert kitchen quality to numeric (we'll see why we need this later):
From the metadata we know its possible levels; for our stand-in variable type these are prof, wc, bc, and Ot.
Recode this to numeric values using mutate() and recode().
myprestige2 <- myprestige2 %>% mutate(type = dplyr::recode(type, `prof` = 4L, `wc` = 3L, `bc` = 2L, `Ot` = 1L))
summary(myprestige2$type)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.794 4.000 4.000
# no need for code
What variables are correlated with each other and with price?
myprestige2num <- myprestige2[ , sapply(myprestige2, is.numeric)]
# note: sig.level/insig only take effect when a p-value matrix is supplied via p.mat (see the sketch below)
corrplot(cor(myprestige2num, use="everything"), method="circle", type="lower", sig.level = 0.01, insig = "blank")
corrplot(cor(myprestige2num, use="everything"), method="number", type="lower", sig.level = 0.01, insig = "blank")
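A minimal sketch of supplying that p-value matrix with corrplot's cor.mtest() helper, so insignificant cells are actually blanked:
p.mat <- cor.mtest(myprestige2num)$p # pairwise correlation-test p-values
corrplot(cor(myprestige2num), method = "circle", type = "lower", p.mat = p.mat, sig.level = 0.01, insig = "blank")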
Use the createDataPartition() function to separate the training data into a training and testing subset, putting 50% of the data in each. Run set.seed(12) before this so the split is reproducible.
set.seed(12)
partitionD <- createDataPartition(y = myprestige2num$incomeLog, p = 0.5, list=FALSE)
myprestige2train <- myprestige2num[partitionD,]
myprestige2test <- myprestige2num[-partitionD,]
Fit a linear model considering the “top 10” correlated variables (really the top 9; ignore LogSalePrice for obvious reasons). Code the variables (column names) manually.
lm_myprestige1 <- lm(incomeLog ~ education, data=myprestige2train)
lm_myprestige2 <- lm(incomeLog ~ education + women + prestige, data=myprestige2train)
summary(lm_myprestige1)
##
## Call:
## lm(formula = incomeLog ~ education, data = myprestige2train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.08902 -0.23147 0.05233 0.33801 0.81662
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.31736 0.20527 35.65 < 2e-16 ***
## education 0.12562 0.01858 6.76 9.35e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5083 on 100 degrees of freedom
## Multiple R-squared: 0.3137, Adjusted R-squared: 0.3068
## F-statistic: 45.7 on 1 and 100 DF, p-value: 9.354e-10
summary(lm_myprestige2)
##
## Call:
## lm(formula = incomeLog ~ education + women + prestige, data = myprestige2train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.22509 -0.08212 0.06043 0.17939 0.44993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.783801 0.139658 55.735 < 2e-16 ***
## education -0.010393 0.023902 -0.435 0.665
## women -0.007319 0.001107 -6.614 1.98e-09 ***
## prestige 0.025594 0.003948 6.483 3.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3314 on 98 degrees of freedom
## Multiple R-squared: 0.714, Adjusted R-squared: 0.7052
## F-statistic: 81.55 on 3 and 98 DF, p-value: < 2.2e-16
prediction_lm1 <- predict(lm_myprestige1, myprestige2test, type="response")
prediction_lm2 <- predict(lm_myprestige2, myprestige2test, type="response")
# rmse?
rmse(myprestige2test$incomeLog, prediction_lm1)
## [1] 0.4814839
rmse(myprestige2test$incomeLog, prediction_lm2)
## [1] 0.2732863
All other models: work in the housing template / final housing template files. The code below is kept commented out for reference.
# randFor <- randomForest(LogSalePrice ~ ., data=trainHCtrain)
# # Predict using the test set
# prediction_rf <- predict(randFor, trainHCtest)
# trainHCtest$randFor <- prediction_rf
# # rmse?
# rmse(trainHCtest$LogSalePrice, trainHCtest$randFor)
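For a runnable miniature on the Prestige analog (a sketch only, using the train/test split we made above; randForDemo is a scratch name):
set.seed(12)
randForDemo <- randomForest(incomeLog ~ ., data = myprestige2train)
prediction_rf <- predict(randForDemo, myprestige2test)
rmse(myprestige2test$incomeLog, prediction_rf)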
# trainHCtrainNum <- as(as.matrix(trainHCtrain[ , sapply(trainHCtrain, is.numeric)]), "sparseMatrix")
# trainHCtestNum <- as(as.matrix(trainHCtest[ , sapply(trainHCtest, is.numeric)]), "sparseMatrix")
#
# trainD <- xgb.DMatrix(data = trainHCtrainNum, label = trainHCtrainNum[,"LogSalePrice"])
#
# # Cross-validate the model
# cv.sparse <- xgb.cv(data = trainD,
#                     nrounds = 600,
#                     min_child_weight = 0,
#                     max_depth = 10,
#                     eta = 0.02,
#                     subsample = .7,
#                     colsample_bytree = .7,
#                     booster = "gbtree",
#                     eval_metric = "rmse",
#                     verbose = TRUE,
#                     print_every_n = 50,
#                     nfold = 4,
#                     nthread = 2,
#                     objective = "reg:linear")
#
# # Train the model
# # Choose the parameters for the model
# param <- list(colsample_bytree = .7,
#               subsample = .7,
#               booster = "gbtree",
#               max_depth = 10,
#               eta = 0.02,
#               eval_metric = "rmse",
#               objective = "reg:linear")
#
# # Train the model using those parameters
# bstSparse <- xgb.train(params = param,
#                        data = trainD,
#                        nrounds = 600,
#                        watchlist = list(train = trainD),
#                        verbose = TRUE,
#                        print_every_n = 50,
#                        nthread = 2)
#
# testD <- xgb.DMatrix(data = trainHCtestNum)
#
# # Make the prediction based on the half of the training data set aside
# prediction <- predict(bstSparse, testD)
#
# # Put testing prediction and test dataset all together
# prediction <- as.data.frame(as.matrix(prediction))
# colnames(prediction) <- "xgboost"
# trainHCtest$xgboost <- prediction$xgboost
#
# # Test with RMSE
# rmse(trainHCtest$LogSalePrice, trainHCtest$xgboost)
#
# # Feature importance
# importance_matrix <- xgb.importance(dimnames(trainD)[[2]], model = bstSparse)
# xgb.plot.importance(importance_matrix[1:10])
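And a runnable miniature on the Prestige analog (a sketch only; note that recent xgboost releases use objective = "reg:squarederror" in place of the deprecated "reg:linear", and here the response column is excluded from the feature matrix, unlike in the template above):
predCols <- setdiff(names(myprestige2train), "incomeLog") # all numeric predictors
trainD2 <- xgb.DMatrix(data = as.matrix(myprestige2train[, predCols]), label = myprestige2train$incomeLog)
bstDemo <- xgb.train(params = list(objective = "reg:squarederror", eta = 0.1, max_depth = 4), data = trainD2, nrounds = 200, verbose = 0)
prediction_xgb <- predict(bstDemo, as.matrix(myprestige2test[, predCols]))
rmse(myprestige2test$incomeLog, prediction_xgb)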
1. Use the glmnet library to train a ridge regression model. 2. Is it more or less accurate than XGBoost?
# trainHCtrainNumMatrix <- as.matrix(trainHCtrain[ , sapply(trainHCtrain, is.numeric)])
# trainHCtestNumMatrix <- as.matrix(trainHCtest[ , sapply(trainHCtest, is.numeric)])
# # cross-validation for glmnet (alpha = 0 selects ridge; columns c(1:38, 40) drop the response column)
# glm.cv.ridge <- cv.glmnet(trainHCtrainNumMatrix[,c(1:38,40)], trainHCtrainNumMatrix[,"LogSalePrice"], alpha = 0)
# penalty.ridge <- glm.cv.ridge$lambda.min
# glm.ridge <- glmnet(x = trainHCtrainNumMatrix[,c(1:38,40)], y = trainHCtrainNumMatrix[,"LogSalePrice"], alpha = 0, lambda = penalty.ridge)
# y_pred.ridge <- as.numeric(predict(glm.ridge, trainHCtestNumMatrix[,c(1:38,40)]))
# rmse(trainHCtest$LogSalePrice, y_pred.ridge)
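A runnable miniature on the Prestige analog (a sketch; xPredTrain, xPredTest, and the .demo objects are scratch names):
xPredTrain <- as.matrix(myprestige2train[, setdiff(names(myprestige2train), "incomeLog")])
xPredTest <- as.matrix(myprestige2test[, setdiff(names(myprestige2test), "incomeLog")])
glm.cv.demo <- cv.glmnet(xPredTrain, myprestige2train$incomeLog, alpha = 0) # alpha = 0 -> ridge penalty
glm.ridge.demo <- glmnet(x = xPredTrain, y = myprestige2train$incomeLog, alpha = 0, lambda = glm.cv.demo$lambda.min)
y_pred.demo <- as.numeric(predict(glm.ridge.demo, xPredTest))
rmse(myprestige2test$incomeLog, y_pred.demo)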