## Load Prestige data from the "car" package
library(car)
## Loading required package: carData
lm.fit <- lm(prestige ~ education, data = Prestige)
plot(lm.fit) # Produces the four standard diagnostic plots: residuals vs. fitted, normal Q-Q, scale-location, residuals vs. leverage
## Shapiro test of normality
shapiro.test(Prestige$prestige)
##
## Shapiro-Wilk normality test
##
## data: Prestige$prestige
## W = 0.97198, p-value = 0.02875
## Non-constant variance test (for the variable "education")
ncvTest(lm.fit, ~ education)
## Non-constant Variance Score Test
## Variance formula: ~ education
## Chisquare = 0.6327545, Df = 1, p = 0.42635
## The result doesn't violate the constant-variance assumption for "education", but the Shapiro-Wilk test above (p = 0.029) suggests "prestige" departs from normality and deserves a second look.
Testing for multicollinearity (near-linear combinations among predictors) using the Variance Inflation Factor (VIF), \(\mathrm{VIF}_k = \frac{1}{1 - R^{2}_{k}}\), and tolerance, \(1 - R^{2}_{k}\), where \(R^{2}_{k}\) comes from regressing predictor \(k\) on the remaining predictors.
# Require olsrr package
# install.packages("olsrr")
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
data(mtcars)
model <- lm(mpg ~ disp + hp + wt + qsec, data = mtcars)
## Tolerance: 1 - R^2_k, the share of variance in a predictor that cannot be accounted for by the other predictors
## VIF itself is expressed as 1/tolerance
## VIF >= 4: showing signs of collinearity, VIF >= 10: high collinearity, VIF = 1: no collinearity
## Which variable(s) exhibit collinearity?
ols_vif_tol(model)
## Variables Tolerance VIF
## 1 disp 0.1252279 7.985439
## 2 hp 0.1935450 5.166758
## 3 wt 0.1445726 6.916942
## 4 qsec 0.3191708 3.133119
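As a quick sanity check (my own sketch, not part of the olsrr output), the VIF of a single predictor can be reproduced by regressing it on the remaining predictors and applying the formula above:
# Reproduce the VIF of disp by hand
aux <- lm(disp ~ hp + wt + qsec, data = mtcars)  # regress disp on the other predictors
r2 <- summary(aux)$r.squared                     # R^2 for disp
1 / (1 - r2)                                     # VIF: should match the 7.985 reported above
1 - r2                                           # tolerance for disp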
# Finding the largest source of collinearity: eigenvalue
ols_eigen_cindex(model)
## Eigenvalue Condition Index intercept disp hp wt
## 1 4.721487187 1.000000 0.000123237 0.001132468 0.001413094 0.0005253393
## 2 0.216562203 4.669260 0.002617424 0.036811051 0.027751289 0.0002096014
## 3 0.050416837 9.677242 0.001656551 0.120881424 0.392366164 0.0377028008
## 4 0.010104757 21.616057 0.025805998 0.777260487 0.059594623 0.7017528428
## 5 0.001429017 57.480524 0.969796790 0.063914571 0.518874831 0.2598094157
## qsec
## 1 0.0001277169
## 2 0.0046789491
## 3 0.0001952599
## 4 0.0024577686
## 5 0.9925403056
##### How to read this table #####
# Each row represents the eigenvalue of a unique linear combination of variables
# Columns 3-7 are variance-decomposition proportions: each entry is the share of the variance of that coefficient (column) associated with that dimension (row). Two or more proportions > .5 in the same row signal a collinearity problem. m[4, 4], m[5, 5], m[4, 6], and m[5, 7] are suspicious!
# Condition index: sqrt(max eigenvalue / eigenvalue_i). Condition index > 30 indicates high collinearity in the data
# Collinearity becomes prominent from the 4th condition index onward
# Do you know how to calculate the "Condition Index"?
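To answer the question above, here is a minimal sketch (my own, assuming olsrr scales the columns of the model matrix, intercept included, to unit length as in Belsley's diagnostics) that reproduces the condition indices by hand:
X <- model.matrix(model)                           # design matrix with intercept
Xs <- apply(X, 2, function(v) v / sqrt(sum(v^2)))  # scale each column to unit length
eig <- eigen(crossprod(Xs))$values                 # eigenvalues of the scaled X'X
sqrt(max(eig) / eig)                               # condition index of each dimension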
Visualizing the effect of \(X_1\) on \(Y\) conditional on \(X_2\). In some cases the effect of \(X_2\) (or \(X_1\)) on \(Y\) is non-linear or non-additive, for example when \(X_2\) is not a continuous variable. Think of income bracket (ordinal scale), GPA scale, or country/region (categorical variable).
## Install the effects and other required packages
# install.packages("effects")
library(effects)
## Warning: package 'effects' was built under R version 4.4.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(car)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
##
## cement
library(splines)
library(ISLR)
## Import Carseats data from ISLR package
data(Carseats)
## Check out variable names
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
## Different ways to specify an interaction term
# If you enter x1*x2, R expands it to x1 + x2 + x1:x2
# x1:x2 adds only the interaction term itself (for numeric variables, the product of x1 and x2); the main effects are not included
# You may also create the interaction manually, e.g., assign x12 <- x1*x2 to the dataframe and include it alongside x1 and x2 in your model; this maps each observation's x1 and x2 values directly onto the product term
# We examine the differences below
fit1 <- lm(Sales ~ Income + Price + Advertising + Education + Education:Income, data = Carseats)
summary(fit1)
##
## Call:
## lm(formula = Sales ~ Income + Price + Advertising + Education +
## Education:Income, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3040 -1.5650 -0.0339 1.5518 6.5842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.5646255 1.7817805 7.613 2.00e-13 ***
## Income -0.0026570 0.0227588 -0.117 0.907
## Price -0.0535060 0.0050812 -10.530 < 2e-16 ***
## Advertising 0.1195676 0.0180238 6.634 1.08e-10 ***
## Education -0.1020783 0.1220535 -0.836 0.403
## Income:Education 0.0009845 0.0016236 0.606 0.545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 394 degrees of freedom
## Multiple R-squared: 0.2954, Adjusted R-squared: 0.2865
## F-statistic: 33.04 on 5 and 394 DF, p-value: < 2.2e-16
fit2 <- lm(Sales ~ Income + Price + Advertising + Education + Education*Income, data = Carseats)
summary(fit2) # R knows it is redundant to list Income and Education again when Income*Education already includes them, so each main effect is reported only ONCE. Compare with the output of fit3
##
## Call:
## lm(formula = Sales ~ Income + Price + Advertising + Education +
## Education * Income, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3040 -1.5650 -0.0339 1.5518 6.5842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.5646255 1.7817805 7.613 2.00e-13 ***
## Income -0.0026570 0.0227588 -0.117 0.907
## Price -0.0535060 0.0050812 -10.530 < 2e-16 ***
## Advertising 0.1195676 0.0180238 6.634 1.08e-10 ***
## Education -0.1020783 0.1220535 -0.836 0.403
## Income:Education 0.0009845 0.0016236 0.606 0.545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 394 degrees of freedom
## Multiple R-squared: 0.2954, Adjusted R-squared: 0.2865
## F-statistic: 33.04 on 5 and 394 DF, p-value: < 2.2e-16
fit3 <- lm(Sales ~ Price + Advertising + Education*Income, data = Carseats)
summary(fit3)
##
## Call:
## lm(formula = Sales ~ Price + Advertising + Education * Income,
## data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3040 -1.5650 -0.0339 1.5518 6.5842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.5646255 1.7817805 7.613 2.00e-13 ***
## Price -0.0535060 0.0050812 -10.530 < 2e-16 ***
## Advertising 0.1195676 0.0180238 6.634 1.08e-10 ***
## Education -0.1020783 0.1220535 -0.836 0.403
## Income -0.0026570 0.0227588 -0.117 0.907
## Education:Income 0.0009845 0.0016236 0.606 0.545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 394 degrees of freedom
## Multiple R-squared: 0.2954, Adjusted R-squared: 0.2865
## F-statistic: 33.04 on 5 and 394 DF, p-value: < 2.2e-16
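A quick check (my own sketch) that fit1, fit2, and fit3 are just different ways of writing the same model: their fitted values are identical.
all.equal(fitted(fit1), fitted(fit2))  # TRUE: identical fits
all.equal(fitted(fit1), fitted(fit3))  # TRUE: only the order of terms differs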
# Now we formally fit the model and feed the output into the Effect() function for diagnostics.
car.fit <- lm(Sales ~ Income + Price + Advertising + Education + Education:Income, data = Carseats)
summary(car.fit)
##
## Call:
## lm(formula = Sales ~ Income + Price + Advertising + Education +
## Education:Income, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3040 -1.5650 -0.0339 1.5518 6.5842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.5646255 1.7817805 7.613 2.00e-13 ***
## Income -0.0026570 0.0227588 -0.117 0.907
## Price -0.0535060 0.0050812 -10.530 < 2e-16 ***
## Advertising 0.1195676 0.0180238 6.634 1.08e-10 ***
## Education -0.1020783 0.1220535 -0.836 0.403
## Income:Education 0.0009845 0.0016236 0.606 0.545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.386 on 394 degrees of freedom
## Multiple R-squared: 0.2954, Adjusted R-squared: 0.2865
## F-statistic: 33.04 on 5 and 394 DF, p-value: < 2.2e-16
## Check out the interaction effect between Education and Income
## Use the focal.predictors argument to select the variables whose interaction effect you want to evaluate
# This produces an interaction effect matrix
Effect(focal.predictors = c("Education", "Income"), car.fit)
##
## Education*Income effect
## Income
## Education 21 46 70 95 120
## 10 7.292383 7.472072 7.644573 7.824262 8.003951
## 12 7.129573 7.358485 7.578240 7.807151 8.036063
## 14 6.966764 7.244898 7.511907 7.790041 8.068175
## 16 6.803954 7.131311 7.445573 7.772930 8.100286
## 18 6.641145 7.017724 7.379240 7.755819 8.132398
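To see what Effect() is computing here (my own sketch): in a linear model, each cell is the fitted value at the given Education and Income, with the other predictors (Price and Advertising) held at roughly their sample means.
b <- coef(car.fit)
b["(Intercept)"] + b["Income"]*21 + b["Price"]*mean(Carseats$Price) +
  b["Advertising"]*mean(Carseats$Advertising) + b["Education"]*10 +
  b["Income:Education"]*10*21
# should be close to the top-left cell of the table above (about 7.29)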
## Plotting
## rug argument displays ticks at the bottom x-axis of the plot, revealing the density distribution of the data
plot(Effect(focal.predictors = c("Education", "Income"), car.fit), rug = TRUE, main = "Effect of Education on Sales moderated by Income level")
# See? The effect of Education on Sales varies by Income level: as Income rises, the effect of Education turns from negative to slightly positive.
## Sometimes the effect of a variable is not additive because it's not a continuous variable.
## Load World Values Survey (WVS) data, the data comes from the effects package
head(WVS)
## poverty religion degree country age gender
## 1 Too Little yes no USA 44 male
## 2 About Right yes no USA 40 female
## 3 Too Little yes no USA 36 female
## 4 Too Much yes yes USA 25 female
## 5 Too Little yes yes USA 39 male
## 6 About Right yes no USA 80 female
# polr() fits a regression model to an ordered (ordinal) factor response (poverty).
wvs.fit <- polr(poverty ~ country*(gender + religion + degree + age), data = WVS)
## Pay attention to the country-specific "fixed effects." How does the relationship between age and poverty vary by "country"? What might cause this variation at country level?
plot(Effect(focal.predictors = c("age","country"), wvs.fit), rug = TRUE, main = "The effect of Age on Poverty by Country")
##
## Re-fitting to get Hessian
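To check whether the country interactions are jointly worthwhile (my own sketch, not part of the original analysis), car's Anova() provides likelihood-ratio tests for each term of the polr fit:
Anova(wvs.fit)  # the country:age row tests whether the age effect really varies by country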
Principal component analysis (PCA): techniques and visualization methods
# install.packages("factoextra")
# install.packages("stats")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(stats)
## prcomp uses the singular value decomposition (SVD), while princomp uses the spectral decomposition method
## A quick first cut on the difference between the two is that SVD is defined for all matrices (rectangular or square), while the more commonly used spectral decomposition only applies to square matrices
data(mtcars)
names(mtcars) ## Check if the data contains non-numerical values or indicator variables, which cannot be used in PCA
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
prcomp.pca <- prcomp(mtcars, scale = TRUE) ## Whether the variables should be scaled to have unit variance, note that PCA is VERY sensitive to scaling.
class(prcomp.pca)
## [1] "prcomp"
princomp.pca <- princomp(mtcars, cor = TRUE, scores = TRUE) ## Whether the data should be scaled (cor) and have the score of each principal component (PC) calculated.
class(princomp.pca)
## [1] "princomp"
## Use factoextra for plotting and adding PC scores; now let the "scree plot" suggest how many PCs to retain...
fviz_eig(prcomp.pca, addlabels = TRUE)
# Do the top 2 PCs explain more than 50% of the variance in the data?
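To answer this numerically instead of reading it off the scree plot (my own sketch), the proportion of variance explained by each PC is its squared standard deviation divided by the total:
pve <- prcomp.pca$sdev^2 / sum(prcomp.pca$sdev^2)  # proportion of variance explained per PC
round(pve, 3)
sum(pve[1:2])  # cumulative share of the first two PCs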
## Projecting the PCs, positively correlated variables point to the same side of the plot, while negatively correlated variables point to the opposite sides of the plot
fviz_pca_var(prcomp.pca, col.var="contrib", # color intensity indicates the contribution of each variable to a PC (in lieu of percentage of variance explained)
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), # set color scale
repel = TRUE # Avoid text overlapping
)
## Contribution of each variable to the top 2 PCs
fviz_contrib(prcomp.pca, choice = "var", axes = 1, top = 10) # largest PC (axes=1)
fviz_contrib(prcomp.pca, choice = "var", axes = 2, top = 10) # second largest PC (axes=2)
# The rank of each variable's contribution to the second PC should be in reverse order to its contribution to PC1. (Why?)
## There are easier ways to do this, for example, by using the corrplot package
#install.packages("corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
data(mtcars)
corr <- cor(mtcars) ## Create correlation matrix of the data
par(ask = TRUE) ## Ask for confirmation before each new plot is drawn
## Plot correlation scores only on the lower triangular matrix and choose your preferred color scheme
corrplot(corr, method="color", order = "original", addCoef.col = grey(0.1), type = "lower", title = "Visualizing correlation matrix of mtcars data") ## Plotting
cor.mtest(corr) ## p-values and confidence intervals for the correlation test of each pair of columns. Note that cor.mtest() expects the raw data matrix (e.g., mtcars); here it is applied to the correlation matrix itself, so the tests below describe correlations among the columns of corr rather than among the original variables.
## $p
## mpg cyl disp hp drat
## mpg 0.000000e+00 4.037623e-09 1.091445e-09 4.322152e-06 1.780708e-05
## cyl 4.037623e-09 0.000000e+00 1.659424e-09 5.625666e-07 5.174268e-05
## disp 1.091445e-09 1.659424e-09 0.000000e+00 1.304240e-05 4.935086e-06
## hp 4.322152e-06 5.625666e-07 1.304240e-05 0.000000e+00 2.049276e-03
## drat 1.780708e-05 5.174268e-05 4.935086e-06 2.049276e-03 0.000000e+00
## wt 2.518729e-08 1.320552e-06 1.901561e-08 1.787328e-04 3.782478e-07
## qsec 1.471451e-02 5.949087e-03 1.829871e-02 3.509603e-04 1.426220e-01
## vs 3.020659e-05 2.026977e-06 3.450779e-05 3.690380e-08 3.468419e-03
## am 1.704116e-03 4.379896e-03 1.122485e-03 3.593445e-02 1.229604e-05
## gear 5.780693e-03 9.922460e-03 3.028291e-03 7.040180e-02 6.018579e-05
## carb 3.193816e-03 2.006581e-03 6.969729e-03 4.718601e-05 6.998155e-02
## wt qsec vs am gear
## mpg 2.518729e-08 1.471451e-02 3.020659e-05 1.704116e-03 5.780693e-03
## cyl 1.320552e-06 5.949087e-03 2.026977e-06 4.379896e-03 9.922460e-03
## disp 1.901561e-08 1.829871e-02 3.450779e-05 1.122485e-03 3.028291e-03
## hp 1.787328e-04 3.509603e-04 3.690380e-08 3.593445e-02 7.040180e-02
## drat 3.782478e-07 1.426220e-01 3.468419e-03 1.229604e-05 6.018579e-05
## wt 0.000000e+00 5.627606e-02 4.608706e-04 1.431601e-04 7.977054e-04
## qsec 5.627606e-02 0.000000e+00 1.673365e-04 5.473798e-01 6.815004e-01
## vs 4.608706e-04 1.673365e-04 0.000000e+00 5.385274e-02 8.393306e-02
## am 1.431601e-04 5.473798e-01 5.385274e-02 0.000000e+00 1.702701e-07
## gear 7.977054e-04 6.815004e-01 8.393306e-02 1.702701e-07 0.000000e+00
## carb 1.713296e-02 8.438162e-06 9.355873e-05 3.055664e-01 4.848884e-01
## carb
## mpg 3.193816e-03
## cyl 2.006581e-03
## disp 6.969729e-03
## hp 4.718601e-05
## drat 6.998155e-02
## wt 1.713296e-02
## qsec 8.438162e-06
## vs 9.355873e-05
## am 3.055664e-01
## gear 4.848884e-01
## carb 0.000000e+00
##
## $lowCI
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.9976823 -0.9982698 -0.9888091 0.7778575 -0.9965075
## cyl -0.9976823 1.0000000 0.9700514 0.8931885 -0.9801076 0.8717271
## disp -0.9982698 0.9700514 1.0000000 0.7918066 -0.9884636 0.9488324
## hp -0.9888091 0.8931885 0.7918066 1.0000000 -0.9514038 0.6445032
## drat 0.7778575 -0.9801076 -0.9884636 -0.9514038 1.0000000 -0.9935715
## wt -0.9965075 0.8717271 0.9488324 0.6445032 -0.9935715 1.0000000
## qsec 0.1885083 -0.9359355 -0.9129200 -0.9686117 -0.1782837 -0.8786800
## vs 0.7521771 -0.9905881 -0.9819081 -0.9961948 0.3723792 -0.9664627
## am 0.4497653 -0.9408983 -0.9582352 -0.8941274 0.7943524 -0.9746951
## gear 0.3115428 -0.9265023 -0.9462972 -0.8698401 0.7145584 -0.9616299
## carb -0.9455549 0.4327021 0.2881435 0.7283968 -0.8700876 0.1669910
## qsec vs am gear carb
## mpg 0.1885083 0.752177112 0.449765308 0.31154285 -0.9455549
## cyl -0.9359355 -0.990588096 -0.940898308 -0.92650229 0.4327021
## disp -0.9129200 -0.981908097 -0.958235185 -0.94629724 0.2881435
## hp -0.9686117 -0.996194795 -0.894127367 -0.86984015 0.7283968
## drat -0.1782837 0.372379155 0.794352378 0.71455837 -0.8700876
## wt -0.8786800 -0.966462744 -0.974695144 -0.96162988 0.1669910
## qsec 1.0000000 0.649145621 -0.451068301 -0.50207431 -0.9869495
## vs 0.6491456 1.000000000 -0.008706491 -0.08354933 -0.9771268
## am -0.4510683 -0.008706491 1.000000000 0.91749704 -0.7808657
## gear -0.5020743 -0.083549333 0.917497040 1.00000000 -0.7321880
## carb -0.9869495 -0.977126817 -0.780865674 -0.73218796 1.0000000
##
## $uppCI
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.9635787 -0.9726924 -0.83492892 0.98449032 -0.94558734
## cyl -0.9635787 1.0000000 0.9981001 0.99296695 -0.72322813 0.99146342
## disp -0.9726924 0.9981001 1.0000000 0.98556954 -0.83023726 0.99672088
## hp -0.8349289 0.9929670 0.9855695 1.00000000 -0.43047203 0.97331807
## drat 0.9844903 -0.7232281 -0.8302373 -0.43047203 1.00000000 -0.90194717
## wt -0.9455873 0.9914634 0.9967209 0.97331807 -0.90194717 1.00000000
## qsec 0.9180840 -0.3079903 -0.1575524 -0.59373847 0.83538327 0.01595598
## vs 0.9824617 -0.8594408 -0.7452884 -0.94085090 0.94438092 -0.57151075
## am 0.9536222 -0.3451431 -0.4914487 -0.05617533 0.98576482 -0.65992608
## gear 0.9364210 -0.2422846 -0.3877983 0.05343045 0.97938823 -0.52355311
## carb -0.3817880 0.9516630 0.9331780 0.98053328 0.05241567 0.91452061
## qsec vs am gear carb
## mpg 0.91808400 0.9824617 0.95362221 0.93642103 -0.38178798
## cyl -0.30799033 -0.8594408 -0.34514312 -0.24228456 0.95166303
## disp -0.15755245 -0.7452884 -0.49144868 -0.38779827 0.93317802
## hp -0.59373847 -0.9408509 -0.05617533 0.05343045 0.98053328
## drat 0.83538327 0.9443809 0.98576482 0.97938823 0.05241567
## wt 0.01595598 -0.5715107 -0.65992608 -0.52355311 0.91452061
## qsec 1.00000000 0.9737351 0.71623124 0.68252616 -0.80994159
## vs 0.97373509 1.0000000 0.88032208 0.86227799 -0.68782370
## am 0.71623124 0.8803221 1.00000000 0.99463195 0.32597570
## gear 0.68252616 0.8622780 0.99463195 1.00000000 0.42393232
## carb -0.80994159 -0.6878237 0.32597570 0.42393232 1.00000000
library(ISLR)
library(caret) # A package by Kuhn and Johnson, useful for partitioning data
## Loading required package: lattice
# Select complete cases only
data(airquality)
# Check out the original data dimension
dim(airquality)
## [1] 153 6
airq <- complete.cases(airquality) ## complete.cases() returns a logical vector with one entry per row: TRUE if that row has no missing value in any column
names(airq)
## NULL
# What does "NULL" tell us? airq is a plain, unnamed logical vector (not a dataframe), so it has no column names. To keep the complete rows we still need to use it as a row index (see below).
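A quick look at what we actually got (my own sketch):
head(airq)   # logical vector: TRUE where the row is complete
sum(airq)    # number of complete rows
sum(!airq)   # number of rows with at least one NA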
## Check out the number of NA in variable "Ozone"
sum(is.na(airquality$Ozone))
## [1] 37
# We have 37 NAs! (37 rows of Ozone contain missing values)
## Understanding different na actions
## na.omit vs. na.exclude (na.omit will automatically drop rows with missing values, na.omit is also the default option setting in most R regression functions)
omit.fit <- lm(Temp ~ Ozone + Wind, data = airquality, na.action = na.omit)
length(residuals(omit.fit)) # 116 residuals: one for each row with no missing value in the model variables (Temp, Ozone, Wind). The original dataframe has 153 rows (= 116 complete + 37 with Ozone missing).
## [1] 116
exclude.fit <- lm(Temp ~ Ozone + Wind, data = airquality, na.action = na.exclude)
length(residuals(exclude.fit)) # A vector of length 153 (same as the original dataframe): rows that had missing values are padded with NA, so the residuals stay aligned with the original rows
## [1] 153
residuals(exclude.fit)
## 1 2 3 4 5 6
## -11.60313615 -5.49541535 2.47230176 -11.00071306 NA -7.47602470
## 7 8 9 10 11 12
## -9.97849884 -13.30679755 -6.98593198 NA 1.19680109 -4.32933455
## 13 14 15 16 17 18
## -6.63773179 -4.52308868 -14.35762156 -8.29611520 -9.62995994 -11.27672455
## 19 20 21 22 23 24
## -7.11450664 -10.44858723 -11.68709258 3.16160771 -10.21554097 -14.27766101
## 25 26 27 28 29 30
## NA NA NA -6.69231583 4.52943440 -13.28128804
## 31 32 33 34 35 36
## -1.89853829 NA NA NA NA NA
## 37 38 39 40 41 42
## NA 6.38072241 NA 8.53343028 10.30014818 NA
## 43 44 45 46 47 48
## NA 6.79452769 NA NA 4.75702155 -0.86729298
## 49 50 51 52 53 54
## -9.22307697 1.05618373 3.42608732 NA NA NA
## 55 56 57 58 59 60
## NA NA NA NA NA NA
## 61 62 63 64 65 66
## NA -12.40953993 5.66858855 4.66312945 NA -0.71378337
## 67 68 69 70 71 72
## 5.89702524 2.18541814 3.11637579 2.88940232 2.64628740 NA
## 73 74 75 76 77 78
## 2.46769219 7.70012477 NA 9.99614059 0.97467304 5.55079909
## 79 80 81 82 83 84
## 1.45775653 0.83311922 4.07256102 -0.38854409 NA NA
## 85 86 87 88 89 90
## 0.98098167 -5.17817681 7.54994956 7.19934970 2.17473580 5.81151867
## 91 92 93 94 95 96
## 0.34542616 -0.09290609 2.56001822 10.45469710 7.80060047 0.69018910
## 97 98 99 100 101 102
## 7.45376064 2.93391770 -5.15742580 4.03872800 -0.53047574 NA
## 103 104 105 106 107 108
## NA 8.41940085 7.23779229 -1.96065832 NA 2.84074213
## 109 110 111 112 113 114
## -3.18994454 0.56755422 2.48237042 -0.03454609 4.98399503 1.64384166
## 115 116 117 118 119 120
## NA 0.56233097 -21.48727465 1.98705445 NA 13.10169756
## 121 122 123 124 125 126
## -0.09591945 9.40631884 7.23016937 2.51949873 6.00926868 7.01995103
## 127 128 129 130 131 132
## 4.53018108 7.33996706 10.04635091 6.42001454 3.66459267 1.24386507
## 133 134 135 136 137 138
## -1.73853027 4.70558386 3.98399503 0.27068886 -0.64234135 -1.11996574
## 139 140 141 142 143 144
## -1.67302803 -5.13064808 3.42608732 -6.51155680 8.02757394 -7.70384771
## 145 146 147 148 149 150
## -3.75152536 4.37464963 -2.51701589 -7.36684069 -6.85463659 NA
## 151 152 153
## 3.76309433 1.67527502 -5.35301199
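The NA padding is what keeps the residuals lined up with the original rows, which matters if you want to place them next to the original columns (my own sketch):
head(cbind(Ozone = airquality$Ozone, resid = residuals(exclude.fit)), 10)  # rows with missing Ozone get NA residuals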
## na.pass keeps the rows with NA values; this will often make the model fail to fit. See the error message for yourself (run the following line of code in your own console).
# pass.fit <- lm(Temp ~ Ozone + Wind, data = airquality, na.action = na.pass)
# Use "complete.cases" as filtering condition: subset airquality dataframe by rows with complete obs (i.e., without NAs)
# Here we put the subsetting condition argument in the row vector of the dataframe
airquality.complete <- airquality[complete.cases(airquality),]
# Replace any remaining NAs in the new dataframe with 0 (after filtering on complete cases, none should remain)
airquality.complete[is.na(airquality.complete)] <- 0
# Do missing values still exist?
sum(is.na(airquality.complete$Ozone))
## [1] 0
## Replace all NA cells with 0
# Generate an example dataframe df
df = data.frame(C1= c(1, 5, 14, NA, 54), C2= c(9, NA, NA, 3, 42), C3= c(9, 7, 42, 87, NA))
df
## C1 C2 C3
## 1 1 9 9
## 2 5 NA 7
## 3 14 NA 42
## 4 NA 3 87
## 5 54 42 NA
Replace the NA values in column C1 with 0.
# Replicate a new dataframe for demonstration
df1 <- df
df1$C1[is.na(df1$C1)] <- 0
df1
## C1 C2 C3
## 1 1 9 9
## 2 5 NA 7
## 3 14 NA 42
## 4 0 3 87
## 5 54 42 NA
Conditional replacement: set the values of C1 to 0 in the rows where C2 is NA.
# replicate a new dataframe
df2 <- df
df2$C1[is.na(df2$C2)] <- 0
df2
## C1 C2 C3
## 1 1 9 9
## 2 0 NA 7
## 3 0 NA 42
## 4 NA 3 87
## 5 54 42 NA
We then fill in NA values with the mean of the non-NA entries from the same column (hence \(\textsf{na.rm = TRUE}\)). We will use a for-loop to implement this. Note the use of row and column indices inside the square brackets, df[row, col].
for(i in 1:ncol(df)){ # Repeat for each column
df[is.na(df[,i]), i] <- mean(df[,i], na.rm = TRUE)
# Fill in rows of the i-th column of this dataframe that contains NAs with the mean value of the i-th column without NAs
}
df
## C1 C2 C3
## 1 1.0 9 9.00
## 2 5.0 18 7.00
## 3 14.0 18 42.00
## 4 18.5 3 87.00
## 5 54.0 42 36.25
# na.rm = TRUE removes NA values from the calculation, since missing entries contribute no information to the mean
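The same column-wise mean imputation can also be written without an explicit loop (my own sketch, using a fresh copy of the example dataframe):
df_alt <- data.frame(C1 = c(1, 5, 14, NA, 54), C2 = c(9, NA, NA, 3, 42), C3 = c(9, 7, 42, 87, NA))
df_alt[] <- lapply(df_alt, function(col) { col[is.na(col)] <- mean(col, na.rm = TRUE); col })
df_alt  # should match the for-loop result above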
Why train-test-split?
library(caret)
data(Auto) # Load the data
head(Auto) # Check out the first few rows and the column names
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
set.seed(100) # So we can reproduce the same random sample later (think of rolling the same die in exactly the same way)
# Create a stratified random sample based on explanatory variable and partition the data using caret's built-in function
train_index <- createDataPartition(Auto$horsepower, p = .75, list = FALSE)
# "list = FALSE" asks R NOT to return the result into a list object
training <- Auto[train_index,]
testing <- Auto[-train_index,] # Negative indexing (-) drops the rows listed in train_index, so every row NOT used for training goes to the testing set
## To be sure, there are other ways to do train-test split (though some of them may look less intuitive), if you would like. They are not based on the explanatory variable of interest
# train_index <- sample(1:nrow(Auto), 0.75 * nrow(Auto))
# test_index <- setdiff(1:nrow(Auto), train_index)  # setdiff(x, y) returns the elements of x that are not in y
## or
# train_size <- floor(0.75 * nrow(Auto))
# train_index <- sample(seq_len(nrow(Auto)), size = train_size)
# train <- Auto[train_index,]
# test <- Auto[-train_index,]
## Testing mpg ~ horsepower on two sub-samples
training.fit <- lm(mpg ~ horsepower, data = training)
pred <- predict(training.fit, testing) # Predict on the testing set using the trained model
comparison <- cbind(testing$mpg, pred) # Combine the (actual) values (i.e., your Y variable) from the testing set with the predicted values. Note that the two objects must have the same row length. (Why?)
head(comparison, 20) # compare
## pred
## 3 18 16.249496
## 6 15 8.673437
## 9 14 4.411903
## 10 15 9.936114
## 11 15 13.092805
## 26 10 5.990249
## 30 27 26.035240
## 32 25 24.930398
## 38 18 24.141225
## 56 27 30.454608
## 60 23 31.401615
## 62 21 26.350909
## 66 14 15.775993
## 69 13 15.460324
## 71 13 9.936114
## 79 21 26.193075
## 84 28 27.297917
## 89 14 18.301346
## 91 12 8.673437
## 96 12 4.411903
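One way to summarize how well the trained model generalizes (my own sketch): compare the root mean squared error on the training and testing sets.
train_rmse <- sqrt(mean((training$mpg - predict(training.fit, training))^2))  # in-sample error
test_rmse  <- sqrt(mean((testing$mpg - pred)^2))                              # out-of-sample error
c(train = train_rmse, test = test_rmse)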
Simulating three Beta distributions with different parameter values.
The Beta distribution has the following form: \(f(x) = \frac{1}{B(\alpha, \beta)}x^{\alpha-1}(1 - x)^{\beta-1}\), where \(B(\alpha, \beta)\) is the Beta function. \(\alpha\) and \(\beta\) are shape parameters that determine the distribution's form, influencing its mean, variance, and overall shape, such as whether it is skewed or symmetric.
## set up plotting environment: 1 row by 3 col
par(mfrow=c(1, 3))
## set up width and height
par(pin=c(2, 1.8))
## rbeta(n, alpha, beta)
x1 <- rbeta(10000, 5, 5)
hist(x1, freq = F, main = "Symmetric distribution", xlab = expression(paste("X ~ (", alpha, "=5, ", beta, "=5)")))
lines(density(x1), col="red")
x2 <- rbeta(10000, 5, 2)
hist(x2, freq = F, main = "Left-skewed distribution", xlab = expression(paste("X ~ (", alpha, "=5, ", beta,
"=2)")))
lines(density(x2), col="red")
x3 <- rbeta(10000, 2, 5)
hist(x3, freq = F, main = "Right-skewed distribution", xlab = expression(paste("X ~ (", alpha, "=2, ", beta,
"=5)")))
lines(density(x3), col="red")
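As a quick check against theory (my own sketch), the mean of a Beta(\(\alpha\), \(\beta\)) distribution is \(\frac{\alpha}{\alpha + \beta}\), which the simulated draws should approximate:
round(c(mean(x1), mean(x2), mean(x3)), 3)  # simulated means
c(5/(5+5), 5/(5+2), 2/(2+5))               # theoretical means: 0.5, ~0.714, ~0.286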
## Linear, quadratic, and cubic (3rd degree) polynomial fits on the raw data
fitlm = lm(dist ~ speed, data = cars) # linear
fitsq = lm(dist~poly(speed,2,raw = TRUE), data=cars) # squared
fitcu = lm(dist~poly(speed,3,raw = TRUE), data=cars) # cubic
par(mar = c(1, 1, 1, 1)) # "mar" means plotting margin c(bottom, left, top, right)
plot(cars$dist~cars$speed, pch=19,
xlab = "Car Speed (mph)",
ylab = "Distance Covered (ft)",
main = "3rd degree polynomial on raw data",
las=1)
# specify line width (lwd) and line type (lty)
lines(cars$speed, predict(fitlm), col = "blue", lwd = 2, type = "l", lty = 1)
lines(cars$speed, predict(fitsq), col = "green", lwd = 2, type = "l", lty = 2)
lines(cars$speed, predict(fitcu), col = "red", lwd = 2, type = "l", lty = 3)
legend(5, 120, legend=c("linear", "2nd degree", "3rd degree"),
col=c("blue", "green", "red"), lty=c(1, 2, 3), cex=0.8)