getwd()
[1] "/cloud/project"
## Space Shuttle Launch Data
# store the dataset in the variable launch
launch <- read.csv("/cloud/project/challenger2.csv")
# estimate beta manually
b <- cov(launch$temperature, launch$distress_ct) / var(launch$temperature)
b
[1] -0.03364796
# estimate alpha manually
a <- mean(launch$distress_ct) - b * mean(launch$temperature)
a
[1] 2.814585
# calculate the correlation of launch data
r <- cov(launch$temperature, launch$distress_ct) /
(sd(launch$temperature) * sd(launch$distress_ct))
r
[1] -0.3359996
# here we obtain the correlation between temperature and distress count; as we can
# see, the correlation is negative
cor(launch$temperature, launch$distress_ct)
[1] -0.3359996
# computing the slope using correlation
r * (sd(launch$distress_ct) / sd(launch$temperature))
[1] -0.03364796
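# note: r * (sd(y) / sd(x)) = [cov(x, y) / (sd(x) * sd(y))] * (sd(y) / sd(x)) = cov(x, y) / var(x),
# so the correlation-based slope agrees exactly with the covariance-based estimate;
# a quick check using only the objects defined above:
all.equal(r * (sd(launch$distress_ct) / sd(launch$temperature)), b)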
# confirming the regression line using the lm function, which is built into R
# note that the dependent variable is distress_ct and the independent variable is
# temperature; the values obtained via the linear regression model match the alpha
# and beta we computed manually
model <- lm(distress_ct ~ temperature, data = launch)
model
Call:
lm(formula = distress_ct ~ temperature, data = launch)
Coefficients:
(Intercept) temperature
2.81458 -0.03365
# this function provides a detailed summary of the model
summary(model)
Call:
lm(formula = distress_ct ~ temperature, data = launch)
Residuals:
Min 1Q Median 3Q Max
-1.0649 -0.4929 -0.2573 0.3052 1.7090
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.81458 1.24629 2.258 0.0322 *
temperature -0.03365 0.01815 -1.854 0.0747 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7076 on 27 degrees of freedom
Multiple R-squared: 0.1129, Adjusted R-squared: 0.08004
F-statistic: 3.436 on 1 and 27 DF, p-value: 0.07474
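# individual pieces of the summary can also be extracted programmatically, since
# summary.lm returns a list; e.g. the R-squared value:
summary(model)$r.squared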
# creating a simple multiple regression function
reg <- function(y, x) {
  x <- as.matrix(x)                       # convert x to a matrix
  x <- cbind(Intercept = 1, x)            # add a column of 1's to account for the intercept
  b <- solve(t(x) %*% x) %*% t(x) %*% y   # apply the normal equation to compute the regression coefficients
  colnames(b) <- "estimate"               # name the column of the resulting coefficient vector
  print(b)                                # print the estimated regression coefficients
}
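# an alternative sketch: rather than explicitly inverting t(x) %*% x, base R's
# qr.solve() solves the same least-squares problem via a QR decomposition, which
# is numerically more stable (reg_qr is a hypothetical name, not from the text):
reg_qr <- function(y, x) {
  x <- cbind(Intercept = 1, as.matrix(x)) # same design matrix as in reg()
  b <- qr.solve(x, y)                     # least-squares solution without forming t(x) %*% x
  matrix(b, dimnames = list(names(b), "estimate"))
}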
# examine the launch data
# the str() function displays the structure of an R object: a compact summary of
# its type, dimensions, and first few values, which makes the data easier to
# understand at a glance
str(launch)
'data.frame': 29 obs. of 4 variables:
$ distress_ct : int 0 1 0 0 0 0 0 0 1 1 ...
$ temperature : int 66 70 69 68 67 72 73 70 57 63 ...
$ field_check_pressure: int 50 50 50 50 50 50 100 100 200 200 ...
$ flight_num : int 1 2 3 4 5 6 7 8 9 10 ...
# test regression model with simple linear regression
reg(y = launch$distress_ct, x = launch[2])
estimate
Intercept 2.81458456
temperature -0.03364796
# use regression model with multiple regression
# this expression calls the reg function with arguments taken from the launch data:
# y = launch$distress_ct specifies the dependent variable for the regression, while
# x = launch[2:4] specifies the independent variables, i.e. the subset of columns 2
# through 4 of the launch data frame
reg(y = launch$distress_ct, x = launch[2:4])
estimate
Intercept 2.239817e+00
temperature -3.124185e-02
field_check_pressure -2.586765e-05
flight_num 2.762455e-02
# confirming the multiple regression result using the lm function (not in text);
# multiple regression offers insight into the relationships between a dependent
# (response) variable and two or more independent (predictor) variables
model <- lm(distress_ct ~ temperature + field_check_pressure + flight_num, data = launch)
model
Call:
lm(formula = distress_ct ~ temperature + field_check_pressure +
flight_num, data = launch)
Coefficients:
(Intercept) temperature
2.240e+00 -3.124e-02
field_check_pressure flight_num
-2.587e-05 2.762e-02
##### Predicting Medical Expenses
## Step 2: Exploring and preparing the data ----
insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
str(insurance)
'data.frame': 1338 obs. of 7 variables:
$ age : int 19 18 28 33 32 31 46 37 37 60 ...
$ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
$ bmi : num 27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
$ children: int 0 1 3 0 0 0 1 3 2 0 ...
$ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
$ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
$ expenses: num 16885 1726 4449 21984 3867 ...
# summarize the expenses variable
summary(insurance$expenses)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1122 4740 9382 13270 16640 63770
# histogram of insurance charges
hist(insurance$expenses)
# table of region
table(insurance$region)
northeast northwest southeast southwest
324 325 364 325
# exploring relationships among features: correlation matrix
cor(insurance[c("age", "bmi", "children", "expenses")])
age bmi children expenses
age 1.0000000 0.10934101 0.04246900 0.29900819
bmi 0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
# visualizing relationships among features: scatterplot matrix
pairs(insurance[c("age", "bmi", "children", "expenses")])
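# an enhanced scatterplot matrix with correlations and distributions can be drawn
# with pairs.panels() from the psych package (sketch, assuming psych is installed):
# library(psych)
# pairs.panels(insurance[c("age", "bmi", "children", "expenses")])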
## Step 3: Training a model on the data ----
ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
data = insurance)
ins_model <- lm(expenses ~ ., data = insurance) # this is equivalent to above
# see the estimated beta coefficients
ins_model
Call:
lm(formula = expenses ~ ., data = insurance)
Coefficients:
(Intercept) age sexmale
-11941.6 256.8 -131.4
bmi children smokeryes
339.3 475.7 23847.5
regionnorthwest regionsoutheast regionsouthwest
-352.8 -1035.6 -959.3
## Step 4: Evaluating model performance ----
# see more detail about the estimated beta coefficients
summary(ins_model)
Call:
lm(formula = expenses ~ ., data = insurance)
Residuals:
Min 1Q Median 3Q Max
-11302.7 -2850.9 -979.6 1383.9 29981.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11941.6 987.8 -12.089 < 2e-16 ***
age 256.8 11.9 21.586 < 2e-16 ***
sexmale -131.3 332.9 -0.395 0.693255
bmi 339.3 28.6 11.864 < 2e-16 ***
children 475.7 137.8 3.452 0.000574 ***
smokeryes 23847.5 413.1 57.723 < 2e-16 ***
regionnorthwest -352.8 476.3 -0.741 0.458976
regionsoutheast -1035.6 478.7 -2.163 0.030685 *
regionsouthwest -959.3 477.9 -2.007 0.044921 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6062 on 1329 degrees of freedom
Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
F-statistic: 500.9 on 8 and 1329 DF, p-value: < 2.2e-16
## Step 5: Improving model performance ----
# add a higher-order "age" term
insurance$age2 <- insurance$age^2
# add an indicator for BMI >= 30
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
# create final model
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
bmi30*smoker + region, data = insurance)
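# note: in R's formula syntax, bmi30*smoker expands to bmi30 + smoker + bmi30:smoker,
# i.e. both main effects plus their interaction, so the model above could equivalently
# be fit as (ins_model2_expanded is a hypothetical name):
# ins_model2_expanded <- lm(expenses ~ age + age2 + children + bmi + sex + bmi30 +
#                           smoker + bmi30:smoker + region, data = insurance)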
summary(ins_model2)
Call:
lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 *
smoker + region, data = insurance)
Residuals:
Min 1Q Median 3Q Max
-17297.1 -1656.0 -1262.7 -727.8 24161.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 139.0053 1363.1359 0.102 0.918792
age -32.6181 59.8250 -0.545 0.585690
age2 3.7307 0.7463 4.999 6.54e-07 ***
children 678.6017 105.8855 6.409 2.03e-10 ***
bmi 119.7715 34.2796 3.494 0.000492 ***
sexmale -496.7690 244.3713 -2.033 0.042267 *
bmi30 -997.9355 422.9607 -2.359 0.018449 *
smokeryes 13404.5952 439.9591 30.468 < 2e-16 ***
regionnorthwest -279.1661 349.2826 -0.799 0.424285
regionsoutheast -828.0345 351.6484 -2.355 0.018682 *
regionsouthwest -1222.1619 350.5314 -3.487 0.000505 ***
bmi30:smokeryes 19810.1534 604.6769 32.762 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4445 on 1326 degrees of freedom
Multiple R-squared: 0.8664, Adjusted R-squared: 0.8653
F-statistic: 781.7 on 11 and 1326 DF, p-value: < 2.2e-16
# making predictions with the regression model
insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)
[1] 0.9307999
plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "red", lwd = 3, lty = 2)
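# the dashed identity line (intercept 0, slope 1) marks perfect predictions:
# points above it are cases the model under-predicts, points below are over-predicted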
predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 2,
bmi = 30, sex = "male", bmi30 = 1,
smoker = "no", region = "northeast"))
predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 2,
bmi = 30, sex = "female", bmi30 = 1,
smoker = "no", region = "northeast"))
predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 0,
bmi = 30, sex = "female", bmi30 = 1,
smoker = "no", region = "northeast"))
##### Part 2: Regression Trees and Model Trees
## Understanding regression trees and model trees
## Example: Calculating SDR
# set up the data
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
# compute the SDR
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
# compare the SDR for each split
sdr_a
sdr_b
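# the same computation can be wrapped in a small helper (a sketch; sdr is a
# hypothetical name, not from the text) that scores any candidate split given as
# a list of the resulting subsets:
sdr <- function(total, parts) {
  sd(total) - sum(sapply(parts, function(p) length(p) / length(total) * sd(p)))
}
sdr(tee, list(at1, at2)) # identical to sdr_a
sdr(tee, list(bt1, bt2)) # identical to sdr_b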
##### Exercise No. 3: Estimating Wine Quality
## Step 2: Exploring and preparing the data ----
wine <- read.csv("whitewines.csv")
# examine the wine data
str(wine)
# the distribution of quality ratings
hist(wine$quality)
# summary statistics of the wine data
summary(wine)
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
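# the sequential 75/25 split above assumes the rows of whitewines.csv are already
# in random order; if they were not, shuffling first would avoid a biased split
# (a sketch):
# set.seed(123)
# wine <- wine[sample(nrow(wine)), ]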
## Step 3: Training a model on the data ----
# regression tree using rpart
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
# get more detailed information about the tree
summary(m.rpart)
# the rpart package implements Recursive Partitioning and Regression Trees; its main
# function, rpart(), builds classification and regression trees by recursively
# splitting the dataset into subsets based on the predictor variables
#install.packages("rpart.plot")
# use the rpart.plot package to create a visualization
library(rpart.plot)
# a basic decision tree diagram
# this function, from the rpart.plot package, plots a decision tree model in an
# easy-to-interpret format
# m.rpart is the fitted decision tree model created using rpart()
# digits = 3 controls the number of decimal places displayed in the node labels
rpart.plot(m.rpart, digits = 3)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
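# fallen.leaves = TRUE aligns the leaf nodes at the bottom of the diagram, while the
# type and extra parameters adjust how the decisions and nodes are labeled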
## Step 4: Evaluating model performance ----
# generate predictions for the testing dataset
p.rpart <- predict(m.rpart, wine_test)
# compare the distribution of predicted values vs. actual values
summary(p.rpart)
summary(wine_test$quality)
# compare the correlation
cor(p.rpart, wine_test$quality)
# function to calculate the mean absolute error
MAE <- function(actual, predicted) {
mean(abs(actual - predicted))
}
# mean absolute error between predicted and actual values
MAE(p.rpart, wine_test$quality)
# mean absolute error between actual values and mean value
mean(wine_train$quality) # result = 5.87
MAE(5.87, wine_test$quality)
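# equivalently, the training-set mean can be passed in directly instead of the
# hard-coded 5.87:
MAE(mean(wine_train$quality), wine_test$quality)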
## Step 5: Improving model performance ----
#install.packages("plyr")
#install.packages("Cubist")
# train a Cubist Model Tree
library(Cubist)
m.cubist <- cubist(x = wine_train[-12], y = wine_train$quality)
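# wine_train[-12] drops the 12th column, which holds the quality target, leaving
# only the predictor columns (consistent with y = wine_train$quality above)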
# display basic information about the model tree
m.cubist
# display the tree itself
summary(m.cubist)
# generate predictions for the model
p.cubist <- predict(m.cubist, wine_test)
# summary statistics about the predictions
summary(p.cubist)
# correlation between the predicted and true values
cor(p.cubist, wine_test$quality)
# mean absolute error of predicted and true values
# (uses a custom function defined above)
MAE(wine_test$quality, p.cubist)