# Understanding regression
## Example: Space Shuttle Launch Data
launch <- read.csv("challenger2.csv")
#view the dataset
launch
# estimate beta manually
b <- cov(launch$temperature, launch$distress_ct) / var(launch$temperature)
b
## [1] -0.03364796
#b is the slope of the regression line: the estimated change in distress_ct for each one-unit increase in temperature. It is negative, so the 2 variables are inversely related: the higher the temperature, the lower the expected distress.
#a slope of about -0.034 does not mean the 2 variables are barely correlated. The size of the slope depends on the units of both variables, so it cannot measure the strength of the relationship; the correlation coefficient calculated further down does that.
# estimate alpha manually
a <- mean(launch$distress_ct) - b * mean(launch$temperature)
a
## [1] 2.814585
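#this is the intercept (the constant term): the predicted value of distress_ct when the temperature is 0, about 2.81. Because it extrapolates to temperature = 0, it is best read as the anchor point of the line rather than as a realistic prediction.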
# calculate the correlation of launch data
r <- cov(launch$temperature, launch$distress_ct) /
(sd(launch$temperature) * sd(launch$distress_ct))
r
## [1] -0.3359996
#The temperature and the distress count are negatively correlated (r is about -0.34, a weak-to-moderate linear relationship). This means that as the temperature increases, the distress tends to decrease; lower launch temperatures are associated with more distress events and therefore a higher chance of failure during launch.
#a simpler way to calculate the correlation is R's built-in function cor().
cor(launch$temperature, launch$distress_ct)
## [1] -0.3359996
# computing the slope using correlation
r * (sd(launch$distress_ct) / sd(launch$temperature))
## [1] -0.03364796
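#the slope equals the correlation multiplied by the ratio of the standard deviation of the dependent variable (distress_ct) to the standard deviation of the independent variable (temperature): b = r * sd(y) / sd(x)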
# confirming the regression line using the lm function (not in text)
model <- lm(distress_ct ~ temperature, data = launch)
model
##
## Call:
## lm(formula = distress_ct ~ temperature, data = launch)
##
## Coefficients:
## (Intercept) temperature
## 2.81458 -0.03365
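#this gives our fitted line in the form y = a + bx -> distress_ct = 2.81 - 0.034 * temperature
#summary() reports the five-number summary of the residuals, the coefficient estimates with their standard errors, t values and p-values, and the overall fit statistics (residual standard error, R-squared, F-statistic).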
summary(model)
##
## Call:
## lm(formula = distress_ct ~ temperature, data = launch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0649 -0.4929 -0.2573 0.3052 1.7090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.81458 1.24629 2.258 0.0322 *
## temperature -0.03365 0.01815 -1.854 0.0747 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7076 on 27 degrees of freedom
## Multiple R-squared: 0.1129, Adjusted R-squared: 0.08004
## F-statistic: 3.436 on 1 and 27 DF, p-value: 0.07474
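#among the metrics reported above, R^2 measures goodness of fit: temperature explains about 11% of the variance in distress_ct (Multiple R-squared: 0.1129).
# because the p-value for temperature (0.0747) is above 0.05, we fail to reject the null hypothesis that the slope is zero at the 5% significance level.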
# creating a simple multiple regression function
#1. define a function called reg that takes the response y and the predictors x
#2. convert x to matrix format, with one column per feature
#3. add a column called Intercept with all the values set to 1
#4. compute the coefficient vector b = (X'X)^-1 X'y: multiply the transpose of x by x to get the square matrix X'X, calculate its inverse, then multiply by the transpose of x and by our vector y. The result holds the coefficients of the linear regression model.
# Estimate ordinary least squares coefficients from first principles.
#
# Arguments:
#   y: numeric response vector, one value per observation.
#   x: predictors, one column per feature; anything as.matrix() can turn
#      into a numeric matrix (data frame, matrix, or vector).
#
# Prints the coefficient matrix and returns it invisibly: a one-column
# matrix named "estimate" with one row per term, the intercept first.
reg <- function(y, x) {
  x <- as.matrix(x)
  # Fail early with a clear message rather than a generic
  # "non-conformable arguments" error from the matrix product.
  if (NROW(y) != nrow(x)) {
    stop("`y` and `x` must have the same number of observations",
         call. = FALSE)
  }
  # A leading column of 1s lets the first coefficient act as the intercept.
  x <- cbind(Intercept = 1, x)
  # Solve the normal equations (X'X) b = X'y directly. This yields the same
  # estimates as solve(t(x) %*% x) %*% t(x) %*% y, but avoids forming the
  # explicit inverse, which is slower and less accurate numerically.
  b <- solve(crossprod(x), crossprod(x, y))
  colnames(b) <- "estimate"
  # print() returns its argument invisibly, so callers can still capture b.
  print(b)
}
# examine the launch data
str(launch)
## 'data.frame': 29 obs. of 4 variables:
## $ distress_ct : int 0 1 0 0 0 0 0 0 1 1 ...
## $ temperature : int 66 70 69 68 67 72 73 70 57 63 ...
## $ field_check_pressure: int 50 50 50 50 50 50 100 100 200 200 ...
## $ flight_num : int 1 2 3 4 5 6 7 8 9 10 ...
# test regression model with simple linear regression
reg(y = launch$distress_ct, x = launch[2])
## estimate
## Intercept 2.81458456
## temperature -0.03364796
#here we apply the function that we just created to our data set, using temperature as the only predictor; the result matches the manual estimates of a and b above
# use regression model with multiple regression
reg(y = launch$distress_ct, x = launch[2:4])
## estimate
## Intercept 2.239817e+00
## temperature -3.124185e-02
## field_check_pressure -2.586765e-05
## flight_num 2.762455e-02
#we now use the same function with the 3 predictors in the data set (columns 2 to 4: temperature, field_check_pressure and flight_num)
# confirming the multiple regression result using the lm function (not in text)
model <- lm(distress_ct ~ temperature + field_check_pressure + flight_num, data = launch)
model
##
## Call:
## lm(formula = distress_ct ~ temperature + field_check_pressure +
## flight_num, data = launch)
##
## Coefficients:
## (Intercept) temperature field_check_pressure
## 2.240e+00 -3.124e-02 -2.587e-05
## flight_num
## 2.762e-02
#lm() with all three predictors returns the same coefficients as our reg() function, confirming the multiple linear regression result