library(datasets)
df <- as.data.frame(Titanic)
df
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
## 7 3rd Female Child No 17
## 8 Crew Female Child No 0
## 9 1st Male Adult No 118
## 10 2nd Male Adult No 154
## 11 3rd Male Adult No 387
## 12 Crew Male Adult No 670
## 13 1st Female Adult No 4
## 14 2nd Female Adult No 13
## 15 3rd Female Adult No 89
## 16 Crew Female Adult No 3
## 17 1st Male Child Yes 5
## 18 2nd Male Child Yes 11
## 19 3rd Male Child Yes 13
## 20 Crew Male Child Yes 0
## 21 1st Female Child Yes 1
## 22 2nd Female Child Yes 13
## 23 3rd Female Child Yes 14
## 24 Crew Female Child Yes 0
## 25 1st Male Adult Yes 57
## 26 2nd Male Adult Yes 14
## 27 3rd Male Adult Yes 75
## 28 Crew Male Adult Yes 192
## 29 1st Female Adult Yes 140
## 30 2nd Female Adult Yes 80
## 31 3rd Female Adult Yes 76
## 32 Crew Female Adult Yes 20
str(df)
## 'data.frame': 32 obs. of 5 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
## $ Age : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : num 0 0 35 0 0 0 17 0 118 154 ...
Dependent Variable: Survived Independent Variable Class, Sex, Age *** yi = beta0 + beta1x1i + beta2x2i + beta3x3i + errori y = beta0 + beta1x1 + beta2x2 + beta3x3 + error***
df$Class <- as.numeric(df$Class)
df$Sex <- as.numeric(df$Sex)
df$Age <- as.numeric(df$Age)
df$Survived <- as.numeric(df$Survived)
model1 <- lm(Survived ~ Class + Sex + Age, df)
summary(model1)
##
## Call:
## lm(formula = Survived ~ Class + Sex + Age, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5 -0.5 0.0 0.5 0.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.500e+00 4.629e-01 3.24 0.00307 **
## Class 1.755e-17 8.452e-02 0.00 1.00000
## Sex 1.963e-16 1.890e-01 0.00 1.00000
## Age 1.963e-16 1.890e-01 0.00 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5345 on 28 degrees of freedom
## Multiple R-squared: 1.787e-31, Adjusted R-squared: -0.1071
## F-statistic: 1.668e-30 on 3 and 28 DF, p-value: 1
*** Intercept 1.5 and it is statistically significant Slope for Class 1.755e-17, positive relationship but it almost zero meaning almost no linear relationship between these two variables Slope for Sex 1.963e-16, same thing, positive but almost no linear relationship 2.748e-16 , Same thing I found it weird and checked it carefully. I think the reason is because of the binary outcomes of Class, Sex and Age. When we convert them into numbers 0, 1 , it will be hard to generate a linear relationship I donโt want to change a dataset, I want to show my outcomes to the discussion and hope this helps***
df$Class <- as.numeric(df$Class)
df$Survived <- as.numeric(df$Survived)
model2 <- lm(Survived ~ Class, df)
summary(model2)
##
## Call:
## lm(formula = Survived ~ Class, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5 -0.5 0.0 0.5 0.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.500e+00 2.236e-01 6.708 1.96e-07 ***
## Class 1.755e-17 8.165e-02 0.000 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5164 on 30 degrees of freedom
## Multiple R-squared: 3.081e-32, Adjusted R-squared: -0.03333
## F-statistic: 9.244e-31 on 1 and 30 DF, p-value: 1
beta1 <- cov(df$Class, df$Survived) / var(df$Class)
beta0 <- mean(df$Survived) - beta1 * mean(df$Class)
beta0
## [1] 1.5
beta1
## [1] 0
*** use car dataset***
df2 <- as.data.frame(cars)
df2
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
## 7 10 18
## 8 10 26
## 9 10 34
## 10 11 17
## 11 11 28
## 12 12 14
## 13 12 20
## 14 12 24
## 15 12 28
## 16 13 26
## 17 13 34
## 18 13 34
## 19 13 46
## 20 14 26
## 21 14 36
## 22 14 60
## 23 14 80
## 24 15 20
## 25 15 26
## 26 15 54
## 27 16 32
## 28 16 40
## 29 17 32
## 30 17 40
## 31 17 50
## 32 18 42
## 33 18 56
## 34 18 76
## 35 18 84
## 36 19 36
## 37 19 46
## 38 19 68
## 39 20 32
## 40 20 48
## 41 20 52
## 42 20 56
## 43 20 64
## 44 22 66
## 45 23 54
## 46 24 70
## 47 24 92
## 48 24 93
## 49 24 120
## 50 25 85
str(df2)
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
model3 <- lm(dist ~ speed, df2)
summary(model3)
##
## Call:
## lm(formula = dist ~ speed, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
beta0 = -17.5791, statistically significant with 3 beta1 = 3.9324, positive relationship, statistically significant with 3 y = -17.5701 + 3.9324x y distance, x speed
beta1 <- cov(df2$speed, df2$dist) / var(df2$speed)
beta0 <- mean(df2$dist) - beta1 * mean(df2$speed)
beta0
## [1] -17.57909
beta1
## [1] 3.932409