dim(mtcars)
## [1] 32 11
head(mtcars,10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
tail(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.7 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.5 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.5 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.6 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.6 1 1 4 2
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
quantile(mtcars$cyl)
## 0% 25% 50% 75% 100%
## 4 4 6 8 8
mpg: tiêu thụ nhiên liệu (dặm/gallon), cyl: số xy lanh, disp, hp: công suất, drat: tỷ số xoắn.
plot(mtcars)
summary(mtcars$mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.40 15.43 19.20 20.09 22.80 33.90
Desc(mtcars$mpg)
## ------------------------------------------------------------------------------
## mtcars$mpg (numeric)
##
## length n NAs unique 0s mean meanCI'
## 32 32 0 25 0 20.091 17.918
## 100.0% 0.0% 0.0% 22.264
##
## .05 .10 .25 median .75 .90 .95
## 11.995 14.340 15.425 19.200 22.800 30.090 31.300
##
## range sd vcoef mad IQR skew kurt
## 23.500 6.027 0.300 5.411 7.375 0.611 -0.373
##
## lowest : 10.4 (2), 13.3, 14.3, 14.7, 15.0
## highest: 26.0, 27.3, 30.4 (2), 32.4, 33.9
##
## ' 95%-CI (classic)
hist(mtcars$mpg, col="lightgreen")
boxplot(mtcars$mpg, col="green")
ggplot(mtcars, aes(x = hp,y = mpg)) +
geom_point(color = "red", size = 2, alpha = 0.6) +
labs(title = "Miles/(US) gallon and Gross horsepower",
subtitle = "1981", x = "hp", y = "mpg")+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mtcars, aes(x = cyl,y = mpg)) +
geom_point(color = "red", size = 2, alpha = 0.6)+
labs(title = "Miles/(US) gallon and Number of cylinders",
subtitle = "1981", x = "Number of cylinders", y = "Miles/(US) gallon")+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
mtcars$Am[mtcars$am == 0] <-"Automatic"
mtcars$Am[mtcars$am == 1] <-"Manual"
ggplot(mtcars, aes(x = Am,y = mpg)) +
geom_violin(fill = "cornflowerblue") +
geom_boxplot(width = 0.4, col = "red")+
labs(title = "Miles/(US) gallon and Transmission",
subtitle = "1981", x = "Transmission", y = "Miles/(US) gallon")
boxplot(mpg ~ am, data = mtcars, names = c("Automatic", "Manunal"), col= "lightgreen")
t.test(mtcars$mpg ~ mtcars$am)
##
## Welch Two Sample t-test
##
## data: mtcars$mpg by mtcars$am
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.280194 -3.209684
## sample estimates:
## mean in group 0 mean in group 1
## 17.14737 24.39231
mtcars$cyl1[mtcars$cyl==4]="4"
mtcars$cyl1[mtcars$cyl==6]="6"
mtcars$cyl1[mtcars$cyl==8]="8"
Desc(mtcars$mpg ~ mtcars$cyl1)
## ------------------------------------------------------------------------------
## mtcars$mpg ~ mtcars$cyl1
##
## Summary:
## n pairs: 32, valid: 32 (100.0%), missings: 0 (0.0%), groups: 3
##
##
## 4 6 8
## mean 26.664 19.743 15.100
## median 26.000 19.700 15.200
## sd 4.510 1.454 2.560
## IQR 7.600 2.350 1.850
## n 11 7 14
## np 34.375% 21.875% 43.750%
## NAs 0 0 0
## 0s 0 0 0
##
## Kruskal-Wallis rank sum test:
## Kruskal-Wallis chi-squared = 25.746, df = 2, p-value = 2.566e-06
### Phân tích mô tả và so sánh biến liên tục mpg theo biến Am
(t-test)
Desc(mtcars$mpg ~ mtcars$Am)
## ------------------------------------------------------------------------------
## mtcars$mpg ~ mtcars$Am
##
## Summary:
## n pairs: 32, valid: 32 (100.0%), missings: 0 (0.0%), groups: 2
##
##
## Automatic Manual
## mean 17.147 24.392
## median 17.300 22.800
## sd 3.834 6.167
## IQR 4.250 9.400
## n 19 13
## np 59.375% 40.625%
## NAs 0 0
## 0s 0 0
##
## Kruskal-Wallis rank sum test:
## Kruskal-Wallis chi-squared = 9.7914, df = 1, p-value = 0.001753
shapiro.test(mtcars$mpg)
##
## Shapiro-Wilk normality test
##
## data: mtcars$mpg
## W = 0.94756, p-value = 0.1229
set1 = subset(mtcars, am = 0)
set2 = subset(mtcars, am = 1)
shapiro.test(set1$mpg)
##
## Shapiro-Wilk normality test
##
## data: set1$mpg
## W = 0.94756, p-value = 0.1229
shapiro.test(set2$mpg)
##
## Shapiro-Wilk normality test
##
## data: set2$mpg
## W = 0.94756, p-value = 0.1229
t.test(mtcars$mpg~mtcars$Am, var.equal=TRUE )
##
## Two Sample t-test
##
## data: mtcars$mpg by mtcars$Am
## t = -4.1061, df = 30, p-value = 0.000285
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.84837 -3.64151
## sample estimates:
## mean in group Automatic mean in group Manual
## 17.14737 24.39231
ggplot(mtcars, aes(x = wt,y = mpg)) +
geom_point(color = "red", size = 2, alpha = 0.6)+
labs(title = "Miles/(US) gallon and Weight",
subtitle = "1981", x = "Weight", y = "Miles/(US) gallon")+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ggplot(mtcars, aes(x = Am,y = mpg)) +
geom_violin(fill = "cornflowerblue") +
geom_boxplot(width = 0.4, col = "red")+
labs(title = "Miles/(US) gallon and Transmission",
subtitle = "1981", x = "Transmission", y = "Miles/(US) gallon")
## 5. Phân tích tương quan hồi quy ### Mô hình hồi quy full
cars = mtcars
cars$vs = as.factor(cars$vs)
cars$vam = as.factor(cars$am)
full_model = lm(mpg ~ ., data = cars)
summary(full_model)
##
## Call:
## lm(formula = mpg ~ ., data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4734 -1.3794 -0.0655 1.0510 4.3906
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.18240 18.42852 0.878 0.3903
## cyl 0.40936 1.07893 0.379 0.7084
## disp 0.01391 0.01740 0.799 0.4334
## hp -0.04613 0.02712 -1.701 0.1045
## drat 0.02635 1.67649 0.016 0.9876
## wt -3.80625 1.84664 -2.061 0.0525 .
## qsec 0.64696 0.72195 0.896 0.3808
## vs1 1.74739 2.27267 0.769 0.4510
## am 2.61727 2.00475 1.306 0.2065
## gear 0.76403 1.45668 0.525 0.6057
## carb 0.50935 0.94244 0.540 0.5948
## AmManual NA NA NA NA
## cyl16 -2.47903 1.70028 -1.458 0.1604
## cyl18 NA NA NA NA
## vam1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.582 on 20 degrees of freedom
## Multiple R-squared: 0.8816, Adjusted R-squared: 0.8165
## F-statistic: 13.54 on 11 and 20 DF, p-value: 5.722e-07
cor.test(mtcars$mpg,mtcars$wt)
##
## Pearson's product-moment correlation
##
## data: mtcars$mpg and mtcars$wt
## t = -9.559, df = 30, p-value = 1.294e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9338264 -0.7440872
## sample estimates:
## cor
## -0.8676594
wt1 = lm(mtcars$mpg ~ mtcars$wt)
summary(wt1)
##
## Call:
## lm(formula = mtcars$mpg ~ mtcars$wt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## mtcars$wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
par(mfrow = c(2,2))
plot(wt1)
am1 = lm(mpg ~ Am, data = mtcars)
summary(am1)
##
## Call:
## lm(formula = mpg ~ Am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3923 -3.0923 -0.2974 3.2439 9.5077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.147 1.125 15.247 1.13e-15 ***
## AmManual 7.245 1.764 4.106 0.000285 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.902 on 30 degrees of freedom
## Multiple R-squared: 0.3598, Adjusted R-squared: 0.3385
## F-statistic: 16.86 on 1 and 30 DF, p-value: 0.000285
#Phương trình: mpg = 17.15 + 7.25*am(Manual)
#Diễn giải: +) Manual có mpg cao hơn Automatic 7.25 (SE ~ 1.76), và sự khác biệt này có ý nghĩa thống kê (P = 0.000285 < 0.001)
# +) am giải thích 36% sự khác biệt về mpg
par(mfrow = c(2,2)) # chia 2 dòng 2 cột
plot(am1)
library(corrplot)
## corrplot 0.92 loaded
dat = mtcars[, c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")]
cor(dat)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
pairs.panels(dat)
corrplot(cor(dat), type = "upper", method = "number")
#library(GGally)
#ggpairs(dat)
lm1 = lm(mpg ~ cyl + disp + wt, data = mtcars)
summary(lm1)
##
## Call:
## lm(formula = mpg ~ cyl + disp + wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4035 -1.4028 -0.4955 1.3387 6.0722
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.107678 2.842426 14.462 1.62e-14 ***
## cyl -1.784944 0.607110 -2.940 0.00651 **
## disp 0.007473 0.011845 0.631 0.53322
## wt -3.635677 1.040138 -3.495 0.00160 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.595 on 28 degrees of freedom
## Multiple R-squared: 0.8326, Adjusted R-squared: 0.8147
## F-statistic: 46.42 on 3 and 28 DF, p-value: 5.399e-11
yvar = mtcars[, ("mpg")]
xvars = mtcars[, c("cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb")]
bma = bicreg(xvars, yvar, strict = FALSE, OR = 20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 86 models were selected
## Best 5 models (cumulative posterior probability = 0.2457 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 26.3181386 13.324900 9.61778 39.68626 37.22727
## cyl 37.8 -0.4285797 0.685863 . -1.50779 .
## disp 12.5 0.0004793 0.005408 . . .
## hp 41.1 -0.0104394 0.015618 . . -0.03177
## drat 15.4 0.1988568 0.739629 . . .
## wt 98.8 -3.5386668 1.121556 -3.91650 -3.19097 -3.87783
## qsec 48.8 0.4506877 0.568549 1.22589 . .
## vs 10.6 0.0878027 0.645424 . . .
## am 40.1 1.0835917 1.688212 2.93584 . .
## gear 13.1 0.0964727 0.514582 . . .
## carb 22.9 -0.1294021 0.342176 . . .
##
## nVar 3 2 2
## r2 0.850 0.830 0.827
## BIC -50.23818 -49.81447 -49.17255
## post prob 0.071 0.057 0.042
## model 4 model 5
## Intercept 19.74622 38.75179
## cyl . -0.94162
## disp . .
## hp . -0.01804
## drat . .
## wt -5.04798 -3.16697
## qsec 0.92920 .
## vs . .
## am . .
## gear . .
## carb . .
##
## nVar 2 3
## r2 0.826 0.843
## BIC -49.10426 -48.88168
## post prob 0.040 0.036
best1 = lm(mpg ~ wt + qsec + am, data = mtcars)
summary(best1)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
par(mfrow = c(1,3)) # chia 1 dòng 2 cột
visreg(best1, xvar = "wt", gg = T, xlab = "Wt", ylab = "Mpg")
visreg(best1, xvar = "qsec", gg = T, xlab = "Qsec", ylab = "Mpg")
visreg(best1, xvar = "am", gg = T, xlab = "AM", ylab = "Mpg")
par(mfrow = c(2,2))
plot(best1)
best2 = lm(mpg ~ wt + qsec, data = mtcars)
summary(best2)
##
## Call:
## lm(formula = mpg ~ wt + qsec, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3962 -2.1431 -0.2129 1.4915 5.7486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.7462 5.2521 3.760 0.000765 ***
## wt -5.0480 0.4840 -10.430 2.52e-11 ***
## qsec 0.9292 0.2650 3.506 0.001500 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.596 on 29 degrees of freedom
## Multiple R-squared: 0.8264, Adjusted R-squared: 0.8144
## F-statistic: 69.03 on 2 and 29 DF, p-value: 9.395e-12
par(mfrow = c(1,2)) # chia 1 dòng 2 cột
visreg(best2, xvar = "wt", gg = T, xlab = "Wt", ylab = "Mpg")
visreg(best2, xvar = "qsec", gg = T, xlab = "Qsec", ylab = "Mpg")
par(mfrow = c(2,2))
plot(best2)