library(dplyr)
# Load the built-in `women` data set and preview its first six rows.
data(list = "women")
head(women, n = 6L)
## height weight
## 1 58 115
## 2 59 117
## 3 60 120
## 4 61 123
## 5 62 126
## 6 63 129
library(ggplot2)
# Scatter plot of weight against height for the `women` data set.
ggplot(data = women, mapping = aes(x = height, y = weight)) +
  geom_point(colour = "darkblue", size = 2) +
  labs(
    title = "Bieu do hoi quy chieu cao can nang nu gioi",
    x = "Chieu cao(inches)",
    y = "Can nang(pounds)"
  ) +
  theme_minimal()

# Build the simple linear regression model: weight explained by height.
fit <- lm(
  formula = weight ~ height,
  data = women
)
summary(fit)
##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
# Residuals: observed value - predicted value = error
# => Height has a positive, statistically significant linear effect on weight
# => The regression equation is: weight = -87.51667 + 3.45 * height
# Fitted values: a named numeric vector with one element per observation,
# i.e. the weight predicted from height by the fitted regression equation
# (a prediction, not the observed weight).
fitted.values(fit)
## 1 2 3 4 5 6 7 8
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333
## 9 10 11 12 13 14 15
## 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
# Attach the model's fitted values and residuals to the data, then show it.
women[["prediction_value"]] <- fitted.values(fit)
women[["residual_value"]] <- resid(fit)
women
## height weight prediction_value residual_value
## 1 58 115 112.5833 2.41666667
## 2 59 117 116.0333 0.96666667
## 3 60 120 119.4833 0.51666667
## 4 61 123 122.9333 0.06666667
## 5 62 126 126.3833 -0.38333333
## 6 63 129 129.8333 -0.83333333
## 7 64 132 133.2833 -1.28333333
## 8 65 135 136.7333 -1.73333333
## 9 66 139 140.1833 -1.18333333
## 10 67 142 143.6333 -1.63333333
## 11 68 146 147.0833 -1.08333333
## 12 69 150 150.5333 -0.53333333
## 13 70 154 153.9833 0.01666667
## 14 71 159 157.4333 1.56666667
## 15 72 164 160.8833 3.11666667
# Overlay observed weights (red) and model-predicted weights (dark blue)
# against height. Reads the `prediction_value` column of `women`.
ggplot(data = women, mapping = aes(x = height, y = weight)) +
  # Observed weight: uses the mapping inherited from ggplot().
  geom_point(colour = "red", size = 2, alpha = 0.8) +
  # Predicted weight: only the y aesthetic is overridden.
  geom_point(
    mapping = aes(y = prediction_value),
    colour = "darkblue",
    size = 2,
    alpha = 0.8
  ) +
  labs(
    title = "Bieu do can nang thuc te va can nang du doan tu chieu cao nu gioi",
    x = "Chieu cao(inches)",
    y = "Can nang(pounds)"
  ) +
  theme_minimal()

# Draw the regression line: observed points plus the least-squares line and
# its confidence band (se = TRUE).
# The smoothing formula is stated explicitly so geom_smooth() does not have
# to guess it (recent ggplot2 versions emit a "using formula" message when
# it is left out).
ggplot(data = women, aes(x = height, y = weight)) +
  geom_point(size = 2, colour = "red") +
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE) +
  xlab("Chieu cao(inches)") +
  ylab("Can nang(pounds)") +
  ggtitle("Scatter plot with fitted regression line") +
  theme_minimal()

library(PerformanceAnalytics)
# Keep only the state.x77 columns used in the murder-rate analysis and
# convert the resulting matrix to a data frame.
states <- as.data.frame(
  state.x77[
    ,
    c("Murder", "Population", "Illiteracy", "Income", "Frost"),
    drop = FALSE
  ]
)
# Correlation analysis: draw the correlation-matrix chart for `states`,
# with histograms enabled.
PerformanceAnalytics::chart.Correlation(states, histogram = TRUE)

# => The murder rate is positively correlated with the illiteracy rate; it is
#    also correlated with income and with the number of frost days, and shows
#    no clear linear relationship with population
# Multiple linear regression: murder rate explained by population,
# illiteracy, income and frost days. Note: this reassigns `fit`.
fit <- lm(
  formula = Murder ~ Population + Illiteracy + Income + Frost,
  data = states
)
summary(fit)
##
## Call:
## lm(formula = Murder ~ Population + Illiteracy + Income + Frost,
## data = states)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7960 -1.6495 -0.0811 1.4815 7.6210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.235e+00 3.866e+00 0.319 0.7510
## Population 2.237e-04 9.052e-05 2.471 0.0173 *
## Illiteracy 4.143e+00 8.744e-01 4.738 2.19e-05 ***
## Income 6.442e-05 6.837e-04 0.094 0.9253
## Frost 5.813e-04 1.005e-02 0.058 0.9541
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.535 on 45 degrees of freedom
## Multiple R-squared: 0.567, Adjusted R-squared: 0.5285
## F-statistic: 14.73 on 4 and 45 DF, p-value: 9.133e-08
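# => The fitted linear regression equation is:
#    Murder = 1.235 + 2.237e-04 * Population + 4.143 * Illiteracy
#             + 6.442e-05 * Income + 5.813e-04 * Frost
# => Illiteracy and Population have the strongest effect (they are the only
#    predictors that are statistically significant at the 0.05 level)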
# Store the model's fitted values and residuals alongside the data.
states[["pre"]] <- fitted.values(fit)
states[["res"]] <- resid(fit)
# Confidence intervals (used to judge whether each regression coefficient
# differs from 0):
#   - the interval contains 0       => the coefficient can be ignored
#   - the interval does not contain 0 => the coefficient matters
confint(fit, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -6.552191e+00 9.0213182149
## Population 4.136397e-05 0.0004059867
## Illiteracy 2.381799e+00 5.9038743192
## Income -1.312611e-03 0.0014414600
## Frost -1.966781e-02 0.0208304170
# Residual diagnostic plots for the fitted model, drawn in a 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later plots in the session are not forced into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(fit)
par(old_par)

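# Plot 1 (Residuals vs Fitted): residuals scattered around 0 indicate a good
#   fit => the prediction is too high compared with the actual value for
#   Rhode Island and Massachusetts (negative residuals) and too low for
#   Nevada (positive residual)
# Plot 2 (Normal Q-Q): checks whether the residuals are normally distributed
#   => the residuals of these 3 states do not follow the normal distribution
# Plot 3 (Scale-Location): is the variance of the residuals homogeneous?
# Plot 4 (Residuals vs Leverage): outliers among the residuals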