library(dplyr)

# Load the built-in `women` dataset and preview its first six rows.
data(women)
head(women, n = 6L)
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
library(ggplot2)

# Scatter plot of weight against height for the `women` dataset.
ggplot(data = women, mapping = aes(x = height, y = weight)) +
  geom_point(colour = "darkblue", size = 2) +
  labs(
    x = "Chieu cao(inches)",
    y = "Can nang(pounds)",
    title = "Bieu do hoi quy chieu cao can nang nu gioi"
  ) +
  theme_minimal()

# Build the simple linear regression model (weight explained by height)
# and print its summary.
fit <- lm(formula = weight ~ height, data = women)
summary(fit)
## 
## Call:
## lm(formula = weight ~ height, data = women)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
## height        3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14
# Residuals: actual value - predicted value = error.
# => Height has a positive, statistically significant linear effect on weight.
# => The regression equation is: weight = -87.51667 + 3.45 * height
# Fitted values: a named numeric vector with one element per observation,
# i.e. the weight predicted from height by the regression equation
# (a prediction, not the observed weight).
fitted(fit)
##        1        2        3        4        5        6        7        8 
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333 
##        9       10       11       12       13       14       15 
## 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
# Store the predictions and residuals next to the observed data, then print
# the augmented data frame.
women[["prediction_value"]] <- fitted(fit)
women[["residual_value"]] <- residuals(fit)
women
##    height weight prediction_value residual_value
## 1      58    115         112.5833     2.41666667
## 2      59    117         116.0333     0.96666667
## 3      60    120         119.4833     0.51666667
## 4      61    123         122.9333     0.06666667
## 5      62    126         126.3833    -0.38333333
## 6      63    129         129.8333    -0.83333333
## 7      64    132         133.2833    -1.28333333
## 8      65    135         136.7333    -1.73333333
## 9      66    139         140.1833    -1.18333333
## 10     67    142         143.6333    -1.63333333
## 11     68    146         147.0833    -1.08333333
## 12     69    150         150.5333    -0.53333333
## 13     70    154         153.9833     0.01666667
## 14     71    159         157.4333     1.56666667
## 15     72    164         160.8833     3.11666667
# Overlay the observed weights (red) and the model-predicted weights
# (dark blue) against height.
ggplot(data = women, mapping = aes(x = height, y = weight)) +
  # Observed weight: inherits x and y from the plot-level mapping.
  geom_point(colour = "red", size = 2, alpha = 0.8) +
  # Predicted weight: only y is overridden.
  geom_point(
    mapping = aes(y = prediction_value),
    colour = "darkblue", size = 2, alpha = 0.8
  ) +
  labs(
    x = "Chieu cao(inches)",
    y = "Can nang(pounds)",
    title = "Bieu do can nang thuc te va can nang du doan tu chieu cao nu gioi"
  ) +
  theme_minimal()

# Draw the regression line: observed points plus the linear fit and its
# standard-error band (se = TRUE).
ggplot(data = women, mapping = aes(x = height, y = weight)) +
  geom_point(size = 2, colour = "red") +
  # State the model formula explicitly. It is the same y ~ x fit that
  # geom_smooth() falls back to for method = "lm", but naming it stops
  # ggplot2 from printing a "using formula" message on every render.
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE) +
  labs(
    x = "Chieu cao(inches)",
    y = "Can nang(pounds)",
    title = "Scatter plot with fitted regression line"
  ) +
  theme_minimal()

library(PerformanceAnalytics)

# Keep the murder rate and the four candidate predictors from state.x77
# as a data frame.
states <- as.data.frame(
  state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")]
)

# Correlation analysis: draw the correlation-matrix chart for all columns
# of `states`.
chart.Correlation(states, histogram = TRUE)

# => The murder rate is positively correlated with illiteracy and is also
#    correlated with income and the number of frost days; it shows no clear
#    linear relationship with population.
# Multiple linear regression of the murder rate on all four predictors.
# NOTE: this reuses the name `fit`, replacing the women model fitted earlier.
fit <- lm(
  formula = Murder ~ Population + Illiteracy + Income + Frost,
  data = states
)
summary(fit)
## 
## Call:
## lm(formula = Murder ~ Population + Illiteracy + Income + Frost, 
##     data = states)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7960 -1.6495 -0.0811  1.4815  7.6210 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.235e+00  3.866e+00   0.319   0.7510    
## Population  2.237e-04  9.052e-05   2.471   0.0173 *  
## Illiteracy  4.143e+00  8.744e-01   4.738 2.19e-05 ***
## Income      6.442e-05  6.837e-04   0.094   0.9253    
## Frost       5.813e-04  1.005e-02   0.058   0.9541    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.535 on 45 degrees of freedom
## Multiple R-squared:  0.567,  Adjusted R-squared:  0.5285 
## F-statistic: 14.73 on 4 and 45 DF,  p-value: 9.133e-08
# => The fitted regression equation is:
#    Murder = 1.235 + 2.237e-04 * Population + 4.143 * Illiteracy
#             + 6.442e-05 * Income + 5.813e-04 * Frost
# => Illiteracy and Population are the predictors with a statistically
#    significant effect (p < 0.05 in the summary table).
# Keep the fitted values and residuals alongside the data.
states[["pre"]] <- fitted(fit)
states[["res"]] <- residuals(fit)

# Confidence intervals: used to judge whether a regression coefficient
# differs from 0.
#   interval contains 0         => the coefficient can be disregarded
#   interval does not contain 0 => the coefficient matters
confint(fit, level = 0.95)
##                     2.5 %       97.5 %
## (Intercept) -6.552191e+00 9.0213182149
## Population   4.136397e-05 0.0004059867
## Illiteracy   2.381799e+00 5.9038743192
## Income      -1.312611e-03 0.0014414600
## Frost       -1.966781e-02 0.0208304170
# Residual diagnostic plots.
# Lay the diagnostic plots of the model out in a 2 x 2 grid. par() returns
# the previous settings, which are restored afterwards so that later plots
# on this device are not drawn into the grid.
old_par <- par(mfrow = c(2, 2))
plot(fit)
par(old_par)

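# Plot 1 (Residuals vs Fitted): residuals against the predicted values;
# residuals scattered around 0 indicate a good fit. Because
# residual = actual - predicted, the model under-predicts the murder rate for
# Nevada (large positive residual) and over-predicts it for Rhode Island and
# Massachusetts (large negative residuals).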

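# Plot 2 (Normal Q-Q): checks whether the residuals are normally distributed
# => the residuals of three states deviate from the normal distribution.

# Plot 3 (Scale-Location): is the variance of the residuals homogeneous?

# Plot 4 (Residuals vs Leverage): outliers among the residuals.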