Regression

curl::curl_download('https://raw.githubusercontent.com/ywchiu/fda_course/main/cdc.Rdata', 'cdc.Rdata')

load("cdc.Rdata")
summary(cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00   m: 9569  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00            
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00
head(cdc)
numeric_data <- cdc[,c(2,3,4,5,6,7)]

co<- cor(numeric_data)
heatmap(co)

plot( wtdesire ~ weight, data = numeric_data )
fit <- lm(wtdesire~ weight, data = numeric_data)
fit
## 
## Call:
## lm(formula = wtdesire ~ weight, data = numeric_data)
## 
## Coefficients:
## (Intercept)       weight  
##      46.664        0.639
plot(wtdesire~ weight, data = numeric_data)
abline(fit, col ='red')

summary(fit)
## 
## Call:
## lm(formula = wtdesire ~ weight, data = numeric_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -167.98   -9.32    0.08   11.51  518.31 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.664015   0.590782   78.99   <2e-16 ***
## weight       0.639014   0.003388  188.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.21 on 19998 degrees of freedom
## Multiple R-squared:  0.6401, Adjusted R-squared:  0.6401 
## F-statistic: 3.556e+04 on 1 and 19998 DF,  p-value: < 2.2e-16

House Price Prediction

library(readr)
## Warning: 套件 'readr' 是用 R 版本 4.1.3 來建造的
rent_591_sample <- read_csv("https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/rent_591_sample.csv")
## New names:
## * `` -> ...1
## Rows: 720 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): address, layout
## dbl (6): ...1, id, floor, allfloor, area, browsenum_all
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(rent_591_sample)
?strsplit
## starting httpd help server ... done
#rent_591_sample$layout

#rent_591_sample

fit <- lm(price ~ floor + allfloor + area, data = rent_591_sample)

fit
## 
## Call:
## lm(formula = price ~ floor + allfloor + area, data = rent_591_sample)
## 
## Coefficients:
## (Intercept)        floor     allfloor         area  
##      2979.6        322.6        110.1       1342.1
summary(fit)
## 
## Call:
## lm(formula = price ~ floor + allfloor + area, data = rent_591_sample)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84167 -11230  -1629   8622 186182 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2979.65    2402.81   1.240    0.215    
## floor         322.58      81.25   3.970 7.91e-05 ***
## allfloor      110.11     180.85   0.609    0.543    
## area         1342.11      36.38  36.894  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22770 on 716 degrees of freedom
## Multiple R-squared:  0.677,  Adjusted R-squared:  0.6756 
## F-statistic: 500.2 on 3 and 716 DF,  p-value: < 2.2e-16

分類模型

data(iris)
plot(iris$Petal.Length~ iris$Petal.Width, col =iris$Species)

summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris<- iris[(iris$Species!= 'setosa'), ]

iris$Species <- factor(iris$Species)

fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, family = binomial)

fit
## 
## Call:  glm(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     Petal.Width, family = binomial, data = iris)
## 
## Coefficients:
##  (Intercept)  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width  
##      -42.638        -2.465        -6.681         9.429        18.286  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  95 Residual
## Null Deviance:       138.6 
## Residual Deviance: 11.9  AIC: 21.9
summary(fit)
## 
## Call:
## glm(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     Petal.Width, family = binomial, data = iris)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.01105  -0.00541  -0.00001   0.00677   1.78065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -42.638     25.707  -1.659   0.0972 .
## Sepal.Length   -2.465      2.394  -1.030   0.3032  
## Sepal.Width    -6.681      4.480  -1.491   0.1359  
## Petal.Length    9.429      4.737   1.991   0.0465 *
## Petal.Width    18.286      9.743   1.877   0.0605 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.629  on 99  degrees of freedom
## Residual deviance:  11.899  on 95  degrees of freedom
## AIC: 21.899
## 
## Number of Fisher Scoring iterations: 10
X <- iris[seq(1,3), ]

predicted_y <- predict(fit, X, type ='response') 
predicted_y > 0.5
##    51    52    53 
## FALSE FALSE FALSE
predicted_y <- predict(fit, iris, type ='response') 


y <- iris$Species == 'virginica'
y_ <- predicted_y > 0.5


length(y_)
## [1] 100
sum(y_ == y) / length(y_)
## [1] 0.98
table(y, y_)
##        y_
## y       FALSE TRUE
##   FALSE    49    1
##   TRUE      1   49
set.seed(42)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob = c(0.7,0.3))

train_data <- iris[idx == 1, ]
test_data <- iris[idx == 2, ]

nrow(train_data)
## [1] 66
nrow(test_data)
## [1] 34
fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = train_data, family = binomial)

predicted_y <- predict(fit, test_data, type ='response') 
y <- test_data$Species == 'virginica'


sum((predicted_y > 0.5) == y) / length(y)
## [1] 0.9411765
predicted_y2 <- predict(fit, train_data, type ='response') 
y2 <- train_data$Species == 'virginica'
sum((predicted_y2 > 0.5) == y2) / length(y2)
## [1] 0.969697

Cross Validation

library(caret)
## Warning: 套件 'caret' 是用 R 版本 4.1.3 來建造的
## 載入需要的套件:ggplot2
## Warning: 套件 'ggplot2' 是用 R 版本 4.1.3 來建造的
## 載入需要的套件:lattice
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(Species~., data=iris, method = "glm",
family = "binomial", trControl=control)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model
## Generalized Linear Model 
## 
## 100 samples
##   4 predictor
##   2 classes: 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 90, 90, 90, 90, 90, 90, ... 
## Resampling results:
## 
##   Accuracy  Kappa
##   0.96      0.92

## ROC

fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, family = binomial)



predicted_y <- predict(fit, iris, type ='response') 
predicted_y <- predicted_y > 0.8
y <- iris$Species == 'virginica'
table(predicted_y, y)
##            y
## predicted_y FALSE TRUE
##       FALSE    49    2
##       TRUE      1   48
library(ROCR)
## Warning: 套件 'ROCR' 是用 R 版本 4.1.3 來建造的
predictions <- predict(fit, iris, type="response")
y <- iris$Species == 'virginica'
pred.rocr <- prediction(predictions,y)
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))