curl::curl_download('https://raw.githubusercontent.com/ywchiu/fda_course/main/cdc.Rdata', 'cdc.Rdata')
load("cdc.Rdata")
summary(cdc)
## genhlth exerany hlthplan smoke100
## excellent:4657 Min. :0.0000 Min. :0.0000 Min. :0.0000
## very good:6972 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## good :5675 Median :1.0000 Median :1.0000 Median :0.0000
## fair :2019 Mean :0.7457 Mean :0.8738 Mean :0.4721
## poor : 677 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## height weight wtdesire age gender
## Min. :48.00 Min. : 68.0 Min. : 68.0 Min. :18.00 m: 9569
## 1st Qu.:64.00 1st Qu.:140.0 1st Qu.:130.0 1st Qu.:31.00 f:10431
## Median :67.00 Median :165.0 Median :150.0 Median :43.00
## Mean :67.18 Mean :169.7 Mean :155.1 Mean :45.07
## 3rd Qu.:70.00 3rd Qu.:190.0 3rd Qu.:175.0 3rd Qu.:57.00
## Max. :93.00 Max. :500.0 Max. :680.0 Max. :99.00
head(cdc)
numeric_data <- cdc[,c(2,3,4,5,6,7)]
co<- cor(numeric_data)
heatmap(co)
plot( wtdesire ~ weight, data = numeric_data )
fit <- lm(wtdesire~ weight, data = numeric_data)
fit
##
## Call:
## lm(formula = wtdesire ~ weight, data = numeric_data)
##
## Coefficients:
## (Intercept) weight
## 46.664 0.639
plot(wtdesire~ weight, data = numeric_data)
abline(fit, col ='red')
summary(fit)
##
## Call:
## lm(formula = wtdesire ~ weight, data = numeric_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.98 -9.32 0.08 11.51 518.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.664015 0.590782 78.99 <2e-16 ***
## weight 0.639014 0.003388 188.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.21 on 19998 degrees of freedom
## Multiple R-squared: 0.6401, Adjusted R-squared: 0.6401
## F-statistic: 3.556e+04 on 1 and 19998 DF, p-value: < 2.2e-16
library(readr)
## Warning: 套件 'readr' 是用 R 版本 4.1.3 來建造的
rent_591_sample <- read_csv("https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/rent_591_sample.csv")
## New names:
## * `` -> ...1
## Rows: 720 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): address, layout
## dbl (6): ...1, id, floor, allfloor, area, browsenum_all
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(rent_591_sample)
?strsplit
## starting httpd help server ... done
#rent_591_sample$layout
#rent_591_sample
fit <- lm(price ~ floor + allfloor + area, data = rent_591_sample)
fit
##
## Call:
## lm(formula = price ~ floor + allfloor + area, data = rent_591_sample)
##
## Coefficients:
## (Intercept) floor allfloor area
## 2979.6 322.6 110.1 1342.1
summary(fit)
##
## Call:
## lm(formula = price ~ floor + allfloor + area, data = rent_591_sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84167 -11230 -1629 8622 186182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2979.65 2402.81 1.240 0.215
## floor 322.58 81.25 3.970 7.91e-05 ***
## allfloor 110.11 180.85 0.609 0.543
## area 1342.11 36.38 36.894 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22770 on 716 degrees of freedom
## Multiple R-squared: 0.677, Adjusted R-squared: 0.6756
## F-statistic: 500.2 on 3 and 716 DF, p-value: < 2.2e-16
data(iris)
plot(iris$Petal.Length~ iris$Petal.Width, col =iris$Species)
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris<- iris[(iris$Species!= 'setosa'), ]
iris$Species <- factor(iris$Species)
fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, family = binomial)
fit
##
## Call: glm(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length +
## Petal.Width, family = binomial, data = iris)
##
## Coefficients:
## (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
## -42.638 -2.465 -6.681 9.429 18.286
##
## Degrees of Freedom: 99 Total (i.e. Null); 95 Residual
## Null Deviance: 138.6
## Residual Deviance: 11.9 AIC: 21.9
summary(fit)
##
## Call:
## glm(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length +
## Petal.Width, family = binomial, data = iris)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.01105 -0.00541 -0.00001 0.00677 1.78065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.638 25.707 -1.659 0.0972 .
## Sepal.Length -2.465 2.394 -1.030 0.3032
## Sepal.Width -6.681 4.480 -1.491 0.1359
## Petal.Length 9.429 4.737 1.991 0.0465 *
## Petal.Width 18.286 9.743 1.877 0.0605 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.629 on 99 degrees of freedom
## Residual deviance: 11.899 on 95 degrees of freedom
## AIC: 21.899
##
## Number of Fisher Scoring iterations: 10
X <- iris[seq(1,3), ]
predicted_y <- predict(fit, X, type ='response')
predicted_y > 0.5
## 51 52 53
## FALSE FALSE FALSE
predicted_y <- predict(fit, iris, type ='response')
y <- iris$Species == 'virginica'
y_ <- predicted_y > 0.5
length(y_)
## [1] 100
sum(y_ == y) / length(y_)
## [1] 0.98
table(y, y_)
## y_
## y FALSE TRUE
## FALSE 49 1
## TRUE 1 49
set.seed(42)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob = c(0.7,0.3))
train_data <- iris[idx == 1, ]
test_data <- iris[idx == 2, ]
nrow(train_data)
## [1] 66
nrow(test_data)
## [1] 34
fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = train_data, family = binomial)
predicted_y <- predict(fit, test_data, type ='response')
y <- test_data$Species == 'virginica'
sum((predicted_y > 0.5) == y) / length(y)
## [1] 0.9411765
predicted_y2 <- predict(fit, train_data, type ='response')
y2 <- train_data$Species == 'virginica'
sum((predicted_y2 > 0.5) == y2) / length(y2)
## [1] 0.969697
library(caret)
## Warning: 套件 'caret' 是用 R 版本 4.1.3 來建造的
## 載入需要的套件:ggplot2
## Warning: 套件 'ggplot2' 是用 R 版本 4.1.3 來建造的
## 載入需要的套件:lattice
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(Species~., data=iris, method = "glm",
family = "binomial", trControl=control)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model
## Generalized Linear Model
##
## 100 samples
## 4 predictor
## 2 classes: 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 90, 90, 90, 90, 90, 90, ...
## Resampling results:
##
## Accuracy Kappa
## 0.96 0.92
## ROC
fit <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, family = binomial)
predicted_y <- predict(fit, iris, type ='response')
predicted_y <- predicted_y > 0.8
y <- iris$Species == 'virginica'
table(predicted_y, y)
## y
## predicted_y FALSE TRUE
## FALSE 49 2
## TRUE 1 48
library(ROCR)
## Warning: 套件 'ROCR' 是用 R 版本 4.1.3 來建造的
predictions <- predict(fit, iris, type="response")
y <- iris$Species == 'virginica'
pred.rocr <- prediction(predictions,y)
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))