Textbook: Linear Models with R, 2nd ed., by Julian J. Faraway
Package faraway 裡面收集了這本書用到的資料。
# One-time setup: uncomment to install the package that ships the data set.
# install.packages("faraway")
library("faraway")
## Warning: package 'faraway' was built under R version 4.3.1
# Load the pima data frame into the workspace and open its help page.
data("pima")
help("pima")
## starting httpd help server ... done
資料 pima:
Description: The National Institute of Diabetes and Digestive and Kidney Diseases conducted a study on 768 adult female Pima Indians living near Phoenix.
Format: The dataset contains the following variables
pregnant: Number of times pregnant
glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
diastolic: Diastolic blood pressure (mm Hg)
triceps: Triceps skin fold thickness (mm)
insulin: 2-Hour serum insulin (mu U/ml)
bmi: Body mass index (weight in kg/(height in metres squared))
diabetes: Diabetes pedigree function
age: Age (years)
test: test whether the patient shows signs of diabetes (coded 0 if negative, 1 if positive)
-資料整理: missing data, data type
# Number of rows and columns in the data frame.
dim(pima)
## [1] 768 9
# Preview the first six rows.
head(pima, n = 6L)
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 6 148 72 35 0 33.6 0.627 50 1
## 2 1 85 66 29 0 26.6 0.351 31 0
## 3 8 183 64 0 0 23.3 0.672 32 1
## 4 1 89 66 23 94 28.1 0.167 21 0
## 5 0 137 40 35 168 43.1 2.288 33 1
## 6 5 116 74 0 0 25.6 0.201 30 0
# Storage type of every column together with its leading values.
str(pima)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant : int 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ diastolic: int 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : int 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ bmi : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ diabetes : num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ test : int 1 0 1 0 1 0 1 0 1 1 ...
# Five-number summary plus mean for each column; note the minimum of 0
# reported for glucose, diastolic, triceps, insulin and bmi.
summary(pima)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin bmi diabetes age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## test
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# The 100 smallest diastolic readings in ascending order (sort() orders values).
head(sort(pima[["diastolic"]]), n = 100L)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [26] 0 0 0 0 0 0 0 0 0 0 24 30 30 38 40 44 44 44 44 46 46 48 48 48 48
## [51] 48 50 50 50 50 50 50 50 50 50 50 50 50 50 52 52 52 52 52 52 52 52 52 52 52
## [76] 54 54 54 54 54 54 54 54 54 54 54 55 55 56 56 56 56 56 56 56 56 56 56 56 56
0 為 missing data,將 missing 的資料改成 NA (Not Available)
# Measurement columns in which a stored 0 is recoded to NA (missing).
zero_coded <- c("diastolic", "glucose", "triceps", "insulin", "bmi")
pima[zero_coded] <- lapply(
  pima[zero_coded],
  function(x) replace(x, x == 0, NA)
)
# Store the test result as a categorical variable instead of an integer.
pima[["test"]] <- factor(pima[["test"]])
summary(pima[["test"]])
## 0 1
## 500 268
levels(pima[["test"]])
## [1] "0" "1"
# Relabel the levels in their existing order: "0" -> negative, "1" -> positive.
levels(pima[["test"]]) <- c("negative", "positive")
# Summarise again to see the NA counts and the relabelled factor.
summary(pima)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## insulin bmi diabetes age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## test
## negative:500
## positive:268
##
##
##
##
##
-Plot
# Univariate views of diastolic blood pressure.
hist(pima$diastolic, xlab = "Diastolic", main = "") # histogram
# density() does not drop NA values on its own, hence na.rm = TRUE.
plot(density(pima$diastolic, na.rm = TRUE), main = "") # kernel density estimate
# sort() drops NA values by default, so only observed readings are drawn.
plot(sort(pima$diastolic), ylab = "Sorted Diastolic")
# Bivariate views: diabetes pedigree function against other variables.
plot(pima$diastolic, pima$diabetes) # scatter plot
boxplot(diabetes ~ test, pima) # boxplot
plot(diabetes ~ diastolic, pima) # same scatter plot via the formula interface
# With a factor on the right-hand side, plot() draws one boxplot per level.
plot(diabetes ~ test, pima)