HW3
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
# Download the tuberculosis case table from the course repository; readr
# guesses the column types and reports them when the file is parsed.
tb <- read_csv(file = "https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/tb.csv")
## `curl` package not installed, falling back to using `url()`
## Parsed with column specification:
## cols(
## 確定病名 = col_character(),
## 建檔年份 = col_integer(),
## 建檔月份 = col_integer(),
## 縣市 = col_character(),
## 鄉鎮 = col_character(),
## 性別 = col_character(),
## 國籍 = col_character(),
## 年齡層 = col_character(),
## 確定病例數 = col_integer()
## )
# View(tb)  # interactive viewer, left disabled
# Preview the first six rows of the case table.
head(tb, n = 6L)
## # A tibble: 6 x 9
## 確定病名 建檔年份 建檔月份 縣市 鄉鎮 性別 國籍 年齡層 確定病例數
## <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <int>
## 1 結核病 2005 1 台北市 大安區 F 本國籍 50-54 1
## 2 結核病 2005 1 台東縣 卑南鄉 M 本國籍 70+ 1
## 3 結核病 2005 1 台南市 中西區 M 本國籍 55-59 1
## 4 結核病 2005 1 宜蘭縣 南澳鄉 M 本國籍 20-24 1
## 5 結核病 2005 1 桃園市 中壢區 F 本國籍 30-34 1
## 6 結核病 2005 1 桃園市 蘆竹區 M 本國籍 70+ 3
# Show the class vector of the object returned by read_csv().
class(tb)
## [1] "tbl_df" "tbl" "data.frame"
# Show each column's type and first values, plus the attached column spec.
str(tb)
## Classes 'tbl_df', 'tbl' and 'data.frame': 66609 obs. of 9 variables:
## $ 確定病名 : chr "結核病" "結核病" "結核病" "結核病" ...
## $ 建檔年份 : int 2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
## $ 建檔月份 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ 縣市 : chr "台北市" "台東縣" "台南市" "宜蘭縣" ...
## $ 鄉鎮 : chr "大安區" "卑南鄉" "中西區" "南澳鄉" ...
## $ 性別 : chr "F" "M" "M" "M" ...
## $ 國籍 : chr "本國籍" "本國籍" "本國籍" "本國籍" ...
## $ 年齡層 : chr "50-54" "70+" "55-59" "20-24" ...
## $ 確定病例數: int 1 1 1 1 1 3 1 9 1 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 9
## .. ..$ 確定病名 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 建檔年份 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ 建檔月份 : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ 縣市 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 鄉鎮 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 性別 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 國籍 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 年齡層 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ 確定病例數: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Per-column summary: quartiles for the integer columns and
# length/class/mode for the character columns.
summary(tb)
## 確定病名 建檔年份 建檔月份 縣市
## Length:66609 Min. :2005 Min. : 1.000 Length:66609
## Class :character 1st Qu.:2006 1st Qu.: 4.000 Class :character
## Mode :character Median :2008 Median : 6.000 Mode :character
## Mean :2008 Mean : 6.514
## 3rd Qu.:2010 3rd Qu.:10.000
## Max. :2017 Max. :12.000
## 鄉鎮 性別 國籍
## Length:66609 Length:66609 Length:66609
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## 年齡層 確定病例數
## Length:66609 Min. : 1.00
## Class :character 1st Qu.: 1.00
## Mode :character Median : 1.00
## Mean : 1.35
## 3rd Qu.: 1.00
## Max. :14.00
# Convert every categorical text column to a factor in a single pass.
# local() keeps the helper vector out of the global environment.
tb <- local({
  factor_cols <- c("確定病名", "縣市", "鄉鎮", "性別", "國籍", "年齡層")
  tb[factor_cols] <- lapply(tb[factor_cols], as.factor)
  tb
})
## 1. How many records (rows) does this dataset contain?
# method 1: row count only
nrow(tb)
## [1] 66609
# method 2: dimensions as rows, then columns
dim(tb)
## [1] 66609 9
# method 3: count the rows with dplyr (the summarise() call further down)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# n() counts the rows; the result is a one-row tibble with column `cnt`.
summarise(tb, cnt = n())
## # A tibble: 1 x 1
## cnt
## <int>
## 1 66609
# 2. In which filing year and month did domestic-nationality patients have
#    the most confirmed cases?
library(dplyr)
# str(tb)
# SQL equivalent:
#   SELECT 建檔年份, 建檔月份, sum(確定病例數) FROM tb
#   WHERE 國籍 = '本國籍' GROUP BY 建檔年份, 建檔月份
#   ORDER BY sum(確定病例數) DESC LIMIT 1
tb %>%
  filter(國籍 == "本國籍") %>%
  group_by(建檔年份, 建檔月份) %>%
  summarise(sum_of_instances = sum(確定病例數)) %>%
  arrange(desc(sum_of_instances)) %>%
  head(n = 1)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 1 x 3
## # Groups: 建檔年份 [1]
## 建檔年份 建檔月份 sum_of_instances
## <int> <int> <int>
## 1 2005 5 1526
# 3. Following question 2: in the year-month with the most cases, which three
#    counties had the most domestic-nationality cases?
# SQL equivalent:
#   SELECT 縣市, sum(確定病例數) FROM tb
#   WHERE 建檔年份 = <peak year> AND 建檔月份 = <peak month> AND 國籍 = '本國籍'
#   GROUP BY 縣市 ORDER BY sum(確定病例數) DESC LIMIT 3
#
# The peak year-month is computed here instead of being typed in by hand: the
# original filter hard-coded 2006-05, while the recorded answer to question 2
# is 2005-05.
peak_month <- tb %>%
  filter(國籍 == '本國籍') %>%
  group_by(建檔年份, 建檔月份) %>%
  summarise(sum_of_instances = sum(確定病例數)) %>%
  ungroup() %>%
  arrange(desc(sum_of_instances)) %>%
  head(1)

# peak_month is a one-row tibble, so each `$` lookup below is a scalar.
tb %>%
  filter(
    建檔年份 == peak_month$建檔年份,
    建檔月份 == peak_month$建檔月份,
    國籍 == '本國籍'
  ) %>%
  group_by(縣市) %>%
  summarise(sum_of_instances = sum(確定病例數)) %>%
  arrange(desc(sum_of_instances)) %>%
  head(3) %>%
  select(縣市)
# NOTE: the recorded output that follows in this transcript was produced by
# the old 2006-05 filter; re-run this chunk to refresh it.
## # A tibble: 3 x 1
## 縣市
## <fct>
## 1 高雄市
## 2 新北市
## 3 台中市
# 4. For domestic-nationality patients, which sex and age-group combination
#    has the most cases?
# SQL equivalent:
#   SELECT 性別, 年齡層, SUM(確定病例數) FROM tb WHERE 國籍 = '本國籍'
#   GROUP BY 性別, 年齡層 ORDER BY SUM(確定病例數) DESC LIMIT 1
tb %>%
  filter(國籍 == "本國籍") %>%
  select(性別, 年齡層, 確定病例數) %>%
  group_by(性別, 年齡層) %>%
  summarise(sum_of_instances = sum(確定病例數)) %>%
  arrange(desc(sum_of_instances)) %>%
  head(n = 1)
## # A tibble: 1 x 3
## # Groups: 性別 [1]
## 性別 年齡層 sum_of_instances
## <fct> <fct> <int>
## 1 M 70+ 28051
# 5. How many distinct case-origin combinations (nationality / county /
#    township) are there?
# SQL equivalent: SELECT DISTINCT(國籍, 縣市, 鄉鎮) FROM tb;
tb %>%
  select(國籍, 縣市, 鄉鎮) %>%
  summarise(n_distinct(國籍, 縣市, 鄉鎮))
## # A tibble: 1 x 1
## `n_distinct(國籍, 縣市, 鄉鎮)`
## <int>
## 1 482
# 6. Merge the tb table with the people table and find the three counties
#    with the highest tuberculosis infection ratio.
# Download the population table (region name plus total population).
people <- read_csv(file = 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/people.csv')
## `curl` package not installed, falling back to using `url()`
## Parsed with column specification:
## cols(
## 區域別 = col_character(),
## 總計 = col_number()
## )
# Open the population table in the interactive data viewer.
View(people)
# Quick check of the character substitution used for the join key below.
sub(pattern = '台', replacement = '臺', x = '台北市')
## [1] "臺北市"
# Total cases per county, plus a 區域別 key with '台' rewritten as '臺' so it
# can be matched against the population table's region names.
stat <- tb %>%
  group_by(縣市) %>%
  summarise(sum_of_instances = sum(確定病例數)) %>%
  mutate(區域別 = sub(pattern = '台', replacement = '臺', x = 縣市))
# Open the help page for the dplyr join functions.
help("join")
## starting httpd help server ...
## done
# Toy tables for demonstrating the four mutating joins. The key column c1 is
# kept as character (stringsAsFactors = FALSE): on R < 4.0, data.frame()
# turns strings into factors, and joining factors with different levels makes
# every join below warn and coerce the key to character.
a <- data.frame(c1 = c('A', 'B', 'C'), c2 = c(1, 2, 3), stringsAsFactors = FALSE)
a
## c1 c2
## 1 A 1
## 2 B 2
## 3 C 3
b <- data.frame(c1 = c('B', 'C', 'D'), c2 = c(2, 3, 4), stringsAsFactors = FALSE)
b
## c1 c2
## 1 B 2
## 2 C 3
## 3 D 4
# inner join: only keys present in both tables
inner_join(a, b, by = 'c1')
## c1 c2.x c2.y
## 1 B 2 2
## 2 C 3 3
# left join: every row of a; NA where b has no match
left_join(a, b, by = 'c1')
## c1 c2.x c2.y
## 1 A 1 NA
## 2 B 2 2
## 3 C 3 3
# right join: every row of b; NA where a has no match
right_join(a, b, by = 'c1')
## c1 c2.x c2.y
## 1 B 2 2
## 2 C 3 3
## 3 D NA 4
# full join: every key from either table
full_join(a, b, by = 'c1')
## c1 c2.x c2.y
## 1 A 1 NA
## 2 B 2 2
## 3 C 3 3
## 4 D NA 4
# method 1
# Preview the per-county case totals together with the 區域別 join key.
head(stat)
## # A tibble: 6 x 3
## 縣市 sum_of_instances 區域別
## <fct> <int> <chr>
## 1 台中市 8696 臺中市
## 2 台北市 7399 臺北市
## 3 台東縣 1528 臺東縣
## 4 台南市 7334 臺南市
## 5 宜蘭縣 2044 宜蘭縣
## 6 花蓮縣 2350 花蓮縣
# str(people)
# (disabled) join on differently named key columns:
# inner_join(stat, people, by = c('縣市' = '區域別'))
# Attach each county's total population to its case total.
m <- stat %>% inner_join(people, by = '區域別')
# method 2 (disabled): rename the population columns and join on 縣市
# names(people) <- c('縣市', '總計')
# m <- inner_join(stat, people, by = '縣市')
head(m, n = 6L)
## # A tibble: 6 x 4
## 縣市 sum_of_instances 區域別 總計
## <fct> <int> <chr> <dbl>
## 1 台中市 8696 臺中市 2790381.
## 2 台北市 7399 臺北市 2681375.
## 3 台東縣 1528 臺東縣 219436.
## 4 台南市 7334 臺南市 1886074.
## 5 宜蘭縣 2044 宜蘭縣 456259.
## 6 花蓮縣 2350 花蓮縣 328853.
# method 1: store the cases-per-resident ratio as a new column of m, then
# list the three regions with the highest ratio.
m[["ratio"]] <- m[["sum_of_instances"]] / m[["總計"]]
m %>%
  arrange(desc(ratio)) %>%
  head(n = 3) %>%
  select(區域別)
## # A tibble: 3 x 1
## 區域別
## <chr>
## 1 花蓮縣
## 2 臺東縣
## 3 屏東縣
# method 2: compute the ratio inside the pipeline instead of assigning it
# beforehand; m itself is not modified by this chain.
m %>%
  mutate(ratio = sum_of_instances / 總計) %>%
  arrange(desc(ratio)) %>%
  head(n = 3) %>%
  select(區域別)
## # A tibble: 3 x 1
## 區域別
## <chr>
## 1 花蓮縣
## 2 臺東縣
## 3 屏東縣
Line Chart
# Six points on the identity line, reused by the plotting examples below.
x <- 1:6
y <- x
y
## [1] 1 2 3 4 5 6
# type = "b" draws both the points and the segments joining them.
plot(x, y, type = "b")

# Plot types accepted by plot(): points, lines, overplotted, both, lines-only
# part of "b", stair steps, histogram-like vertical lines, and no plotting.
types <- c('p', 'l', 'o', 'b', 'c', 's', 'h', 'n')
# generate graph with 2 rows and 4 columns
# method 1: let plot() draw each type directly
par(mfrow = c(2, 4))
# seq_along() is safe even if `types` were empty (1:length(types) would then
# iterate over c(1, 0)); `type` is passed by name rather than by position.
for (i in seq_along(types)) {
  # print(types[i])
  plot(x, y, type = types[i], main = paste('type:', types[i]))
}

# Back to a single panel, then build the figure in layers: an empty frame,
# a red line, and blue points on top.
par(mfrow = c(1, 1))
plot(x, y, type = "n")
lines(x, y, col = "red")
points(x, y, col = "blue")

# method 2: draw an empty frame per panel, then add the data with lines().
# This layout is 4 rows by 2 columns.
par(mfrow = c(4, 2))
# seq_along() avoids the c(1, 0) trap of 1:length(types) on an empty vector.
for (i in seq_along(types)) {
  # print(types[i])
  plot(x, y, type = 'n', main = paste('type:', types[i]))
  lines(x, y, type = types[i])
}

# Single panel again; show line type (lty), line width (lwd) and point
# symbol (pch) options.
par(mfrow = c(1, 1))
plot(x, y, type = "l", col = "red", lty = "dotted", lwd = 5)

plot(x, y, type = "b", col = "red", lty = "dotted", lwd = 5, pch = 17)

# Open the help page for plot().
help("plot")
# Five rainfall readings per city, plotted against their index (labelled Month).
Taipei <- c(92.5, 132.6, 168.8, 159.1, 218.7)
Tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)
# First version: Taipei as the base plot, Tainan added as a dashed red line.
plot(
  Taipei,
  type = 'o', col = 'blue', ylim = c(0, 220),
  xlab = 'Month', ylab = 'Rainfall',
  main = 'Taipei Rainfall v.s. Tainan Rainfall'
)
lines(Tainan, type = 'o', col = 'red', pch = 22, lty = 'dashed')

# Second version: wider axis limits leave room for the text labels.
plot(
  Taipei,
  type = 'o', col = 'blue', ylim = c(0, 260), xlim = c(1, 6),
  xlab = 'Month', ylab = 'Rainfall',
  main = 'Taipei Rainfall v.s. Tainan Rainfall'
)
lines(Tainan, type = 'o', col = 'red', pch = 22, lty = 'dashed')
text(x = 5.5, y = 230, labels = 'Taipei', col = 'blue')
text(x = 5.5, y = 200, labels = 'Tainan', col = 'red')

Bar Chart
# Load the house-price sample data; the first line of the file is the header.
housePrice <- read.csv(
  file = 'https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv',
  header = TRUE
)
# Number of houses for each bedroom count.
bedroomTable <- table(housePrice[["Bedrooms"]])
barplot(
  bedroomTable,
  main = 'Bedroom Type Count',
  xlab = 'Bedroom Type',
  ylab = 'count',
  col = 'blue'
)

# Same chart with one colour per bar; palette picked at
# https://color.adobe.com/zh/
barplot(
  bedroomTable,
  main = 'Bedroom Type Count',
  xlab = 'Bedroom Type',
  ylab = 'count',
  col = c('#FFE180', '#FFBE6c', '#E8834B', '#C03F49')
)

# Four scores drawn twice: once from a zero baseline, once with 70
# subtracted from every score so the same differences look larger.
grades <- c(80, 82, 84, 88)
barplot(height = grades)

barplot(height = grades - 70)

Histogram
par(mfrow = c(1, 1))
# Restore the `cdc` data frame from a saved workspace file.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running on another computer.
load(file = "C:/Users/nc20/Downloads/cdc.Rdata")
head(cdc, n = 6L)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
# Histogram of reported weight, requesting about 50 bins.
hist(cdc$weight, breaks = 50)

# Same data with a much finer bin request (about 300 bins).
hist(cdc$weight, breaks = 300)

# Frequency of each distinct weight value, most common first.
sort(table(cdc$weight), decreasing = TRUE)
##
## 160 150 180 170 200 140 190 165 130 175 145 135 185 155 125 120 210 195
## 992 970 933 922 805 794 715 692 627 626 615 589 577 527 473 440 431 393
## 220 230 115 110 205 215 240 250 225 138 235 128 168 105 148 132 142 178
## 376 268 244 235 230 206 204 202 196 144 137 125 122 112 111 110 110 106
## 260 118 158 162 172 100 152 122 127 134 198 300 112 245 123 182 192 137
## 104 102 102 96 95 94 80 74 71 71 70 70 69 69 65 64 64 62
## 143 147 136 163 126 124 280 270 108 174 173 156 188 117 157 153 154 187
## 62 62 60 60 59 58 57 56 55 53 50 49 49 48 48 47 47 47
## 133 164 167 183 212 146 186 144 149 176 184 275 290 114 129 179 204 265
## 46 46 45 45 45 43 42 41 40 40 40 40 40 39 37 37 36 36
## 113 197 218 107 116 189 208 119 203 193 194 103 139 141 196 159 169 255
## 34 34 34 33 33 33 33 32 32 31 31 30 30 30 29 28 28 27
## 131 166 171 98 151 106 121 202 95 177 228 207 285 350 161 206 102 199
## 26 26 26 25 25 24 24 24 22 22 22 21 21 21 19 19 18 18
## 216 222 248 104 109 214 90 238 310 320 181 217 219 236 295 209 227 242
## 18 18 18 17 14 14 12 12 12 12 11 11 11 11 11 10 10 10
## 191 213 232 252 201 224 226 234 257 92 111 211 223 315 93 97 101 246
## 9 9 9 9 8 8 8 8 8 7 7 7 7 7 6 6 6 6
## 262 278 330 340 99 237 258 400 85 88 233 239 241 243 253 256 263 267
## 6 6 6 6 5 5 5 5 4 4 4 4 4 4 4 4 4 4
## 268 305 380 84 94 96 272 274 286 298 325 78 80 82 231 247 249 254
## 4 4 4 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2
## 276 279 282 283 287 292 360 362 385 68 70 79 83 86 221 229 244 271
## 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
## 273 294 296 297 308 309 313 318 319 324 327 328 344 348 364 370 371 390
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 405 495 500
## 1 1 1
# Count the weights by their remainder modulo 10 (i.e. by last digit).
table(cdc$weight %% 10)
##
## 0 1 2 3 4 5 6 7 8 9
## 9421 207 919 545 525 5865 481 543 1159 335
# Stack two panels: a binned histogram above a bar per distinct weight value.
par(mfrow=c(2,1))
hist(cdc$weight, breaks = 50)
barplot(table(cdc$weight))

# Repeat with a very fine bin request and restricted x ranges.
# NOTE(review): barplot() places bars by position, not by weight value, so
# xlim = c(0, 300) there may not correspond to the histogram's 70-380 range;
# confirm this is intended.
par(mfrow=c(2,1))
hist(cdc$weight, breaks = 20000, xlim=c(70,380))
barplot(table(cdc$weight), xlim=c(0,300))

Pie Chart
# Reload the house-price data and rebuild the bedroom frequency table so
# this section can run on its own.
housePrice <- read.csv(
  file = 'https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv',
  header = TRUE
)
bedroomTable <- table(housePrice[["Bedrooms"]])
barplot(height = bedroomTable)

# Slice labels "2 unit" ... "5 unit"; presumably these match the bedroom
# categories of bedroomTable in order -- verify against the data.
Labels <- paste(2:5, "unit")
pie(
  bedroomTable,
  labels = Labels,
  col = c('#FFE180', '#FFBE6c', '#E8834B', '#C03F49')
)

# rainbow(n) returns n evenly spaced colours as hex strings.
rainbow(4)
## [1] "#FF0000FF" "#80FF00FF" "#00FFFFFF" "#8000FFFF"
pie(
  bedroomTable,
  labels = Labels,
  col = rainbow(4),
  main = 'Bedroom Pie Chart'
)

# Order the bedroom categories from most to least common. The result gets its
# own name: the original assigned it to `tb`, which overwrote the
# tuberculosis table loaded at the top of the script.
bedroomSorted <- sort(bedroomTable, decreasing = TRUE)
# Build the slice labels from the sorted category names so they cannot fall
# out of step with the data (the original hard-coded '3 Units', '2 Units',
# '4 Units', '5 Units'). Slices start at 12 o'clock and run clockwise.
pie(
  bedroomSorted,
  labels = paste(names(bedroomSorted), 'Units'),
  init.angle = 90,
  clockwise = TRUE,
  col = rainbow(length(bedroomSorted)),
  main = 'Bedroom Pie Chart'
)
