HW3

library(readr)
## Warning: package 'readr' was built under R version 3.4.4
# Download the TB case table from the course repository and parse it with readr.
tb_url <- "https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/tb.csv"
tb <- read_csv(tb_url)
## `curl` package not installed, falling back to using `url()`
## Parsed with column specification:
## cols(
##   確定病名 = col_character(),
##   建檔年份 = col_integer(),
##   建檔月份 = col_integer(),
##   縣市 = col_character(),
##   鄉鎮 = col_character(),
##   性別 = col_character(),
##   國籍 = col_character(),
##   年齡層 = col_character(),
##   確定病例數 = col_integer()
## )
# Spreadsheet-style viewer call, left disabled.
#View(tb)

# Print the first rows of tb.
head(tb)
## # A tibble: 6 x 9
##   確定病名 建檔年份 建檔月份 縣市   鄉鎮   性別  國籍   年齡層 確定病例數
##   <chr>       <int>    <int> <chr>  <chr>  <chr> <chr>  <chr>       <int>
## 1 結核病       2005        1 台北市 大安區 F     本國籍 50-54           1
## 2 結核病       2005        1 台東縣 卑南鄉 M     本國籍 70+             1
## 3 結核病       2005        1 台南市 中西區 M     本國籍 55-59           1
## 4 結核病       2005        1 宜蘭縣 南澳鄉 M     本國籍 20-24           1
## 5 結核病       2005        1 桃園市 中壢區 F     本國籍 30-34           1
## 6 結核病       2005        1 桃園市 蘆竹區 M     本國籍 70+             3
# Show the class vector of tb.
class(tb)
## [1] "tbl_df"     "tbl"        "data.frame"
# Show the structure of tb: one line per column with its type and leading values.
str(tb)
## Classes 'tbl_df', 'tbl' and 'data.frame':    66609 obs. of  9 variables:
##  $ 確定病名  : chr  "結核病" "結核病" "結核病" "結核病" ...
##  $ 建檔年份  : int  2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
##  $ 建檔月份  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ 縣市      : chr  "台北市" "台東縣" "台南市" "宜蘭縣" ...
##  $ 鄉鎮      : chr  "大安區" "卑南鄉" "中西區" "南澳鄉" ...
##  $ 性別      : chr  "F" "M" "M" "M" ...
##  $ 國籍      : chr  "本國籍" "本國籍" "本國籍" "本國籍" ...
##  $ 年齡層    : chr  "50-54" "70+" "55-59" "20-24" ...
##  $ 確定病例數: int  1 1 1 1 1 3 1 9 1 1 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 9
##   .. ..$ 確定病名  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 建檔年份  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ 建檔月份  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ 縣市      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 鄉鎮      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 性別      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 國籍      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 年齡層    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ 確定病例數: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Per-column summary of tb (quantiles for integer columns, length/class for text columns).
summary(tb)
##    確定病名            建檔年份       建檔月份          縣市          
##  Length:66609       Min.   :2005   Min.   : 1.000   Length:66609      
##  Class :character   1st Qu.:2006   1st Qu.: 4.000   Class :character  
##  Mode  :character   Median :2008   Median : 6.000   Mode  :character  
##                     Mean   :2008   Mean   : 6.514                     
##                     3rd Qu.:2010   3rd Qu.:10.000                     
##                     Max.   :2017   Max.   :12.000                     
##      鄉鎮               性別               國籍          
##  Length:66609       Length:66609       Length:66609      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     年齡層            確定病例數   
##  Length:66609       Min.   : 1.00  
##  Class :character   1st Qu.: 1.00  
##  Mode  :character   Median : 1.00  
##                     Mean   : 1.35  
##                     3rd Qu.: 1.00  
##                     Max.   :14.00
# Convert each categorical text column of tb to a factor.
factor_cols <- c("確定病名", "縣市", "鄉鎮", "性別", "國籍", "年齡層")
for (col_name in factor_cols) {
  tb[[col_name]] <- as.factor(tb[[col_name]])
}

## 1. How many records does the dataset contain?
# method 1: row count only
nrow(tb)
## [1] 66609
# method 2: dimensions, rows first and columns second
dim(tb)
## [1] 66609     9
# method 3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Count the rows with dplyr: a one-row summary whose cnt column holds n().
summarise(tb, cnt = n())
## # A tibble: 1 x 1
##     cnt
##   <int>
## 1 66609
# 2. In which filing year/month did domestic-nationality patients have the
#    most confirmed cases?
library(dplyr)
#str(tb)

# SQL equivalent:
#   SELECT 建檔年份, 建檔月份, sum(確定病例數) FROM tb
#   WHERE 國籍 = '本國籍' GROUP BY 建檔年份, 建檔月份
#   ORDER BY sum(確定病例數) DESC LIMIT 1
domestic_cases <- filter(tb, 國籍 == '本國籍')
monthly_totals <- summarise(
  group_by(domestic_cases, 建檔年份, 建檔月份),
  sum_of_instances = sum(確定病例數)
)
# Largest monthly total first; keep only the top row.
head(arrange(monthly_totals, desc(sum_of_instances)), 1)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 1 x 3
## # Groups:   建檔年份 [1]
##   建檔年份 建檔月份 sum_of_instances
##      <int>    <int>            <int>
## 1     2005        5             1526
# 3. Following question 2: in the year/month with the most cases, which three
#    counties/cities had the most domestic-nationality cases?
# SQL equivalent (peak year/month come from the question-2 query):
#   SELECT 縣市, sum(確定病例數) FROM tb
#   WHERE 建檔年份 = <peak year> AND 建檔月份 = <peak month> AND 國籍 = '本國籍'
#   GROUP BY 縣市 ORDER BY sum(確定病例數) DESC LIMIT 3

# Derive the peak year/month from the data instead of hard-coding it. The
# earlier filter used 2006-05 while the question-2 result recorded above is
# 2005-05, so the ranking was being computed for the wrong month.
# NOTE: the recorded output that follows this chunk was produced with the old
# 2006 filter and needs to be regenerated.
peak_month <- tb %>%
  filter(國籍 == '本國籍') %>%
  group_by(建檔年份, 建檔月份) %>%
  summarise(sum_of_instances = sum(確定病例數)) %>%
  ungroup() %>%
  arrange(desc(sum_of_instances)) %>%
  head(1)

tb %>%
  filter(
    國籍 == '本國籍',
    建檔年份 == peak_month$建檔年份,
    建檔月份 == peak_month$建檔月份
  ) %>%
  group_by(縣市) %>%
  summarise(sum_of_instances = sum(確定病例數)) %>%
  arrange(desc(sum_of_instances)) %>%
  head(3) %>%
  select('縣市')
## # A tibble: 3 x 1
##   縣市  
##   <fct> 
## 1 高雄市
## 2 新北市
## 3 台中市
# 4. Which sex / age-group combination has the most domestic-nationality cases?
# SQL equivalent:
#   SELECT 性別, 年齡層, SUM(確定病例數) FROM tb WHERE 國籍 = '本國籍'
#   GROUP BY 性別, 年齡層 ORDER BY SUM(確定病例數) DESC LIMIT 1
domestic_by_sex_age <- tb %>%
  filter(國籍 == '本國籍') %>%
  select('性別', '年齡層', '確定病例數')
sex_age_totals <- summarise(
  group_by(domestic_by_sex_age, 性別, 年齡層),
  sum_of_instances = sum(確定病例數)
)
# Largest total first; keep only the top row.
head(arrange(sex_age_totals, desc(sum_of_instances)), 1)
## # A tibble: 1 x 3
## # Groups:   性別 [1]
##   性別  年齡層 sum_of_instances
##   <fct> <fct>             <int>
## 1 M     70+               28051
# 5. How many distinct origin combinations (nationality / county / township)
#    are there?
# SQL equivalent:
#   SELECT COUNT(*) FROM (SELECT DISTINCT 國籍, 縣市, 鄉鎮 FROM tb);
summarise(
  select(tb, '國籍', '縣市', '鄉鎮'),
  n_distinct(國籍, 縣市, 鄉鎮)
)
## # A tibble: 1 x 1
##   `n_distinct(國籍, 縣市, 鄉鎮)`
##                            <int>
## 1                            482
# 6. Join tb with the people table and find the three counties/cities with the
#    highest tuberculosis case ratio.
# Download the people table from the course repository.
people_url <- 'https://raw.githubusercontent.com/ywchiu/cdc_course/master/data/people.csv'
people <- read_csv(people_url)
## `curl` package not installed, falling back to using `url()`
## Parsed with column specification:
## cols(
##   區域別 = col_character(),
##   總計 = col_number()
## )
# Open the spreadsheet viewer only in an interactive session; an unguarded
# View() call can fail when the script is run non-interactively.
if (interactive()) {
  View(people)
}

# Demo: sub() replaces the first '台' with '臺'.
sub('台', '臺', '台北市')
## [1] "臺北市"
# Total confirmed cases per county, plus a 區域別 key spelled with '臺' so it
# can be matched against the 區域別 column of people in the join below.
stat <- tb %>%
  group_by(縣市) %>%
  summarize(sum_of_instances = sum(確定病例數)) %>%
  mutate(區域別 = sub('台', '臺', 縣市))

# Open the help page for the dplyr join functions.
?join
## starting httpd help server ...
##  done
# Two small demo tables; they share the keys 'B' and 'C' in column c1.
a <- data.frame(c1=c('A','B','C'), c2 = c(1,2,3))
a
##   c1 c2
## 1  A  1
## 2  B  2
## 3  C  3
b <- data.frame(c1=c('B','C','D'), c2 = c(2,3,4))
b
##   c1 c2
## 1  B  2
## 2  C  3
## 3  D  4
# inner_join: only keys present in both tables (B, C).
inner_join(a, b, by='c1')
## Warning: Column `c1` joining factors with different levels, coercing to
## character vector
##   c1 c2.x c2.y
## 1  B    2    2
## 2  C    3    3
# left_join: every row of a; NA where b has no match (A).
left_join(a, b, by='c1')
## Warning: Column `c1` joining factors with different levels, coercing to
## character vector
##   c1 c2.x c2.y
## 1  A    1   NA
## 2  B    2    2
## 3  C    3    3
# right_join: every row of b; NA where a has no match (D).
right_join(a, b, by='c1')
## Warning: Column `c1` joining factors with different levels, coercing to
## character vector
##   c1 c2.x c2.y
## 1  B    2    2
## 2  C    3    3
## 3  D   NA    4
# full_join: every key from either table; NA on whichever side is missing.
full_join(a, b, by='c1')
## Warning: Column `c1` joining factors with different levels, coercing to
## character vector
##   c1 c2.x c2.y
## 1  A    1   NA
## 2  B    2    2
## 3  C    3    3
## 4  D   NA    4
# method 1
# Preview the per-county totals that will be joined with people.
head(stat)
## # A tibble: 6 x 3
##   縣市   sum_of_instances 區域別
##   <fct>             <int> <chr> 
## 1 台中市             8696 臺中市
## 2 台北市             7399 臺北市
## 3 台東縣             1528 臺東縣
## 4 台南市             7334 臺南市
## 5 宜蘭縣             2044 宜蘭縣
## 6 花蓮縣             2350 花蓮縣
#str(people)
# Join method 1: match the two tables on the shared 區域別 key.
# Alternative kept for reference:
#inner_join(stat, people, by=c('縣市' = '區域別'))
m <- stat %>% inner_join(people, by = '區域別')

# Join method 2 (disabled): rename the columns of people, then join on 縣市.
#names(people) <- c('縣市', '總計')
#m <- inner_join(stat, people, by='縣市')

# Preview the joined table.
head(m)
## # A tibble: 6 x 4
##   縣市   sum_of_instances 區域別     總計
##   <fct>             <int> <chr>     <dbl>
## 1 台中市             8696 臺中市 2790381.
## 2 台北市             7399 臺北市 2681375.
## 3 台東縣             1528 臺東縣  219436.
## 4 台南市             7334 臺南市 1886074.
## 5 宜蘭縣             2044 宜蘭縣  456259.
## 6 花蓮縣             2350 花蓮縣  328853.
# Ratio method 1: add the ratio column by direct assignment, then list the
# three regions with the highest cases-to-total ratio.
m[["ratio"]] <- m[["sum_of_instances"]] / m[["總計"]]
top_by_ratio <- head(arrange(m, desc(ratio)), 3)
select(top_by_ratio, '區域別')
## # A tibble: 3 x 1
##   區域別
##   <chr> 
## 1 花蓮縣
## 2 臺東縣
## 3 屏東縣
# Ratio method 2: same ranking, with the ratio computed through mutate().
ranked_by_ratio <- arrange(
  mutate(m, ratio = sum_of_instances / 總計),
  desc(ratio)
)
select(head(ranked_by_ratio, 3), '區域別')
## # A tibble: 3 x 1
##   區域別
##   <chr> 
## 1 花蓮縣
## 2 臺東縣
## 3 屏東縣

Line Chart

# Six points on the identity line, drawn with both markers and connecting lines.
x <- 1:6
y <- x
print(y)
## [1] 1 2 3 4 5 6
plot(x, y, type = 'b')

# The eight plot() 'type' codes to compare side by side.
types <- c('p', 'l', 'o', 'b', 'c', 's', 'h', 'n')

# generate graph with 2 rows and 4 columns
# method 1
par(mfrow = c(2, 4))
# seq_along() yields an empty sequence for an empty vector, unlike 1:length().
for (i in seq_along(types)) {
  #print(types[i])
  # Pass the type by name instead of relying on argument position.
  plot(x, y, type = types[i], main = paste('type:', types[i]))
}

# Back to a single panel: draw empty axes, then overlay a red line and blue points.
par(mfrow = c(1, 1))
plot(x, y, type = 'n')
lines(x, y, col = 'red')
points(x, y, col = 'blue')

# method 2
# Use the same 2-row x 4-column grid as method 1. This previously used
# c(4, 2), which gives 4 rows x 2 columns and contradicts the stated layout.
par(mfrow = c(2, 4))
# seq_along() yields an empty sequence for an empty vector, unlike 1:length().
for (i in seq_along(types)) {
  #print(types[i])
  # Draw empty axes first, then add the series with lines() in the given type.
  plot(x, y, type = 'n', main = paste('type:', types[i]))
  lines(x, y, type = types[i])
}

# Single panel again: red line with line type 3 and width 5.
par(mfrow = c(1, 1))
plot(x, y, type = 'l', lty = 3, lwd = 5, col = 'red')

# Same styling with points plus connecting segments, using plotting symbol 17.
plot(x, y, type = 'b', pch = 17, lty = 3, lwd = 5, col = 'red')

# Open the help page for plot.
?plot


# Five rainfall readings per city, plotted against their index (labelled Month).
Taipei <- c(92.5, 132.6, 168.8, 159.1, 218.7)
Tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

# Chart 1: Taipei in blue, Tainan overlaid in red with dashed lines.
plot(
  Taipei,
  type = 'o', col = 'blue', ylim = c(0, 220),
  xlab = 'Month', ylab = 'Rainfall',
  main = 'Taipei Rainfall v.s. Tainan Rainfall'
)
lines(Tainan, type = 'o', lty = 2, pch = 22, col = 'red')

# Chart 2: wider axis limits leave room for the text labels placed at x = 5.5.
plot(
  Taipei,
  type = 'o', col = 'blue', ylim = c(0, 260), xlim = c(1, 6),
  xlab = 'Month', ylab = 'Rainfall',
  main = 'Taipei Rainfall v.s. Tainan Rainfall'
)
lines(Tainan, type = 'o', lty = 2, pch = 22, col = 'red')
text(x = 5.5, y = 230, labels = 'Taipei', col = 'blue')
text(x = 5.5, y = 200, labels = 'Tainan', col = 'red')

Bar Chart

# Load the house price data and count the rows for each Bedrooms value.
house_price_url <- 'https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv'
housePrice <- read.csv(house_price_url, header = TRUE)
bedroomTable <- table(housePrice$Bedrooms)

# Bar chart of the counts in a single colour.
barplot(
  bedroomTable,
  col = 'blue',
  main = 'Bedroom Type Count', xlab = 'Bedroom Type', ylab = 'count'
)

# Same chart with one colour per bar; palette picked from
# https://color.adobe.com/zh/
bedroom_palette <- c('#FFE180', '#FFBE6c', '#E8834B', '#C03F49')
barplot(
  bedroomTable,
  col = bedroom_palette,
  main = 'Bedroom Type Count', xlab = 'Bedroom Type', ylab = 'count'
)

# Bar heights: the raw values first, then the same values shifted down by 70.
grades <- c(80, 82, 84, 88)
barplot(height = grades)

barplot(height = grades - 70)

Histogram

# Single-panel layout for the histograms that follow.
par(mfrow = c(1, 1))
# Saved workspace file that provides the cdc data frame used below.
# NOTE(review): this is an absolute path on one machine; confirm where the
# file should live before running elsewhere.
cdc_path <- "C:/Users/nc20/Downloads/cdc.Rdata"
load(file = cdc_path)
head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
# Histograms of the weight column at two bin resolutions.
hist(x = cdc$weight, breaks = 50)

hist(x = cdc$weight, breaks = 300)

# Frequency of each distinct weight value, most frequent first.
weight_counts <- table(cdc$weight)
sort(weight_counts, decreasing = TRUE)
## 
## 160 150 180 170 200 140 190 165 130 175 145 135 185 155 125 120 210 195 
## 992 970 933 922 805 794 715 692 627 626 615 589 577 527 473 440 431 393 
## 220 230 115 110 205 215 240 250 225 138 235 128 168 105 148 132 142 178 
## 376 268 244 235 230 206 204 202 196 144 137 125 122 112 111 110 110 106 
## 260 118 158 162 172 100 152 122 127 134 198 300 112 245 123 182 192 137 
## 104 102 102  96  95  94  80  74  71  71  70  70  69  69  65  64  64  62 
## 143 147 136 163 126 124 280 270 108 174 173 156 188 117 157 153 154 187 
##  62  62  60  60  59  58  57  56  55  53  50  49  49  48  48  47  47  47 
## 133 164 167 183 212 146 186 144 149 176 184 275 290 114 129 179 204 265 
##  46  46  45  45  45  43  42  41  40  40  40  40  40  39  37  37  36  36 
## 113 197 218 107 116 189 208 119 203 193 194 103 139 141 196 159 169 255 
##  34  34  34  33  33  33  33  32  32  31  31  30  30  30  29  28  28  27 
## 131 166 171  98 151 106 121 202  95 177 228 207 285 350 161 206 102 199 
##  26  26  26  25  25  24  24  24  22  22  22  21  21  21  19  19  18  18 
## 216 222 248 104 109 214  90 238 310 320 181 217 219 236 295 209 227 242 
##  18  18  18  17  14  14  12  12  12  12  11  11  11  11  11  10  10  10 
## 191 213 232 252 201 224 226 234 257  92 111 211 223 315  93  97 101 246 
##   9   9   9   9   8   8   8   8   8   7   7   7   7   7   6   6   6   6 
## 262 278 330 340  99 237 258 400  85  88 233 239 241 243 253 256 263 267 
##   6   6   6   6   5   5   5   5   4   4   4   4   4   4   4   4   4   4 
## 268 305 380  84  94  96 272 274 286 298 325  78  80  82 231 247 249 254 
##   4   4   4   3   3   3   3   3   3   3   3   2   2   2   2   2   2   2 
## 276 279 282 283 287 292 360 362 385  68  70  79  83  86 221 229 244 271 
##   2   2   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1 
## 273 294 296 297 308 309 313 318 319 324 327 328 344 348 364 370 371 390 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 405 495 500 
##   1   1   1
# Count the weights by their remainder modulo 10.
table(cdc[["weight"]] %% 10)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 9421  207  919  545  525 5865  481  543 1159  335
# Two stacked panels: binned histogram above, one bar per distinct weight below.
par(mfrow = c(2, 1))
hist(x = cdc$weight, breaks = 50)
barplot(height = table(cdc$weight))

# Same pair again with a very large breaks value and explicit x-axis limits.
# NOTE(review): hist() takes xlim in weight units while barplot() takes it in
# bar-position units, so the two ranges are not directly comparable; confirm
# the intended limits.
par(mfrow = c(2, 1))
hist(x = cdc$weight, breaks = 20000, xlim = c(70, 380))
barplot(height = table(cdc$weight), xlim = c(0, 300))

Pie Chart

# Load the house price data and count the rows for each Bedrooms value.
housePrice <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv', header = TRUE)
bedroomTable <- table(housePrice$Bedrooms)
barplot(bedroomTable)

# Build the slice labels from the table's own category names so each label
# always matches its slice.
Labels <- paste(names(bedroomTable), "unit")
pie(
  bedroomTable,
  labels = Labels,
  col = c('#FFE180', '#FFBE6c', '#E8834B', '#C03F49')
)

# rainbow(n) returns n colour codes.
rainbow(4)
## [1] "#FF0000FF" "#80FF00FF" "#00FFFFFF" "#8000FFFF"
pie(
  bedroomTable,
  labels = Labels,
  col = rainbow(length(bedroomTable)),
  main = 'Bedroom Pie Chart'
)

# Largest slice first, drawn clockwise starting from the top.
# The sorted table gets its own name: assigning it to `tb` overwrote the TB
# dataset loaded at the top of the file. Labels are derived from the sorted
# names instead of being typed in an assumed order.
sortedBedrooms <- sort(bedroomTable, decreasing = TRUE)
pie(
  sortedBedrooms,
  labels = paste(names(sortedBedrooms), "Units"),
  init.angle = 90,
  clockwise = TRUE,
  col = rainbow(length(sortedBedrooms)),
  main = 'Bedroom Pie Chart'
)