Line Chart

x <- 1:6
y <- 1:6

plot(x,y, type='l' )

plot(x,y, type='o' )

x1 <- 2:4
y1 <- c(3,3,3)
plot(x,y,type='n')
lines(x,  y, col="blue")
lines(x1, y1, col="red")

types <- c('p', 'l', 'o', 'b', 'c' , 's', 'h', 'n')
par(mfrow=c(2,4))
for(t in types){
  title <- paste('types:', t)
  plot(x,y, type = 'n', main = title)
  lines(x,y,type=t)
}

par(mfrow=c(1,1))
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

plot(taipei, type="o", col="blue", ylim=c(0,220), 
     xlab="Month", ylab="Rainfall")

lines(tainan , type="o", pch=22, lty=2, col="red")

Bar Chart

download.file('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/house-prices.csv', 'house-prices.csv')

house <- read.csv('house-prices.csv')
bedroomsTable <- table(house$Bedrooms)

barplot(bedroomsTable, main='Bedrooms Type', xlab = 'Bedroom Type', ylab = 'count', col = "orange", ylim = c(0,80))

barplot(bedroomsTable, main='Bedrooms Type', xlab = 'Bedroom Type', ylab = 'count', col = c(1,2,3,4), ylim = c(0,80))

Histogram

load("D:/OS DATA/Desktop/cdc.Rdata")
hist(cdc$weight)

hist(cdc$height)

hist(cdc$weight, breaks= 500)

head(sort(table(cdc$weight), decreasing = TRUE))
## 
## 160 150 180 170 200 140 
## 992 970 933 922 805 794
table(cdc$weight %% 10)
## 
##    0    1    2    3    4    5    6    7    8    9 
## 9421  207  919  545  525 5865  481  543 1159  335
par(mfrow=c(2,1))
hist(cdc$weight, breaks = 50)
barplot(table(cdc$weight), xlab = 'weight', ylab = 'frequency')

#barplot(ta)

Pie Chart

bedroomsTable
## 
##  2  3  4  5 
## 30 67 29  2
labels <- c('2 unit', '3 unit', '4 unit', '5 unit')
rainbow(length(labels))
## [1] "#FF0000FF" "#80FF00FF" "#00FFFFFF" "#8000FFFF"
tb <- sort(bedroomsTable, decreasing = TRUE)
pie(tb, labels = labels, col = c('#3eb2ad', '#de6000', '#ff6257', '#517d7d'), main='pie chart of bedrooms', init.angle = 90, clockwise = TRUE)

Scatter Chart

plot(cdc$weight, cdc$wtdesire)

plot(cdc$weight, cdc$height)

data(iris)
xlab <- 'Sepal.Length'
ylab <- 'Petal.Length'
class(iris$Species)
## [1] "factor"
plot(iris$Sepal.Length, iris$Petal.Length,col = iris$Species)
abline(h = 4.7, col="orange")
abline(h = 2.4, col="blue")

versicolor <- iris[iris$Species == 'versicolor',]
virginica <- iris[iris$Species == 'virginica',]

plot(versicolor$Petal.Length, versicolor$Petal.Width, col="red", xlim= c(0,6), ylim= c(0,2))
points(virginica$Petal.Length, virginica$Petal.Width, col="orange")

plot(cdc$weight, cdc$wtdesire, xlab = 'Weight', ylab = 'Desire Weight')

fit <- lm(wtdesire ~ weight, data = cdc)
fit
## 
## Call:
## lm(formula = wtdesire ~ weight, data = cdc)
## 
## Coefficients:
## (Intercept)       weight  
##      46.664        0.639
plot(cdc$weight, cdc$wtdesire, xlab = 'Weight', ylab = 'Desire Weight')
abline(fit, col="red")

barplot(c(80, 82 ,84, 88 ))

## Mosaic Plot

table(cdc$gender)
## 
##     m     f 
##  9569 10431
table(cdc$smoke100)
## 
##     0     1 
## 10559  9441
tb <- table(cdc$gender, cdc$smoke100)
colnames(tb) <- c('Non-Smokers', 'Smokers')
rownames(tb) <- c('Male', 'Female')
mosaicplot(tb, col= c(2,3))

Box Chart

hist(cdc$height, breaks = 50)

boxplot(cdc$height)

boxplot(cdc$height~ cdc$gender)

salary <- c(80,90,80,85,100,120,150,130,110)
sum(salary) / length(salary)
## [1] 105
mean(salary)
## [1] 105
salary2 <- c(80,90,80,85,100,120,150,130,110,1000)
mean(salary2)
## [1] 194.5
sort(salary2)
##  [1]   80   80   85   90  100  110  120  130  150 1000
median(salary2)
## [1] 105
quantile(salary2, 0.25)
##   25% 
## 86.25
quantile(salary2, 0.75)
##   75% 
## 127.5
quantile(salary2, 0.75) - quantile(salary2, 0.25)
##   75% 
## 41.25
IQR(salary2)
## [1] 41.25
max(min(salary2), median(salary2) - 1.5 * IQR(salary2))
## [1] 80
min(max(salary2), median(salary2) + 1.5 * IQR(salary2))
## [1] 166.875
boxplot(salary2)

boxplot(salary2[salary2  < 200])

## Legend

par(mfrow=c(1,1))
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

plot(taipei, type="o", col="blue", ylim=c(0,220), 
     xlab="Month", ylab="Rainfall")

lines(tainan , type="o", pch=22, lty=2, col="red")
legend(1, 200, c('taipei', 'tainan'), col = c('blue', 'red'), lwd=c(2.5,2.5) )

plot(taipei, type="o", col="blue", ylim=c(0,220), 
     xlab="Month", ylab="Rainfall")
lines(tainan , type="o", pch=22, lty=2, col="red")
legend('topleft', c('taipei', 'tainan'), col = c('blue', 'red'), lwd=c(2.5,2.5) )

plot(taipei, type="o", col="blue", ylim=c(0,220), 
     xlab="Month", ylab="Rainfall")
lines(tainan , type="o", pch=22, lty=2, col="red")
text(1.5,200, 'Let the rain falls')

plot(taipei, type="o", col="blue", ylim=c(0,250), xlim=c(1,6),
     xlab="Month", ylab="Rainfall")
lines(tainan , type="o", pch=22, lty=2, col="red")
text(5.2, 230, 'Taipei', col="blue")
text(5.2, 170, 'Tainan', col="red")

tb <- sort(bedroomsTable, decreasing = TRUE)
pie(tb, labels = labels, col = c('#3eb2ad', '#de6000', '#ff6257', '#517d7d'), main='pie chart of bedrooms', init.angle = 90, clockwise = TRUE)
legend("bottomright", labels,
    fill=c('#3eb2ad', '#de6000', '#ff6257', '#517d7d'), title = "units", cex=0.8)

Par

showLayout = function(n){
for(i in 1:n){
plot(1,type="n",xaxt="n",yaxt="n",xlab="",ylab="")
text(1, 1, labels=i, cex=10)
    }
}

par(mar=c(1,1,1,1),mfrow=c(3,2))
showLayout(6)

par(mar=c(3,3,3,3),mfrow=c(3,2))
showLayout(6)

par(mar=c(3,3,3,3),mfcol=c(3,2))
showLayout(6)

Output Graphics

jpeg('cdc.jpg')
plot(cdc$weight, cdc$wtdesire)
dev.off()
## png 
##   2

dplyr

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data(Titanic)
titanic <- data.frame(Titanic)
titanic[ (titanic$Sex == 'Male') & (titanic$Age == 'Adult') , ]
##    Class  Sex   Age Survived Freq
## 9    1st Male Adult       No  118
## 10   2nd Male Adult       No  154
## 11   3rd Male Adult       No  387
## 12  Crew Male Adult       No  670
## 25   1st Male Adult      Yes   57
## 26   2nd Male Adult      Yes   14
## 27   3rd Male Adult      Yes   75
## 28  Crew Male Adult      Yes  192
filter(titanic, Sex == 'Male', Age == 'Adult')
##   Class  Sex   Age Survived Freq
## 1   1st Male Adult       No  118
## 2   2nd Male Adult       No  154
## 3   3rd Male Adult       No  387
## 4  Crew Male Adult       No  670
## 5   1st Male Adult      Yes   57
## 6   2nd Male Adult      Yes   14
## 7   3rd Male Adult      Yes   75
## 8  Crew Male Adult      Yes  192
filter(titanic, Sex == 'Male' | Age == 'Adult')
##    Class    Sex   Age Survived Freq
## 1    1st   Male Child       No    0
## 2    2nd   Male Child       No    0
## 3    3rd   Male Child       No   35
## 4   Crew   Male Child       No    0
## 5    1st   Male Adult       No  118
## 6    2nd   Male Adult       No  154
## 7    3rd   Male Adult       No  387
## 8   Crew   Male Adult       No  670
## 9    1st Female Adult       No    4
## 10   2nd Female Adult       No   13
## 11   3rd Female Adult       No   89
## 12  Crew Female Adult       No    3
## 13   1st   Male Child      Yes    5
## 14   2nd   Male Child      Yes   11
## 15   3rd   Male Child      Yes   13
## 16  Crew   Male Child      Yes    0
## 17   1st   Male Adult      Yes   57
## 18   2nd   Male Adult      Yes   14
## 19   3rd   Male Adult      Yes   75
## 20  Crew   Male Adult      Yes  192
## 21   1st Female Adult      Yes  140
## 22   2nd Female Adult      Yes   80
## 23   3rd Female Adult      Yes   76
## 24  Crew Female Adult      Yes   20
filter(titanic, Sex == 'Male' & Age == 'Adult')
##   Class  Sex   Age Survived Freq
## 1   1st Male Adult       No  118
## 2   2nd Male Adult       No  154
## 3   3rd Male Adult       No  387
## 4  Crew Male Adult       No  670
## 5   1st Male Adult      Yes   57
## 6   2nd Male Adult      Yes   14
## 7   3rd Male Adult      Yes   75
## 8  Crew Male Adult      Yes  192
filter(titanic, Class %in% c('1st', 'Crew') )
##    Class    Sex   Age Survived Freq
## 1    1st   Male Child       No    0
## 2   Crew   Male Child       No    0
## 3    1st Female Child       No    0
## 4   Crew Female Child       No    0
## 5    1st   Male Adult       No  118
## 6   Crew   Male Adult       No  670
## 7    1st Female Adult       No    4
## 8   Crew Female Adult       No    3
## 9    1st   Male Child      Yes    5
## 10  Crew   Male Child      Yes    0
## 11   1st Female Child      Yes    1
## 12  Crew Female Child      Yes    0
## 13   1st   Male Adult      Yes   57
## 14  Crew   Male Adult      Yes  192
## 15   1st Female Adult      Yes  140
## 16  Crew Female Adult      Yes   20
titanic[, c('Sex', 'Age')]
##       Sex   Age
## 1    Male Child
## 2    Male Child
## 3    Male Child
## 4    Male Child
## 5  Female Child
## 6  Female Child
## 7  Female Child
## 8  Female Child
## 9    Male Adult
## 10   Male Adult
## 11   Male Adult
## 12   Male Adult
## 13 Female Adult
## 14 Female Adult
## 15 Female Adult
## 16 Female Adult
## 17   Male Child
## 18   Male Child
## 19   Male Child
## 20   Male Child
## 21 Female Child
## 22 Female Child
## 23 Female Child
## 24 Female Child
## 25   Male Adult
## 26   Male Adult
## 27   Male Adult
## 28   Male Adult
## 29 Female Adult
## 30 Female Adult
## 31 Female Adult
## 32 Female Adult
select(titanic, Sex, Age)
##       Sex   Age
## 1    Male Child
## 2    Male Child
## 3    Male Child
## 4    Male Child
## 5  Female Child
## 6  Female Child
## 7  Female Child
## 8  Female Child
## 9    Male Adult
## 10   Male Adult
## 11   Male Adult
## 12   Male Adult
## 13 Female Adult
## 14 Female Adult
## 15 Female Adult
## 16 Female Adult
## 17   Male Child
## 18   Male Child
## 19   Male Child
## 20   Male Child
## 21 Female Child
## 22 Female Child
## 23 Female Child
## 24 Female Child
## 25   Male Adult
## 26   Male Adult
## 27   Male Adult
## 28   Male Adult
## 29 Female Adult
## 30 Female Adult
## 31 Female Adult
## 32 Female Adult
select(titanic, Sex:Survived)
##       Sex   Age Survived
## 1    Male Child       No
## 2    Male Child       No
## 3    Male Child       No
## 4    Male Child       No
## 5  Female Child       No
## 6  Female Child       No
## 7  Female Child       No
## 8  Female Child       No
## 9    Male Adult       No
## 10   Male Adult       No
## 11   Male Adult       No
## 12   Male Adult       No
## 13 Female Adult       No
## 14 Female Adult       No
## 15 Female Adult       No
## 16 Female Adult       No
## 17   Male Child      Yes
## 18   Male Child      Yes
## 19   Male Child      Yes
## 20   Male Child      Yes
## 21 Female Child      Yes
## 22 Female Child      Yes
## 23 Female Child      Yes
## 24 Female Child      Yes
## 25   Male Adult      Yes
## 26   Male Adult      Yes
## 27   Male Adult      Yes
## 28   Male Adult      Yes
## 29 Female Adult      Yes
## 30 Female Adult      Yes
## 31 Female Adult      Yes
## 32 Female Adult      Yes
select(titanic, contains('S'))
##    Class    Sex Survived
## 1    1st   Male       No
## 2    2nd   Male       No
## 3    3rd   Male       No
## 4   Crew   Male       No
## 5    1st Female       No
## 6    2nd Female       No
## 7    3rd Female       No
## 8   Crew Female       No
## 9    1st   Male       No
## 10   2nd   Male       No
## 11   3rd   Male       No
## 12  Crew   Male       No
## 13   1st Female       No
## 14   2nd Female       No
## 15   3rd Female       No
## 16  Crew Female       No
## 17   1st   Male      Yes
## 18   2nd   Male      Yes
## 19   3rd   Male      Yes
## 20  Crew   Male      Yes
## 21   1st Female      Yes
## 22   2nd Female      Yes
## 23   3rd Female      Yes
## 24  Crew Female      Yes
## 25   1st   Male      Yes
## 26   2nd   Male      Yes
## 27   3rd   Male      Yes
## 28  Crew   Male      Yes
## 29   1st Female      Yes
## 30   2nd Female      Yes
## 31   3rd Female      Yes
## 32  Crew Female      Yes
sum(tail(head(iris[iris$Sepal.Length> 5, ]), 3)$Sepal.Length)
## [1] 16.9
library(magrittr)
iris %>% .[iris$Sepal.Length > 5, ] %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()
## [1] 16.9
filter(select(titanic, Sex, Age), Age == 'Child')
##       Sex   Age
## 1    Male Child
## 2    Male Child
## 3    Male Child
## 4    Male Child
## 5  Female Child
## 6  Female Child
## 7  Female Child
## 8  Female Child
## 9    Male Child
## 10   Male Child
## 11   Male Child
## 12   Male Child
## 13 Female Child
## 14 Female Child
## 15 Female Child
## 16 Female Child
titanic %>% 
  select(Sex, Age) %>%
  filter(Age == 'Child')
##       Sex   Age
## 1    Male Child
## 2    Male Child
## 3    Male Child
## 4    Male Child
## 5  Female Child
## 6  Female Child
## 7  Female Child
## 8  Female Child
## 9    Male Child
## 10   Male Child
## 11   Male Child
## 12   Male Child
## 13 Female Child
## 14 Female Child
## 15 Female Child
## 16 Female Child
titanic %>% 
  select(Sex, Class, Age, Freq) %>%
  filter(Age == 'Child') %>%
  arrange(Freq)
##       Sex Class   Age Freq
## 1    Male   1st Child    0
## 2    Male   2nd Child    0
## 3    Male  Crew Child    0
## 4  Female   1st Child    0
## 5  Female   2nd Child    0
## 6  Female  Crew Child    0
## 7    Male  Crew Child    0
## 8  Female  Crew Child    0
## 9  Female   1st Child    1
## 10   Male   1st Child    5
## 11   Male   2nd Child   11
## 12   Male   3rd Child   13
## 13 Female   2nd Child   13
## 14 Female   3rd Child   14
## 15 Female   3rd Child   17
## 16   Male   3rd Child   35
titanic %>% 
  select(Sex, Class, Age, Freq) %>%
  filter(Age == 'Child') %>%
  arrange(desc(Freq) )
##       Sex Class   Age Freq
## 1    Male   3rd Child   35
## 2  Female   3rd Child   17
## 3  Female   3rd Child   14
## 4    Male   3rd Child   13
## 5  Female   2nd Child   13
## 6    Male   2nd Child   11
## 7    Male   1st Child    5
## 8  Female   1st Child    1
## 9    Male   1st Child    0
## 10   Male   2nd Child    0
## 11   Male  Crew Child    0
## 12 Female   1st Child    0
## 13 Female   2nd Child    0
## 14 Female  Crew Child    0
## 15   Male  Crew Child    0
## 16 Female  Crew Child    0
freqsum <- titanic$Freq %>% sum() 

titanic <- titanic %>% 
  select(Sex, Age, Freq) %>%
  mutate(portion = Freq/ freqsum)

head(titanic)
##      Sex   Age Freq    portion
## 1   Male Child    0 0.00000000
## 2   Male Child    0 0.00000000
## 3   Male Child   35 0.01590186
## 4   Male Child    0 0.00000000
## 5 Female Child    0 0.00000000
## 6 Female Child    0 0.00000000
# select sex, sum(freq) from titanic group by sex
sex_ratio <- titanic %>%
  select(Sex, Freq) %>%
  group_by(Sex) %>%
  summarise(Sexsum = sum(Freq, na.rm=TRUE))

pie(sex_ratio$Sexsum, labels = sex_ratio$Sex, init.angle = 90, clockwise = TRUE, col = c('blue', 'red'))

titanic %>% 
  group_by(Sex) %>%
  summarise_each(funs(sum), Freq, portion)
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
## # A tibble: 2 x 3
##      Sex  Freq   portion
##   <fctr> <dbl>     <dbl>
## 1   Male  1731 0.7864607
## 2 Female   470 0.2135393
titanic %>% 
  group_by(Sex) %>%
  summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), Freq)
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
## # A tibble: 2 x 3
##      Sex Freq_min Freq_max
##   <fctr>    <dbl>    <dbl>
## 1   Male        0      670
## 2 Female        0      140
titanic %>% 
  group_by(Sex) %>%
  summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches('Freq') )
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
## # A tibble: 2 x 3
##      Sex Freq_min Freq_max
##   <fctr>    <dbl>    <dbl>
## 1   Male        0      670
## 2 Female        0      140
titanic %>%
  select(Sex) %>%
  summarise_each(funs(n()))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
##   Sex
## 1  32
titanic %>%
  select(Sex) %>%
  summarise_each(funs(n_distinct(Sex)))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
##   Sex
## 1   2
# select age, sex, sum(freq) as frequency_sum from titanic
# group by age, sex order by frequency_sum desc
titanic %>%
  group_by(Age, Sex) %>%
  summarise(frequency_sum = sum(Freq)) %>%
  arrange(desc(frequency_sum) )
## # A tibble: 4 x 3
## # Groups:   Age [2]
##      Age    Sex frequency_sum
##   <fctr> <fctr>         <dbl>
## 1  Adult   Male          1667
## 2  Adult Female           425
## 3  Child   Male            64
## 4  Child Female            45
# selec sex, sum(Freq) from titanic group by Sex
sex_stat <- titanic %>%
  group_by(Sex) %>%
  summarise(sexsum = sum(Freq))

pie(sex_stat$sexsum, labels = sex_stat$Sex, init.angle = 90, clockwise = TRUE)

barplot(height = sex_stat$sexsum,names.arg= sex_stat$Sex, ylim = c(0,2000), col = c('blue', 'red'), main = 'Male v.s. Female', ylab = 'Frequency' )

# Stat by Age
age_stat <- titanic %>%
  group_by(Age) %>%
  summarise(sexsum = sum(Freq))
age_stat
## # A tibble: 2 x 2
##      Age sexsum
##   <fctr>  <dbl>
## 1  Child    109
## 2  Adult   2092
#age_stat


data(Titanic)
titanic <- data.frame(Titanic)

survived_stat <- titanic %>% 
  group_by(Sex, Survived) %>%
  summarise_each(funs(sum), Freq)
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over a selection of variables, use `summarise_at()`
library(reshape2)
survived_tb <- dcast(survived_stat, Survived ~ Sex, value.var = 'Freq')
#survived_tb[2:3]
mosaicplot(survived_tb[2:3], main = 'Survived by Sex')

m <- as.matrix(survived_tb[2:3])
m
##      Male Female
## [1,] 1364    126
## [2,]  367    344
barplot(m, legend.text = c('Perished', 'Survived'))

姓名分析

download.file('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/WA.TXT', 'WA.txt')

babyname = read.csv("WA.txt", header=FALSE)
head(babyname)
##   V1 V2   V3       V4  V5
## 1 WA  F 1910    Helen 156
## 2 WA  F 1910  Dorothy 122
## 3 WA  F 1910     Mary 112
## 4 WA  F 1910 Margaret 104
## 5 WA  F 1910     Ruth  94
## 6 WA  F 1910    Alice  67
colnames(babyname) = c("state", "sex", "year", "name", "freq")


library(dplyr)
babyname %>% 
  group_by(name) %>%
  summarise(name_freq = sum(freq)) %>% 
  arrange(desc(name_freq)) %>%
  head(10)
## # A tibble: 10 x 2
##           name name_freq
##         <fctr>     <int>
##  1     Michael     69037
##  2      Robert     65322
##  3       David     59728
##  4        John     59124
##  5       James     58379
##  6     William     44416
##  7     Richard     39892
##  8        Mary     33980
##  9      Daniel     32935
## 10 Christopher     30595
babyname %>%
  filter(sex == 'F') %>%
  group_by(name) %>%
  summarise(name_freq = sum(freq)) %>% 
  arrange(desc(name_freq)) %>%
  head(10)
## # A tibble: 10 x 2
##         name name_freq
##       <fctr>     <int>
##  1      Mary     33970
##  2  Jennifer     26280
##  3     Linda     22702
##  4  Patricia     21886
##  5 Elizabeth     20097
##  6     Susan     19565
##  7   Jessica     19227
##  8   Barbara     19137
##  9     Sarah     18528
## 10     Karen     16991
babyname %>%
  filter(sex == 'M') %>%
  group_by(name) %>%
  summarise(name_freq = sum(freq)) %>% 
  arrange(desc(name_freq)) %>%
  head(10)
## # A tibble: 10 x 2
##           name name_freq
##         <fctr>     <int>
##  1     Michael     68741
##  2      Robert     65219
##  3       David     59617
##  4        John     59092
##  5       James     58315
##  6     William     44401
##  7     Richard     39892
##  8      Daniel     32884
##  9 Christopher     30504
## 10      Thomas     26549
max(babyname$year)
## [1] 2013
top10_male <- babyname %>%
  filter(sex == 'M', year== 2013) %>%
  group_by(name) %>%
  summarise(name_freq = sum(freq)) %>% 
  arrange(desc(name_freq)) %>%
  head(10)


top10_female <- babyname %>%
  filter(sex == 'F', year== 2013) %>%
  group_by(name) %>%
  summarise(name_freq = sum(freq)) %>% 
  arrange(desc(name_freq)) %>%
  head(10)


top10_female_history <- babyname %>%
  select(year, name, freq, sex) %>%
  filter(name %in% top10_female$name, sex == 'F')




top10_names <- top10_female$name
emma <- top10_female_history %>% filter(name == 'Emma')

plot(emma$year, emma$freq, type = 'n', xlab = 'year', ylab = 'frequency', main = 'top10 female name trend in 2013')
for (n in seq_along(top10_names)){
  print(n)
  data <- top10_female_history %>% filter(name == top10_names[n])
  lines(data$year, data$freq, type='l', col=n )
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
legend(1910, 600, legend = top10_names, col=1:10, lwd=2.5)

david <- babyname %>%
  filter(name == 'David', sex == 'M')

plot(david$year, david$freq, type = 'l')

## 實價登錄分析

download.file('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/lvr_prices.csv', 'lvr_price.csv')

library(readr)
lvr_price <- read_csv("D:/OS DATA/Desktop/lvr_price.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual expected   <int>       <chr>      <chr>      <chr> actual 1  1282 total_price an integer 6700000000 file 2  2243 total_price an integer 3882685600 row 3  2244 total_price an integer 3373314400 col 4  4629 total_price an integer 3050000000 expected 5  5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
#View(lvr_price)


head(lvr_price)
## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1     0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2     1 中正區 房地(土地+建物)     臺北市中正區忠孝東路二段121~150號
## 3     2 大同區            土地               橋北段二小段601~630地號
## 4     3 大同區 房地(土地+建物)       臺北市大同區重慶北路一段61~90號
## 5     4 內湖區 房地(土地+建物) 臺北市內湖區民權東路六段90巷6弄1~30號
## 6     5 信義區            土地               福德段一小段661~690地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>
daan <- lvr_price[lvr_price$area == '大安區', ]
sum(as.numeric(daan$total_price), na.rm=TRUE)
## [1] 2.79477e+11
mean(as.numeric(daan$total_price), na.rm=TRUE)
## [1] 29798170
median(as.numeric(daan$total_price), na.rm=TRUE)
## [1] 2e+07
zhongshan <- lvr_price[lvr_price$area == '中山區', c('address','total_price')]
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx,]
res
## # A tibble: 15,020 x 2
##                                address total_price
##                                  <chr>       <int>
##  1 臺北市中山區建國北路一段138巷1~30號  1850000000
##  2      臺北市中山區南京東路三段1~30號  1400000000
##  3               中山段二小段31~60地號  1084948034
##  4             中山段三小段301~330地號  1011136500
##  5                     金泰段61~90地號   952875000
##  6             中山段一小段361~390地號   903865500
##  7             正義段三小段331~360地號   850093721
##  8   臺北市中山區中山北路二段181~210號   760000000
##  9      臺北市中山區民權東路三段1~30號   600000000
## 10           臺北市中山區一江街31~60號   500000000
## # ... with 15,010 more rows
getTopThree <- function(area){
  zhongshan <- lvr_price[lvr_price$area ==area, ]
  idx <- order(zhongshan$total_price, decreasing = TRUE)
  res <- zhongshan[idx, c('address', 'total_price', 'area')]
  return(res[1:3,])
}

getTopThree('中山區')
## # A tibble: 3 x 3
##                               address total_price   area
##                                 <chr>       <int>  <chr>
## 1 臺北市中山區建國北路一段138巷1~30號  1850000000 中山區
## 2      臺北市中山區南京東路三段1~30號  1400000000 中山區
## 3               中山段二小段31~60地號  1084948034 中山區
getTopThree('大安區')
## # A tibble: 3 x 3
##                                  address total_price   area
##                                    <chr>       <int>  <chr>
## 1 臺北市大安區羅斯福路三段283巷4弄1~30號  1869781219 大安區
## 2      臺北市大安區忠孝東路四段241~270號   971340000 大安區
## 3                  學府段三小段31~60地號   966660000 大安區
getTopThree('文山區')
## # A tibble: 3 x 3
##                   address total_price   area
##                     <chr>       <int>  <chr>
## 1 景美段四小段601~630地號  1550000000 文山區
## 2 萬芳段二小段241~270地號   836350000 文山區
## 3 興安段四小段271~300地號   581500000 文山區
price_per_sec <- tapply(lvr_price$total_price, lvr_price$area, function(e)mean(e, na.rm=TRUE))

barplot(sort(price_per_sec, decreasing = TRUE), col='blue', cex.axis = 0.6, cex.names = 0.6)

price_per_sec2 <- tapply(lvr_price$total_price, lvr_price$area, function(e)median(e, na.rm=TRUE))

barplot(sort(price_per_sec2, decreasing = TRUE), col='blue', cex.axis = 0.6, cex.names = 0.6)

boxplot(lvr_price$total_price ~ lvr_price$area)
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

boxplot(log(lvr_price$total_price) ~ lvr_price$area, cex.axis = 0.6, cex.names = 0.6)
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn

lvr_price %>% 
  select(area, address,total_price) %>%
  filter(area == '中山區') %>%
  arrange(desc(total_price)) %>%
  head(3)
## # A tibble: 3 x 3
##     area                             address total_price
##    <chr>                               <chr>       <int>
## 1 中山區 臺北市中山區建國北路一段138巷1~30號  1850000000
## 2 中山區      臺北市中山區南京東路三段1~30號  1400000000
## 3 中山區               中山段二小段31~60地號  1084948034
lvr_price$trading_ym <- as.Date(format(lvr_price$trading_ymd, '%Y-%m-01'))

lvr_stat <- lvr_price %>%
  select(trading_ym, area, total_price) %>%
  filter(trading_ym >= '2012-01-01') %>%
  group_by(trading_ym, area) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm=TRUE))


daan <- lvr_stat[lvr_stat$area == '大安區',]
#daan
plot(daan$trading_ym, daan$overall_price, type = 'l')

lvr_stat$area <- as.factor(lvr_stat$area)
par(mfrow=c(3,4))
for(a in levels(lvr_stat$area)){
area_data <- lvr_stat[lvr_stat$area == a,]
#daan
plot(area_data$trading_ym, area_data$overall_price, type = 'l', ,main = a)
}

#install.packages('tidyr')
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.2
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
## The following object is masked from 'package:magrittr':
## 
##     extract
price_pivot <- spread(lvr_stat, trading_ym, overall_price, fill = 0)
#price_pivot
write.csv(price_pivot, 'taipei_house_price.csv')