In Class Exercise

Exercise 1

Find out the number of days you have spent at NCKU as a registered student or staff person.

abs(as.numeric(difftime(as.Date("2017/01/01"), as.Date(Sys.time()), unit="days")))

## [1] 1239

Exercise 2

Reproduce the plot of calls for police assistances around 24 hours in New York City using the data set here.

…

library(ggplot2)
library(dplyr)
library(hrbrthemes)
dta<-read.csv("C:/Users/USER/Desktop/R_data management/0518/calls_nyc.csv", header=T)
head(dta)

##   Hour Calls
## 1  0.5  1080
## 2  1.5   910
## 3  2.5   770
## 4  3.5   780
## 5  4.5   380
## 6  5.5   390

dta1<-dta%>%mutate(time=c(0:23))
rt<-data.frame(Hour=as.integer(dta1[ ,3]), 
               calls=dta1[,2])

p0<-ggplot(rt, aes(Hour, 
                   calls, 
                   group=1)) +
  geom_bar(width=1, 
           stat="identity", 
           fill="cyan", 
           col="gray", 
           alpha=0.2) +
  geom_abline(intercept=mean(dta$Calls), 
              slope=0, 
              col="pink", size=1.5) +
  coord_polar(theta="x", 
              start=-pi/24) +
  theme_ipsum()+
  labs(x="", y="")

p0+labs(x="Hour", y="Calls")+
   theme(axis.line = element_line(colour = "black"),
         panel.border = element_rect(colour = "black", fill=NA, size=0.5),
         axis.ticks.y=element_line(colour = "black"),
         axis.title.x=element_text(face = "bold", colour = "Black", size = 15, hjust = 0.5),
         axis.title.y=element_text(face = "bold", colour = "Black", size = 15, hjust = 0.5),
         panel.grid.minor.x = element_blank(),
         panel.grid.major.x = element_blank())+
  scale_x_continuous(breaks=seq(0, 23,1)-0.5,
                     labels=c("0",c(rep("", 4)), "5",c(rep("", 4)), "10",c(rep("", 4)), 
                              "15",c(rep("", 4)), "20", c(rep("", 3))))

Exercise 3

Assume that a friend of yours will live to be 100 years old. Find out how often his or her birthday falls on each day of the week. Plot it.

library(lubridate)
m1<-as.Date("1992/03/14")
m2<-as.Date(36525, origin=m1)

bday<-seq(m1, m2, by="days")
daily<-data.frame(table(weekdays(bday)))
daily$Var1 <- factor(daily$Var1, levels=c("星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"))                         
head(daily)

##     Var1 Freq
## 1 星期一 5218
## 2 星期二 5218
## 3 星期三 5218
## 4 星期五 5218
## 5 星期六 5218
## 6 星期日 5218

ggplot(daily, aes(x=Var1, y = Freq)) +
  geom_point(size = 8) + 
  geom_segment(aes(x=Var1, y = 0, xend = Var1, yend = Freq), size = 1.1) +
  theme_classic() + 
  geom_hline(yintercept = 0) +
  geom_text(aes(label = substr(levels(Var1), 3,3)), colour = "white", nudge_y = 0.1)+
  geom_text(aes(label = "5218 \n"), colour = "black", nudge_y = 0.1, lineheight=2.2)+
  labs(y = "Frequency", x="Weekdays") + 
  # coord_flip() + 
  theme(axis.line.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())

Exercise 4

Reproduce the plot of fertility rate and college acceptance rate in Taiwan from 1981 to 2009 using the data set here.

…

library(reshape)
dta<-read.table("C:/Users/USER/Desktop/R_data management/0518/birth_college.txt", header=T)
str(dta)

## 'data.frame':    29 obs. of  2 variables:
##  $ Birth   : num  23 22.1 20.6 19.6 18 15.9 16 17.2 15.7 16.6 ...
##  $ Entrance: int  NA NA NA NA NA NA NA NA NA NA ...

head(dta)

##   Birth Entrance
## 1  23.0       NA
## 2  22.1       NA
## 3  20.6       NA
## 4  19.6       NA
## 5  18.0       NA
## 6  15.9       NA

dta$Year<-c(1981:2009)
dta$College<-dta$Entrance-40 # to align the left y-axis 
dta<-dta[ ,-2]

df1<-melt(dta, id="Year")

names(df1)<-c("Year", "Group", "Value")

p<-ggplot(data=df1, aes(x=Year, y=Value))+
  geom_point(aes(shape=Group))+
  # second y-axis: sec.axis = sec_axis(~ 2*., name="data2")
  scale_y_continuous(name="Birth rate (0.1%)", limits=c(0, 60), breaks=seq(0, 60, 10), 
                     sec.axis = sec_axis(~ .+40, name="Acceptance rate (%)", breaks=seq(40, 100,10 )))+
  
  scale_shape_manual(values=c(1,19))+
  theme_linedraw()+
  scale_x_continuous(expand = c(0, 0), breaks=seq(1980, 2010,5), limits=c(1980, 2010))+
  theme(panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(), 
        legend.position=c(0,1),
        legend.justification=c(0,1),
        legend.background = element_rect(fill="white", colour="black", size=0.5),
        legend.title = element_blank(),
        axis.title.y.right = element_text(angle = 90, hjust = 0.5)
        
        
        ) +
  geom_vline(xintercept=seq(1980, 2010,1), colour="gray", linetype='dashed')
p

Exercise

Exercise 1

Use the dataset containing the average number of visitors (monthly) in New Zealand by country of residence to explore the seasonal patterns between the eight countries. Is there a hemisphere effect?

library(magrittr)
library(dplyr)
library(tidyr)
library(ggseas)

dta<-read.csv("C:/Users/USER/Desktop/R_data management/0518/nz_visitors.csv", header=T)%>%
  mutate(month=as.numeric(substr(.$Month, 6,7)),
         year=as.numeric(substr(.$Month, 1,4)), 
         Season=c("Q3", rep(c(rep("Q4",3), rep("Q1",3), rep("Q2", 3), 
                              rep("Q3",3)), (length(.$Month)-2)/12), "Q4"))
str(dta)

## 'data.frame':    170 obs. of  13 variables:
##  $ Month    : Factor w/ 170 levels "1998M09","1998M10",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Australia: int  17244 18090 16750 25909 27228 19461 19200 19595 12564 10987 ...
##  $ ChinaPRof: int  748 941 1054 1270 1375 1660 1456 1488 1449 1413 ...
##  $ Japan    : int  6093 5039 6112 6670 6008 7478 7341 5030 4275 3758 ...
##  $ Korea    : int  979 1083 1144 1836 2716 2245 1611 1416 1192 1255 ...
##  $ Germany  : int  1320 2459 5195 5499 6430 7320 6094 3157 1450 765 ...
##  $ UK       : int  5794 7876 13362 20238 22557 27477 20187 13448 8587 7271 ...
##  $ Canada   : int  973 1418 2236 2935 3623 4394 3573 1795 1160 903 ...
##  $ USA      : int  3837 6093 8468 7865 10007 12533 10519 6737 4932 4845 ...
##  $ Total    : int  57930 68203 84370 113853 122130 124305 104106 83414 59800 52426 ...
##  $ month    : num  9 10 11 12 1 2 3 4 5 6 ...
##  $ year     : num  1998 1998 1998 1998 1999 ...
##  $ Season   : chr  "Q3" "Q4" "Q4" "Q4" ...

tail(dta)

##       Month Australia ChinaPRof Japan Korea Germany   UK Canada  USA  Total
## 165 2012M05     20427      7737  2485  2148    6580 8074   2110 7030  99726
## 166 2012M06     21778      6670  2332  1961    5080 6323   1228 6759  89954
## 167 2012M07     29769      7962  3045  2906    4255 6400   1426 6459 103420
## 168 2012M08     26851      8523  5069  3288    3949 7026   1311 5293 100307
## 169 2012M09     30013      7522  3200  2217    4431 6390   1213 5252  94542
## 170 2012M10     27599      8535  2863  2270    6446 9431   1940 6911 102982
##     month year Season
## 165     5 2012     Q2
## 166     6 2012     Q2
## 167     7 2012     Q3
## 168     8 2012     Q3
## 169     9 2012     Q3
## 170    10 2012     Q4

df1<-gather(dta, key = "country", value = "visits", Australia, ChinaPRof, Japan, Korea, Germany, UK, Canada, USA)
df2<-df1%>%group_by(year, Season, country)%>%summarise(count.v=sum(visits))%>%as.data.frame()
ggplot(df2, aes(x=year, y=count.v/1000, group=country))+geom_line(aes(color=country))+
  labs(x="Year", y="average number of visitors (*1,000)")+
  facet_wrap(~Season)+
  theme_bw()

# boxplot
ggplot(df2, aes(x = Season, y = count.v/10000, fill = country)) + 
  # Specify the geom to be boxplot
  geom_boxplot() +                
  # Add a red line to the mean
  stat_summary(aes(ymax = ..y.., ymin = ..y..),
               fun = "mean", 
               geom = "errorbar",              # Use geom_errorbar to add line as mean
               color = "red", 
               width = 0.7, 
               position = position_dodge(width = 0.75), # Add the line to each group
               show.legend = FALSE)+facet_wrap(.~country)+labs(x="Seasons", y="average number of visitors (*10,000)")

# decomposed 
ts0 <- ts(dta, 
              start=c(1998, 9), 
              end=c(2012, 10), frequency=12)
ts.df<-tsdf(ts0)%>%gather(., key = "country", value = "visits", Australia, ChinaPRof, Japan, Korea, Germany, UK, Canada, USA)

p1<-ggsdc(data=ts.df, aes(x = x, y = visits, colour = country), 
      frequency = 12, method = "seas", start = c(1998, 9),
      facet.titles = c("Observed", "trend",
                       "seasonal", "random")) +
     
  geom_line() +
  labs(colour = "", x = "Time") +
  scale_y_continuous("Number of visitors") +
  ggtitle("seasonal patterns") 
p1

Conclusion: Average number of visitors had seasonal effect. New Zealand had higher average number of visitors in summer (Q1).

Exercise 2

Use the sample data set to estimate the mean life expectancy of Nobel prize winners.

library(dplyr)
library(lubridate)
library(ggplot2)
library(ggalt)

dta<-read.table("C:/Users/USER/Desktop/R_data management/0518/nobel_lspan.txt", header=T)%>%
    mutate(born=mdy(Born), 
         died=mdy(Died), 
         span=born %--% died ,
         y=as.duration(span) / dyears(1)) 
ggplot(dta, aes(reorder(ID, died),  
                x=born, 
                xend=died)) +
  geom_dumbbell(size=rel(1.1), 
                colour="gold", 
                colour_x="goldenrod",
                colour_xend="goldenrod", 
                dot_guide=TRUE, 
                dot_guide_size=0.2) +
  labs(x="Year", y=NULL) + 
  theme_ipsum()

## mean of life expectancy plot
dta%>%summarise(m.le=mean(y))

##       m.le
## 1 82.33878

Exercise 3

Use the following sample of records for profit made, arrival date, and departure date of group travel booked at a travel agency in Taiwan to estimate the mean profit per day of service.

dta<-read.csv("C:/Users/USER/Desktop/R_data management/0518/Visit_TW.csv", header=T)
str(dta)

## 'data.frame':    96 obs. of  3 variables:
##  $ Expense : int  15393 27616 8876 57378 32613 46998 10744 3269 16195 55842 ...
##  $ Arrival : Factor w/ 83 levels "2014/10/10","2014/10/13",..: 79 83 78 74 75 31 31 26 29 77 ...
##  $ Depature: Factor w/ 79 levels "2014/10/1","2014/10/13",..: 73 77 73 76 75 32 69 26 28 73 ...

head(dta)

##   Expense    Arrival   Depature
## 1   15393  2015/2/16  2015/2/17
## 2   27616   2015/3/6  2015/3/11
## 3    8876  2015/2/14  2015/2/17
## 4   57378  2015/1/30   2015/2/9
## 5   32613  2015/1/31   2015/2/6
## 6   46998 2014/12/27 2014/12/31

dta%>%mutate(A=as.Date(Arrival),
             D=as.Date(Depature),
             Stay=as.numeric(difftime(D, A, unit="days"))+1)%>%
  summarise(revenue.mean=sum(Expense)/sum(Stay))

##   revenue.mean
## 1     5522.453

Exercise 4

The following rather awful plot is shown on a web page hosted by the Taiwanese Ministry of Education

…

Revise it so that it is a proper time series plot. For your convenience, the data points have been extracted and saved here . What had happened in the early 1990’s and how do we know if the trend reversal is real? You may want to augment the data set with further data points from 2012 to 2018 available in the foreign students in the U.S. data file.

library(dplyr)
library(tidyverse)
library(ggplot2)

dta<-read.table("C:/Users/USER/Desktop/R_data management/0518/tw_to_us.txt")
dta2<-read.table("C:/Users/USER/Desktop/R_data management/0518/foreign_students_us.txt", header=T)
str(dta)

## 'data.frame':    23 obs. of  1 variable:
##  $ V1: int  3637 2553 4564 6780 12029 12250 17560 22590 30960 33530 ...

head(dta)

dta$Year<-c(seq(1950, 1990,5), seq(1991,2004,1))
colnames(dta)<-c("Number", "Year")
d2<-subset(dta2, Country=="TW")%>%select(Number, Year)

# fill na value 
dt<-c(seq(1950, 1990,1))
dt<-data.frame(dt[-seq(1, 41,5)], rep(NA, 32))
colnames(dt)<-c("Year", "Number")
dt1<-data.frame(c(seq(2005,2011,1)), rep(NA, 7))
colnames(dt1)<-c("Year", "Number")

df1<-rbind(dta, dt, dt1, d2)
df1<-df1[order(df1$Year),]
ggplot()+
  
  geom_point(data=df1, aes(x=Year, y=Number))+
  geom_smooth(data=df1,aes(x=Year, y=Number), method = "loess",  se = FALSE, col = "blue")+
  theme_classic()+
  ggtitle("ROC (Taiwan) Students in the U.S.A (1950-2018)")+
  scale_x_continuous(limits=c(1950, 2018), breaks=seq(1950,2018,5))+
  scale_y_continuous(limits=c(0, 40000), breaks=seq(0, 40000, 5000))

Forecast

library(ggfortify)
library(imputeTS)
library(forecast)
library(tsoutliers)
d2.t<-d2[ ,-2]
d2.ts<-ts(d2.t, start=c(2012,1), frequency=1)

df2<-df1[ ,-2]
df.ts<-ts(df2, start=c(1950,1), frequency=1)
d2.t<-d2[ ,-2]
d2.ts<-ts(d2.t, start=c(2012,1), frequency=1)
df3<-rbind(dta, dt)
df3<-df3[order(df3$Year),]
df3<-df3[ ,-2]
df3.ts<-ts(df3, start=c(1950,1), frequency=1)

# ## na.imputation 
imp <- na_kalman(df3.ts)

df.3f<-df3.ts %>%
  tsclean() %>%
  ets() %>%
  forecast(h=50)


autoplot(df.3f) +
  autolayer(d2.ts, series="Data") +
  autolayer(df.3f$mean, series="Forecasts")

Exercise 5

How different groups spend their day is an article published in The New York Times using the data collected from The American Time Use Survey. Discuss what we need to have in order to replicate this piece of graphical journalism in Taiwan.

…

We need to collect subjects’ (1)gender, (2)employment status, (3)ethnicity, (4)age, (5)education level, (6)how many child they have and (8) activity, (9) activity initiating time and (10) activity end time. Then turn the data to time series data, calculate percentage of each activity in each time period.

Date& Time in class exercise and HW

Date& Time in class exercise and HW

In Class Exercise

Exercise 1

Exercise 2

Exercise 3

Exercise 4

Exercise

Exercise 1

Exercise 2

Exercise 3

Exercise 4

Exercise 5