Date& Time in class exercise and HW
In Class Exercise
Exercise 1
Find out the number of days you have spent at NCKU as a registered student or staff person.
## [1] 1239
Exercise 2
Reproduce the plot of calls for police assistances around 24 hours in New York City using the data set here.…
library(ggplot2)
library(dplyr)
library(hrbrthemes)
dta<-read.csv("C:/Users/USER/Desktop/R_data management/0518/calls_nyc.csv", header=T)
head(dta)## Hour Calls
## 1 0.5 1080
## 2 1.5 910
## 3 2.5 770
## 4 3.5 780
## 5 4.5 380
## 6 5.5 390
p0<-ggplot(rt, aes(Hour,
calls,
group=1)) +
geom_bar(width=1,
stat="identity",
fill="cyan",
col="gray",
alpha=0.2) +
geom_abline(intercept=mean(dta$Calls),
slope=0,
col="pink", size=1.5) +
coord_polar(theta="x",
start=-pi/24) +
theme_ipsum()+
labs(x="", y="")
p0+labs(x="Hour", y="Calls")+
theme(axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=0.5),
axis.ticks.y=element_line(colour = "black"),
axis.title.x=element_text(face = "bold", colour = "Black", size = 15, hjust = 0.5),
axis.title.y=element_text(face = "bold", colour = "Black", size = 15, hjust = 0.5),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_blank())+
scale_x_continuous(breaks=seq(0, 23,1)-0.5,
labels=c("0",c(rep("", 4)), "5",c(rep("", 4)), "10",c(rep("", 4)),
"15",c(rep("", 4)), "20", c(rep("", 3))))Exercise 3
Assume that a friend of yours will live to be 100 years old. Find out how often his or her birthday falls on each day of the week. Plot it.
library(lubridate)
m1<-as.Date("1992/03/14")
m2<-as.Date(36525, origin=m1)
bday<-seq(m1, m2, by="days")
daily<-data.frame(table(weekdays(bday)))
daily$Var1 <- factor(daily$Var1, levels=c("星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"))
head(daily)## Var1 Freq
## 1 星期一 5218
## 2 星期二 5218
## 3 星期三 5218
## 4 星期五 5218
## 5 星期六 5218
## 6 星期日 5218
ggplot(daily, aes(x=Var1, y = Freq)) +
geom_point(size = 8) +
geom_segment(aes(x=Var1, y = 0, xend = Var1, yend = Freq), size = 1.1) +
theme_classic() +
geom_hline(yintercept = 0) +
geom_text(aes(label = substr(levels(Var1), 3,3)), colour = "white", nudge_y = 0.1)+
geom_text(aes(label = "5218 \n"), colour = "black", nudge_y = 0.1, lineheight=2.2)+
labs(y = "Frequency", x="Weekdays") +
# coord_flip() +
theme(axis.line.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())Exercise 4
Reproduce the plot of fertility rate and college acceptance rate in Taiwan from 1981 to 2009 using the data set here.…
library(reshape)
dta<-read.table("C:/Users/USER/Desktop/R_data management/0518/birth_college.txt", header=T)
str(dta)## 'data.frame': 29 obs. of 2 variables:
## $ Birth : num 23 22.1 20.6 19.6 18 15.9 16 17.2 15.7 16.6 ...
## $ Entrance: int NA NA NA NA NA NA NA NA NA NA ...
## Birth Entrance
## 1 23.0 NA
## 2 22.1 NA
## 3 20.6 NA
## 4 19.6 NA
## 5 18.0 NA
## 6 15.9 NA
dta$Year<-c(1981:2009)
dta$College<-dta$Entrance-40 # to align the left y-axis
dta<-dta[ ,-2]
df1<-melt(dta, id="Year")
names(df1)<-c("Year", "Group", "Value")p<-ggplot(data=df1, aes(x=Year, y=Value))+
geom_point(aes(shape=Group))+
# second y-axis: sec.axis = sec_axis(~ 2*., name="data2")
scale_y_continuous(name="Birth rate (0.1%)", limits=c(0, 60), breaks=seq(0, 60, 10),
sec.axis = sec_axis(~ .+40, name="Acceptance rate (%)", breaks=seq(40, 100,10 )))+
scale_shape_manual(values=c(1,19))+
theme_linedraw()+
scale_x_continuous(expand = c(0, 0), breaks=seq(1980, 2010,5), limits=c(1980, 2010))+
theme(panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
legend.position=c(0,1),
legend.justification=c(0,1),
legend.background = element_rect(fill="white", colour="black", size=0.5),
legend.title = element_blank(),
axis.title.y.right = element_text(angle = 90, hjust = 0.5)
) +
geom_vline(xintercept=seq(1980, 2010,1), colour="gray", linetype='dashed')
pExercise
Exercise 1
Use the dataset containing the average number of visitors (monthly) in New Zealand by country of residence to explore the seasonal patterns between the eight countries. Is there a hemisphere effect?
library(magrittr)
library(dplyr)
library(tidyr)
library(ggseas)
dta<-read.csv("C:/Users/USER/Desktop/R_data management/0518/nz_visitors.csv", header=T)%>%
mutate(month=as.numeric(substr(.$Month, 6,7)),
year=as.numeric(substr(.$Month, 1,4)),
Season=c("Q3", rep(c(rep("Q4",3), rep("Q1",3), rep("Q2", 3),
rep("Q3",3)), (length(.$Month)-2)/12), "Q4"))
str(dta)## 'data.frame': 170 obs. of 13 variables:
## $ Month : Factor w/ 170 levels "1998M09","1998M10",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Australia: int 17244 18090 16750 25909 27228 19461 19200 19595 12564 10987 ...
## $ ChinaPRof: int 748 941 1054 1270 1375 1660 1456 1488 1449 1413 ...
## $ Japan : int 6093 5039 6112 6670 6008 7478 7341 5030 4275 3758 ...
## $ Korea : int 979 1083 1144 1836 2716 2245 1611 1416 1192 1255 ...
## $ Germany : int 1320 2459 5195 5499 6430 7320 6094 3157 1450 765 ...
## $ UK : int 5794 7876 13362 20238 22557 27477 20187 13448 8587 7271 ...
## $ Canada : int 973 1418 2236 2935 3623 4394 3573 1795 1160 903 ...
## $ USA : int 3837 6093 8468 7865 10007 12533 10519 6737 4932 4845 ...
## $ Total : int 57930 68203 84370 113853 122130 124305 104106 83414 59800 52426 ...
## $ month : num 9 10 11 12 1 2 3 4 5 6 ...
## $ year : num 1998 1998 1998 1998 1999 ...
## $ Season : chr "Q3" "Q4" "Q4" "Q4" ...
## Month Australia ChinaPRof Japan Korea Germany UK Canada USA Total
## 165 2012M05 20427 7737 2485 2148 6580 8074 2110 7030 99726
## 166 2012M06 21778 6670 2332 1961 5080 6323 1228 6759 89954
## 167 2012M07 29769 7962 3045 2906 4255 6400 1426 6459 103420
## 168 2012M08 26851 8523 5069 3288 3949 7026 1311 5293 100307
## 169 2012M09 30013 7522 3200 2217 4431 6390 1213 5252 94542
## 170 2012M10 27599 8535 2863 2270 6446 9431 1940 6911 102982
## month year Season
## 165 5 2012 Q2
## 166 6 2012 Q2
## 167 7 2012 Q3
## 168 8 2012 Q3
## 169 9 2012 Q3
## 170 10 2012 Q4
df1<-gather(dta, key = "country", value = "visits", Australia, ChinaPRof, Japan, Korea, Germany, UK, Canada, USA)
df2<-df1%>%group_by(year, Season, country)%>%summarise(count.v=sum(visits))%>%as.data.frame()
ggplot(df2, aes(x=year, y=count.v/1000, group=country))+geom_line(aes(color=country))+
labs(x="Year", y="average number of visitors (*1,000)")+
facet_wrap(~Season)+
theme_bw()# boxplot
ggplot(df2, aes(x = Season, y = count.v/10000, fill = country)) +
# Specify the geom to be boxplot
geom_boxplot() +
# Add a red line to the mean
stat_summary(aes(ymax = ..y.., ymin = ..y..),
fun = "mean",
geom = "errorbar", # Use geom_errorbar to add line as mean
color = "red",
width = 0.7,
position = position_dodge(width = 0.75), # Add the line to each group
show.legend = FALSE)+facet_wrap(.~country)+labs(x="Seasons", y="average number of visitors (*10,000)")# decomposed
ts0 <- ts(dta,
start=c(1998, 9),
end=c(2012, 10), frequency=12)
ts.df<-tsdf(ts0)%>%gather(., key = "country", value = "visits", Australia, ChinaPRof, Japan, Korea, Germany, UK, Canada, USA)
p1<-ggsdc(data=ts.df, aes(x = x, y = visits, colour = country),
frequency = 12, method = "seas", start = c(1998, 9),
facet.titles = c("Observed", "trend",
"seasonal", "random")) +
geom_line() +
labs(colour = "", x = "Time") +
scale_y_continuous("Number of visitors") +
ggtitle("seasonal patterns")
p1 Conclusion: Average number of visitors had seasonal effect. New Zealand had higher average number of visitors in summer (Q1).
Exercise 2
Use the sample data set to estimate the mean life expectancy of Nobel prize winners.
library(dplyr)
library(lubridate)
library(ggplot2)
library(ggalt)
dta<-read.table("C:/Users/USER/Desktop/R_data management/0518/nobel_lspan.txt", header=T)%>%
mutate(born=mdy(Born),
died=mdy(Died),
span=born %--% died ,
y=as.duration(span) / dyears(1))
ggplot(dta, aes(reorder(ID, died),
x=born,
xend=died)) +
geom_dumbbell(size=rel(1.1),
colour="gold",
colour_x="goldenrod",
colour_xend="goldenrod",
dot_guide=TRUE,
dot_guide_size=0.2) +
labs(x="Year", y=NULL) +
theme_ipsum()## m.le
## 1 82.33878
Exercise 3
Use the following sample of records for profit made, arrival date, and departure date of group travel booked at a travel agency in Taiwan to estimate the mean profit per day of service.
## 'data.frame': 96 obs. of 3 variables:
## $ Expense : int 15393 27616 8876 57378 32613 46998 10744 3269 16195 55842 ...
## $ Arrival : Factor w/ 83 levels "2014/10/10","2014/10/13",..: 79 83 78 74 75 31 31 26 29 77 ...
## $ Depature: Factor w/ 79 levels "2014/10/1","2014/10/13",..: 73 77 73 76 75 32 69 26 28 73 ...
## Expense Arrival Depature
## 1 15393 2015/2/16 2015/2/17
## 2 27616 2015/3/6 2015/3/11
## 3 8876 2015/2/14 2015/2/17
## 4 57378 2015/1/30 2015/2/9
## 5 32613 2015/1/31 2015/2/6
## 6 46998 2014/12/27 2014/12/31
dta%>%mutate(A=as.Date(Arrival),
D=as.Date(Depature),
Stay=as.numeric(difftime(D, A, unit="days"))+1)%>%
summarise(revenue.mean=sum(Expense)/sum(Stay))## revenue.mean
## 1 5522.453
Exercise 4
The following rather awful plot is shown on a web page hosted by the Taiwanese Ministry of Education…
Revise it so that it is a proper time series plot. For your convenience, the data points have been extracted and saved here . What had happened in the early 1990’s and how do we know if the trend reversal is real? You may want to augment the data set with further data points from 2012 to 2018 available in the foreign students in the U.S. data file.
library(dplyr)
library(tidyverse)
library(ggplot2)
dta<-read.table("C:/Users/USER/Desktop/R_data management/0518/tw_to_us.txt")
dta2<-read.table("C:/Users/USER/Desktop/R_data management/0518/foreign_students_us.txt", header=T)
str(dta)## 'data.frame': 23 obs. of 1 variable:
## $ V1: int 3637 2553 4564 6780 12029 12250 17560 22590 30960 33530 ...
## V1
## 1 3637
## 2 2553
## 3 4564
## 4 6780
## 5 12029
## 6 12250
dta$Year<-c(seq(1950, 1990,5), seq(1991,2004,1))
colnames(dta)<-c("Number", "Year")
d2<-subset(dta2, Country=="TW")%>%select(Number, Year)
# fill na value
dt<-c(seq(1950, 1990,1))
dt<-data.frame(dt[-seq(1, 41,5)], rep(NA, 32))
colnames(dt)<-c("Year", "Number")
dt1<-data.frame(c(seq(2005,2011,1)), rep(NA, 7))
colnames(dt1)<-c("Year", "Number")
df1<-rbind(dta, dt, dt1, d2)
df1<-df1[order(df1$Year),]
ggplot()+
geom_point(data=df1, aes(x=Year, y=Number))+
geom_smooth(data=df1,aes(x=Year, y=Number), method = "loess", se = FALSE, col = "blue")+
theme_classic()+
ggtitle("ROC (Taiwan) Students in the U.S.A (1950-2018)")+
scale_x_continuous(limits=c(1950, 2018), breaks=seq(1950,2018,5))+
scale_y_continuous(limits=c(0, 40000), breaks=seq(0, 40000, 5000))Forecast
library(ggfortify)
library(imputeTS)
library(forecast)
library(tsoutliers)
d2.t<-d2[ ,-2]
d2.ts<-ts(d2.t, start=c(2012,1), frequency=1)
df2<-df1[ ,-2]
df.ts<-ts(df2, start=c(1950,1), frequency=1)
d2.t<-d2[ ,-2]
d2.ts<-ts(d2.t, start=c(2012,1), frequency=1)
df3<-rbind(dta, dt)
df3<-df3[order(df3$Year),]
df3<-df3[ ,-2]
df3.ts<-ts(df3, start=c(1950,1), frequency=1)
# ## na.imputation
imp <- na_kalman(df3.ts)
df.3f<-df3.ts %>%
tsclean() %>%
ets() %>%
forecast(h=50)
autoplot(df.3f) +
autolayer(d2.ts, series="Data") +
autolayer(df.3f$mean, series="Forecasts")Exercise 5
How different groups spend their day is an article published in The New York Times using the data collected from The American Time Use Survey. Discuss what we need to have in order to replicate this piece of graphical journalism in Taiwan.…
We need to collect subjects’ (1)gender, (2)employment status, (3)ethnicity, (4)age, (5)education level, (6)how many child they have and (8) activity, (9) activity initiating time and (10) activity end time. Then turn the data to time series data, calculate percentage of each activity in each time period.