Homework 3

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/purchase.csv', 'purchase.csv')

purchase <- read.csv('purchase.csv', header = TRUE)
head(purchase)
##   X                Time Action         User        Product Quantity Price
## 1 0 2015-07-01 00:00:01  order   U312622727    P0006944501        1  1069
## 2 1 2015-07-01 00:00:03  order   U239012343    P0006018073        1  1680
## 3 2 2015-07-01 00:00:19  order U10007697373    P0002267974        1   285
## 4 3 2015-07-01 00:01:10  order   U296328517    P0016144236        1   550
## 5 4 2015-07-01 00:01:36  order   U300884570 P0014516980122        1   249
## 6 5 2015-07-01 00:01:48  order   U451050374    P0004134266        1  1780
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
purchase$Time <- as.POSIXct(purchase$Time)

# Answer1
purchase %>% 
  select(Time, Quantity) %>%
  group_by(dt = format(purchase$Time, '%H')) %>%
  summarise(total_count = sum(Quantity)) %>% 
  arrange(desc(total_count)) %>% head(1)  
## Warning: package 'bindrcpp' was built under R version 3.4.2
## # A tibble: 1 x 2
##      dt total_count
##   <chr>       <int>
## 1    23        6931
# Answer2
buy_stat <- purchase %>% 
  select(Time, Quantity) %>%
  group_by(dt = format(purchase$Time, '%H')) %>%
  summarise(total_count = sum(Quantity))

buy_stat$dt <- as.integer(buy_stat$dt)
plot(total_count ~ dt, data = buy_stat, type = 'b')

head(purchase)
##   X                Time Action         User        Product Quantity Price
## 1 0 2015-07-01 00:00:01  order   U312622727    P0006944501        1  1069
## 2 1 2015-07-01 00:00:03  order   U239012343    P0006018073        1  1680
## 3 2 2015-07-01 00:00:19  order U10007697373    P0002267974        1   285
## 4 3 2015-07-01 00:01:10  order   U296328517    P0016144236        1   550
## 5 4 2015-07-01 00:01:36  order   U300884570 P0014516980122        1   249
## 6 5 2015-07-01 00:01:48  order   U451050374    P0004134266        1  1780
# Answer3
purchase_stat <- purchase %>%
  select(User, Quantity, Price) %>%
  mutate(buy_amount = Quantity * Price) %>%
  group_by(User) %>%
  summarise(total_buy_amount = sum(buy_amount)) %>%
  arrange(desc(total_buy_amount)) %>%
  head(3) 

purchase_stat
## # A tibble: 3 x 2
##          User total_buy_amount
##        <fctr>            <dbl>
## 1  U166708333          2942744
## 2  U142809250           747550
## 3 U1006283751           515688
# Answer4
purchase_stat <- purchase %>%
  select(User, Quantity, Price) %>%
  mutate(buy_amount = Quantity * Price) %>%
  group_by(User) %>%
  summarise(total_buy_amount = sum(buy_amount)) %>%
  arrange(desc(total_buy_amount)) %>%
  head(10) 
purchase_stat
## # A tibble: 10 x 2
##            User total_buy_amount
##          <fctr>            <dbl>
##  1   U166708333          2942744
##  2   U142809250           747550
##  3  U1006283751           515688
##  4 U10114715330           456782
##  5 U10062834851           439657
##  6    U32775842           422396
##  7   U330237576           335390
##  8   U300915168           330322
##  9    U32775853           319535
## 10   U331652356           257210
barplot(purchase_stat$total_buy_amount, names.arg = purchase_stat$User, col="blue")

Explore Anscombe

data("anscombe")
#anscombe
plot(y1 ~ x1, data = anscombe)
fit <- lm(y1 ~ x1, data = anscombe)
plot(y1 ~ x1, data = anscombe)
abline(fit, col = "red")

plot(y2 ~ x2, data = anscombe)

plot(y3 ~ x3, data = anscombe)

plot(y4 ~ x4, data = anscombe)

Line Chart

x <- seq(1,6)
y <- x
plot(y ~ x, type = 'l')

types <- c('p', 'l', 'o', 'b', 'c', 's', 'h' ,'n')
par(mfrow = c(2,4))
for (i in types){
  #print(i)
  plot(y ~ x, type = i, main = i)
}

plot(y ~ x, type = 'n')
lines(y ~ x, col = "blue")


types <- c('p', 'l', 'o', 'b', 'c', 's', 'h' ,'n')
par(mfrow = c(2,4))

for (i in types){
  plot(y ~ x, type = 'n', main = i)
  lines(y ~ x, type = i)
}

par(mfrow =c(1,1))
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)


plot(taipei, type = 'o', main = 'Rainfall by Region', ylab = 'Rainfall', xlab = 'Month', ylim = c(0,220), col= 'blue', lwd=3)

lines(tainan, type = 'o', col ='red', lwd = 3, pch = 22, lty=2)

#barplot(c(80,82,86,88), ylim = c(80,100))
plot(c(80,82,86,88), ylim = c(80,100), type = 'b')

plot(c(80,82,86,88), ylim = c(0,100), type = 'b')

plot(taipei, type = 'o', main = 'Rainfall by Region', ylab = 'Rainfall', xlab = 'Month', ylim = c(0,220), xlim = c(1,6),col= 'blue', lwd=3)
lines(tainan, type = 'o', col ='red', lwd = 3, pch = 22, lty=2)
text(x = 5.5, y= 210, 'Taipei', col='blue', cex = 1.1)
text(x = 5.5, y= 180, 'Tainan', col='red', cex = 1.1)

?text
## starting httpd help server ... done

Bar Chart

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv', 'house-price.csv')

housePrice <- read.csv('house-price.csv', header = TRUE)

bedroomsTable <- table(housePrice$Bedrooms)

barplot(bedroomsTable, main = 'Bedrooms Stat', xlab = 'Bedroom Number', ylab= 'Bedroom Count', col = 'blue')

bedroomsTable
## 
##  2  3  4  5 
## 30 67 29  2
roomType <- as.factor(names(bedroomsTable))

barplot(bedroomsTable, main = 'Bedrooms Stat', xlab = 'Bedroom Number', ylab= 'Bedroom Count', col = roomType)

Histogram

load('cdc.RData')

head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
hist(cdc$weight)

hist(cdc$weight, breaks = 2)

hist(cdc$weight, breaks = 50)

hist(cdc$weight, breaks = 200)

weight_tb <- table(cdc$weight)
head(sort(weight_tb, decreasing = TRUE))
## 
## 160 150 180 170 200 140 
## 992 970 933 922 805 794
weight_tb2 <- table(cdc$weight %% 10)

barplot(weight_tb2)

par(mfrow=c(2,1))
hist(cdc$weight,breaks=50,xlim=c(70,380))
barplot(table(cdc$weight),xlab="weight",ylab="Frequency")

par(mfrow=c(1,1))

Pie Chart

housePrice <- read.csv('house-price.csv', header = TRUE)

bedrooms   <- housePrice$Bedrooms
bedroomsTable=table(bedrooms)
bedroomsTable
## bedrooms
##  2  3  4  5 
## 30 67 29  2
barplot(bedroomsTable)

labels <- c('2 Units', '3 Units', '4 Units', '5 Units')
pie(bedroomsTable, labels = labels, col = rainbow(length(labels)), main = 'Pie Chart By Bedrooms')

bedroomsTable <- sort(bedroomsTable, decreasing = TRUE)
pie(bedroomsTable, init.angle = 90, clockwise = TRUE)

#?pie

Scatter Plot

plot(cdc$weight, cdc$wtdesire)

data(iris)

plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species)

versicolor <- iris[iris$Species == 'versicolor', ]
virginica  <- iris[iris$Species == 'virginica', ]

plot(Petal.Length ~ Petal.Width, data = versicolor, col = 'red', xlim = c(0,3), ylim = c(0,7))

points(Petal.Length ~ Petal.Width, data = virginica, col = 'blue')

points(mean(versicolor$Petal.Width), mean(versicolor$Petal.Length), col = 'orange', pch= 16)

points(mean(virginica$Petal.Width), mean(virginica$Petal.Length), col = 'orange', pch= 16)
abline(v = 1.75, col='green')

fit <- lm(wtdesire ~ weight, data = cdc)

filter.cond <- cdc$wtdesire - cdc$weight < 200
cdc_new <- cdc[filter.cond, ]

fit2 <- lm(wtdesire ~ weight, data = cdc_new)
fit2
## 
## Call:
## lm(formula = wtdesire ~ weight, data = cdc_new)
## 
## Coefficients:
## (Intercept)       weight  
##     46.8825       0.6375
plot(cdc$weight, cdc$wtdesire)
# wtdesire = weight * 0.639 + 46.664
abline(fit, col = 'red')
# wtdesire = weight * 0.6375 + 46.8825
abline(fit2, col = 'blue')

## Mosaic Plot

smoker_gender <- table(cdc$smoke100, cdc$gender)

colnames(smoker_gender)
## [1] "m" "f"
as.factor(colnames(smoker_gender))
## [1] m f
## Levels: f m
mosaicplot(smoker_gender, col = as.factor(colnames(smoker_gender)))

mosaicplot(smoker_gender, col = c('red', 'green'))

BOX Chart

hist(cdc$weight)

boxplot(cdc$weight, ylab = 'Wight')

boxplot(cdc$weight~ cdc$gender, ylab = 'Wight')

boxplot(cdc$height~ cdc$gender, ylab = 'height')

temp <- c(20,40,30,25,28,30,999)
# mean temp
sum(temp) / length(temp)
## [1] 167.4286
mean(temp)
## [1] 167.4286
#median temp
sort(temp, decreasing = FALSE)
## [1]  20  25  28  30  30  40 999
median(temp)
## [1] 30
quantile(temp, 0.25)
##  25% 
## 26.5
quantile(temp, 0.75)
## 75% 
##  35
# IQR
quantile(temp, 0.75) - quantile(temp, 0.25)
## 75% 
## 8.5
IQR(temp)
## [1] 8.5
max(min(temp), median(temp) - 1.5 * IQR(temp) )
## [1] 20
min(max(temp), median(temp) + 1.5 * IQR(temp) )
## [1] 42.75
boxplot(temp)

boxplot(temp[temp<100])

mean(temp[temp<100])
## [1] 28.83333
# House Price

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/lvr_prices.csv', 'lvr_prices.csv')

library(readr)
lvr_prices <- read_csv("C:/Users/Administrator/Desktop/lvr_prices.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual expected   <int>       <chr>      <chr>      <chr> actual 1  1282 total_price an integer 6700000000 file 2  2243 total_price an integer 3882685600 row 3  2244 total_price an integer 3373314400 col 4  4629 total_price an integer 3050000000 expected 5  5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
head(lvr_prices)
## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1     0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2     1 中正區 房地(土地+建物)     臺北市中正區忠孝東路二段121~150號
## 3     2 大同區            土地               橋北段二小段601~630地號
## 4     3 大同區 房地(土地+建物)       臺北市大同區重慶北路一段61~90號
## 5     4 內湖區 房地(土地+建物) 臺北市內湖區民權東路六段90巷6弄1~30號
## 6     5 信義區            土地               福德段一小段661~690地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>
library(dplyr)
area_price_stat <- lvr_prices %>% 
  select(area, price_per_sqmeter, trading_target, city_land_type) %>%
  filter(city_land_type == '住' & trading_target == '房地(土地+建物)') %>%
  group_by(area ) %>%
  summarise(area_price = mean(price_per_sqmeter, na.rm=TRUE)) %>%
  arrange(desc(area_price))
  
  
barplot(area_price_stat$area_price/0.3025, names.arg = area_price_stat$area, cex.axis = 0.6, cex.names = 0.6)

living_area <- lvr_prices %>% 
  select(area, price_per_sqmeter, trading_target, city_land_type) %>%
  filter(city_land_type == '住' & trading_target == '房地(土地+建物)' & price_per_sqmeter < 5000000)

living_area %>%
  filter(area == '文山區' & price_per_sqmeter < 10000000) %>%
  arrange(desc(price_per_sqmeter))
## # A tibble: 3,812 x 4
##      area price_per_sqmeter  trading_target city_land_type
##     <chr>             <dbl>           <chr>          <chr>
##  1 文山區           1459922 房地(土地+建物)             住
##  2 文山區           1001558 房地(土地+建物)             住
##  3 文山區            602410 房地(土地+建物)             住
##  4 文山區            598097 房地(土地+建物)             住
##  5 文山區            522987 房地(土地+建物)             住
##  6 文山區            518110 房地(土地+建物)             住
##  7 文山區            483793 房地(土地+建物)             住
##  8 文山區            449000 房地(土地+建物)             住
##  9 文山區            418060 房地(土地+建物)             住
## 10 文山區            416181 房地(土地+建物)             住
## # ... with 3,802 more rows
boxplot(price_per_sqmeter ~ area, data = living_area, cex.axis = 0.5, cex.names = 0.5)

boxplot(log(price_per_sqmeter) ~ area, data = living_area, cex.axis = 0.5, cex.names = 0.5)
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 5 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 9 is not drawn

Legend

taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

plot(taipei, type = 'o', main = 'Rainfall by Region', ylab = 'Rainfall', xlab = 'Month', ylim = c(0,220), col= 'blue', lwd=3)

lines(tainan, type = 'o', col ='red', lwd = 3, pch = 22, lty=2)

legend(1,200, c("taipei","tainan"), lwd=c(2.5,2.5),col=c("blue","red"), title="Rainfall")

legend(4,80, c("taipei","tainan"), lwd=c(2.5,2.5),col=c("blue","red"), title="Rainfall")

bedrooms   <- housePrice$Bedrooms
bedroomsTable=table(bedrooms)
bedroomsTable <- sort(bedroomsTable, decreasing = TRUE)
pie(bedroomsTable, init.angle = 90, clockwise = TRUE, col = rainbow(length(names(bedroomsTable))))
legend('bottomright', names(bedroomsTable), fill=rainbow(length(names(bedroomsTable))))

Par

showLayout <- function(n){
  for(i in 1:n){
    plot(1,type="n",xaxt="n",yaxt="n",xlab="",ylab="")
    text(1, 1, labels=i, cex=10)
  }
}

par(mfrow = c(3,2))
showLayout(6)

par(mfrow = c(2,3))
showLayout(6)


par(mfcol = c(3,2))
showLayout(6)

par(mfcol = c(2,3))
showLayout(6)


par(mar=c(1,1,1,1),mfrow=c(3,2))
showLayout(6)
par(mar=c(3,3,3,3),mfrow=c(3,2))
showLayout(6)
par(mar=c(3,3,3,3),mfcol=c(3,2))
showLayout(6)

## Export Plot

png('rainfall.png')

taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)

plot(taipei, type = 'o', main = 'Rainfall by Region', ylab = 'Rainfall', xlab = 'Month', ylim = c(0,220), col= 'blue', lwd=3)

lines(tainan, type = 'o', col ='red', lwd = 3, pch = 22, lty=2)

legend(1,200, c("taipei","tainan"), lwd=c(2.5,2.5),col=c("blue","red"), title="Rainfall")
dev.off()
## png 
##   2

Quantmod

#install.packages('quantmod')
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
getSymbols('AAPL')
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## 
## WARNING: There have been significant changes to Yahoo Finance data.
## Please see the Warning section of '?getSymbols.yahoo' for details.
## 
## This message is shown once per session and may be disabled by setting
## options("getSymbols.yahoo.warning"=FALSE).
## [1] "AAPL"
chartSeries(AAPL)

getMetals('gold')
## [1] "XAUUSD"
chartSeries(XAUUSD)

library(forecast)
## Warning: package 'forecast' was built under R version 3.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
XAUUSD %>% 
   ets() %>%
  forecast(h=30) %>%
  autoplot()

Plotly

library(plotly)
## Warning: package 'plotly' was built under R version 3.4.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
x <- seq(1,100)
random_y <- rnorm(100, mean = 0)
data <- data.frame(x, random_y)

p <- plot_ly(data, x = ~x, y = ~random_y, type = 'scatter', mode = 'lines')
p

## Donut Chart

ds <-data.frame(labels=c("A", "B", "C"),values =c(10, 20, 30))

plot_ly(ds, labels=ds$labels, values =ds$values, type ="pie") %>% layout(title="Pie Chart Example")
plot_ly(ds, labels=ds$labels, values =ds$values, type ="pie", hole=0.6) %>% layout(title="Donut Chart Example")

area chart

library(plotly)
month <- c(1,2,3,4,5)
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)
data <- data.frame(month, taipei, tainan)
p <- plot_ly(data, x = ~month, y = ~taipei, type = 'scatter', mode = 'lines') %>% 
  add_trace(data, x = ~month, y = ~tainan, type = 'scatter', mode = 'lines')

p <- plot_ly(data, x = ~month, y = ~taipei, type = 'scatter', mode = 'lines', fill= 'tozeroy') 

p <- plot_ly(data, x = ~month, y = ~taipei, type = 'scatter', mode = 'markers', fill= 'tozeroy', name= 'Taipei')  %>% add_trace(data, x = ~month, y = ~tainan, type = 'scatter', mode = 'markers', name = 'Tainan') %>% layout(yaxis=list(title='rainfall'))
p
total <-taipei+tainan
data <- data.frame(month, taipei, tainan, total)
y <- list(title="Rainfall")

p <- plot_ly(data, x = ~month, y = ~total, type = 'scatter', mode = 'markers', fill= 'tozeroy', name= 'Taipei')  %>% add_trace(data, x = ~month, y = ~tainan, type = 'scatter', mode = 'markers', name = 'Tainan', fill='tonexty') %>% layout(yaxis=list(title='rainfall')) 
p

## Bubble Chart

data("diamonds")
#diamonds
d <-diamonds[sample(nrow(diamonds), 1000), ]
plot_ly(d, x =d$carat, y=d$price, color =d$clarity, type='scatter', mode='markers', size =d$carat, text=paste("Clarity", d$clarity))

subplots

data('economics')
p <-subplot(
  plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines'),
  plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines'
          ),margin=0.05)

p %>%layout(showlegend=FALSE)
p <-subplot(
  plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines'),
  plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines'
          ),margin=0.05, nrows = 2)

p %>%layout(showlegend=FALSE)

## Map


Sys.setenv('MAPBOX_TOKEN' = '')

df = read.csv('https://raw.githubusercontent.com/bcdunbar/datasets/master/meteorites_subset.csv')

p <- df %>% head(100) %>%
  plot_mapbox(lat = ~reclat, lon = ~reclong,
              split = ~class, size=2,
              mode = 'scattermapbox', hoverinfo='name') %>%
  layout(title = 'Meteorites by Class',
         font = list(color='white'),
         legend = list(orientation = 'h',
                       font = list(size = 8)),
         margin = list(l = 25, r = 25,
                       b = 25, t = 25,
                       pad = 2))

p