Graded out of 100 points. Due on November 3rd. Please typeset your homework, save as an R source code file with title “your student ID-Homework_1.R” (e.g. B024020001- Homework_1.R), and submit to NSYSU Cyber University. DO NOT use any loops in your answers. Also notice that your code must follow the suggested programming and data analysis styles discussed in the class.
data("airquality")
#可以看airquality的資料長什麼樣子
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
#complete.cases是R內建的函式
without_na_rows <- airquality[complete.cases(airquality), ]
#抑或是直接使用na.omit,把na刪去
without_na_rows2 <- na.omit(airquality)
利用sapply
without_na_cols <- airquality[, sapply(airquality, function(x) all(!is.na(x)))]
#可以先來看這一段code
sapply(airquality, function(x) all(!is.na(x)))
## Ozone Solar.R Wind Temp Month Day
## FALSE FALSE TRUE TRUE TRUE TRUE
#is.na會先檢查每一欄的na值,並回傳ture or false,all會檢查是否全部都是false
#若其中一欄有一false會回傳false
#所以without_na_cols會只存剛剛是true的那一欄
不使用迴圈,可用sapply解決 去sum is.na()的個數
sapply(airquality, function(x) sum(is.na(x)))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
使用分權彙整函數 先選出每個月的溫度,算出平均後,合併
兩種選擇
1.aggregate
2.split-apply-combind
airquality$Month <- as.factor(airquality$Month)
#1. aggregate
aggregate(airquality$Temp, by = list(Month = airquality$Month), FUN = mean)
## Month x
## 1 5 65.54839
## 2 6 79.10000
## 3 7 83.90323
## 4 8 83.96774
## 5 9 76.90000
#2. split-apply-combind
month_split <- split(airquality, airquality$Month)
temp_avg <- sapply(month_split, function(x) return(mean(x$Temp)))
temp_avg_df <- data.frame("temp_average" = temp_avg)
temp_avg_df
## temp_average
## 5 65.54839
## 6 79.10000
## 7 83.90323
## 8 83.96774
## 9 76.90000
categorical variables分類變數可以用factor 把wind拆成0-7, 7-14, 14以上,用cut()函數去切 再轉換成factor,Temp以此類推
#0, 7, 14為切點,Inf為infinite無限大,並把0-7為標籤1,7-14為2,14-Inf為3
wind_cat <- as.factor(cut(airquality$Wind, breaks = c(0, 7, 14, Inf), labels = c(1:3)))
temp_cat <- as.factor(cut(airquality$Temp, breaks = c(0, 75, 85, Inf), labels = c(1:3)))
#把airquality_with_2cat把wind_cat跟temp_cat欄位跟airqaulity並在一起
airquality_with_2cat <- cbind(airquality, wind_cat, temp_cat)
diamonds <- data.frame(ggplot2::diamonds)
color_split <- split(diamonds, diamonds$color)
color_price_mean <- sapply(color_split, function(x) return(mean(x$price)))
color_price_mean_df <- data.frame("mean_price" = color_price_mean)
aggregate(diamonds$price, by = list(color = diamonds$color), FUN = mean)
## color x
## 1 D 3169.954
## 2 E 3076.752
## 3 F 3724.886
## 4 G 3999.136
## 5 H 4486.669
## 6 I 5091.875
## 7 J 5323.818
--Compute and list average salaries for each "Number of years in the
--major leagues" (Years) of those who in League A and has hit more than
--100 times
SELECT Years, avg(Salary) FROM Hitters
WHERE League = 'A' AND Hits > 100)
GROUP BY Years
require(ISLR)
## Loading required package: ISLR
## Warning: package 'ISLR' was built under R version 3.4.2
Hitters <- data.frame(ISLR::Hitters)
#先刪除有含NA值的列
Hitters <- Hitters[complete.cases(Hitters),]
#以下為SQL式的轉換
filter_Hitters <- Hitters[(Hitters$League == 'A') & (Hitters$Hits > 100), c('Years', 'Salary')]
aggregate(x = filter_Hitters$Salary, by = list("Years" = filter_Hitters$Years), FUN = mean)
## Years x
## 1 1 127.9000
## 2 2 146.6667
## 3 3 263.3333
## 4 4 327.5000
## 5 5 846.3889
## 6 6 743.1250
## 7 7 839.1667
## 8 8 1012.0000
## 9 9 967.5000
## 10 10 1132.3810
## 11 11 866.6667
## 12 12 763.3333
## 13 13 1175.0000
## 14 14 1143.5767
## 15 15 629.4443
## 16 17 912.5000
## 17 18 655.8335
## 18 20 487.5000
以下為題目範例code
# Split by color
diamonds_split = split(diamonds, diamonds$color)
# Apply, get average price
result = sapply(diamonds_split, function(x) mean(x$price))
# Create a data frame
re = data.frame("color" = names(result), "price" = unname(result))
轉換成SQL
sqldf::sqldf("SELECT color, avg(price) FROM diamonds
GROUP BY color")
## color avg(price)
## 1 D 3169.954
## 2 E 3076.752
## 3 F 3724.886
## 4 G 3999.136
## 5 H 4486.669
## 6 I 5091.875
## 7 J 5323.818
make a function that returns a list of functions.
nth_root = function(x){
function(y){
y ^ (1/x)
}
}
square_root = nth_root(2)
square_root(6:10)
## [1] 2.449490 2.645751 2.828427 3.000000 3.162278
cube_root = nth_root(3)
cube_root(6:10)
## [1] 1.817121 1.912931 2.000000 2.080084 2.154435