本文档说明:
帮助:
Course Learning Objectives:
基本上就是课程中例子的简化版本!
# Attach swirl, then install the swirl course named
# "The R Programming Environment".
library(swirl)
install_course("The R Programming Environment")
R provides a common API (a common set of commands) for interacting with files, so that your code will work across different kinds of computers.
getwd(), ls(), dir(), list.files(), dir.create("testdir"), file.create("mytest.R"), file.exists("mytest.R"), file.exists("mytest.R"), file.rename("mytest.R", "mytest2.R"), file.copy(), unlink("testdir", recursive = TRUE)
文件操作语句
library(titanic)
library(faraway)
Whenever you’re working with a new dataset, the first thing you should do is look at it! What is the format of the data? What are the dimensions? What are the variable names? How are the variables stored? Are there missing data? Are there any flaws in the data?
常用命令
# Report the memory size of `plants`. `plants` is not created in the visible
# code (presumably provided by a swirl lesson — TODO confirm).
object.size(plants)
a tidy dataset has the following properties:
Each variable forms a column.
Each observation forms a row.
Each type of observational unit forms a table.
# Read the team standings CSV with readr: first with guessed column types,
# then again with the column types given explicitly.
library(readr)
getwd() # directory against which the relative path below is resolved

teams <- read_csv(file = "data/team_standings.csv")
teams

# col_types = "cc": one letter per column, "c" meaning character, so the first
# and the second column are both read as character (there are only two columns).
teams <- read_csv(
  file = "data/team_standings.csv",
  col_types = "cc"
)
这个部分有点难,我需要找到一个例子,完整的代码。 https://www.coursera.org/learn/r-programming-environment/supplement/VFcjM/requesting-data-through-a-web-api
同样介绍了一个读入html, xml, json文件格式的读入包。
这一章节提供了,相当重要的内容,我却没有认真去学习它
stringr
package 中有很多的函数,包括 `str_extract`, `str_order`, `str_pad`, `str_to_title`, `str_trim`, `str_wrap`, `word`
有些内存管理函数还挺有意思的, 比如mem_used
library(magrittr)

# Size of every object returned by ls(), sorted ascending; show the last five
# (i.e. the five largest).
# NOTE(review): object_size() and mem_change() are not provided by any
# library() call in the visible code — presumably from pryr; confirm it is
# attached before running.
tail(
  sort(
    sapply(ls(), function(obj_name) object_size(get(obj_name)))
  ),
  5
)

# Remove three objects, wrapped in mem_change() (presumably reports the memory
# difference caused by the rm() call — confirm).
mem_change(rm(check_tracks, denver, b))
Package data.table
是非常快的,和 package readr
的函数谁更快呢?
library(data.table)

# Read the whole CSV with fread().
brazil_zika <- fread("data/COES_Microcephaly-2016-06-25.csv")

# Read the same file again, keeping only three columns, and show the first
# three rows.
dplyr::slice(
  fread(
    "data/COES_Microcephaly-2016-06-25.csv",
    select = c("location", "value", "unit")
  ),
  1:3
)
答案: 1. 0.003960 2. OC CSN Unadjusted PM2.5 LC TOT 3. State 39 County 081 Site 0017 4. 0.018567 5. 0.4300 6. 3527
先读入数据和查看基本性质。
# Load the 2014 daily speciation file and inspect its basic properties.
library(readr)
dat <- read_csv("/media/ghy/36D2072ED206F243/coursera/R_programing_development/course1/data/data/daily_SPEC_2014.csv.bz2")
dat1 <- dat
## Data read with read.csv takes 280MB and has different variable names,
## whereas the read_csv result takes 447MB.
# dat <- read.csv("/media/ghy/36D2072ED206F243/coursera/R_programing_development/course1/data/data/daily_SPEC_2014.csv.bz2")
str(dat)
head(dat)
dim(dat)
names(dat)
object.size(dat)
## fread fails on this file with the error: mmap'd region has EOF at the end
# library(data.table)
# dat <- fread("/media/ghy/36D2072ED206F243/coursera/R_programing_development/course1/data/data/daily_SPEC_2014.csv.bz2")

# Narrow copy of `dat`: spaces in the column names are replaced by dots, then
# only six columns are kept. This overwrites the full copy assigned above.
dat1 <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  select(
    Parameter.Name, Arithmetic.Mean, State.Name,
    State.Code, County.Code, Site.Num
  )
分析各个变量
# Frequency tables (first entries only) for two columns of `dat`.
head(table(dat$`Parameter Name`))
head(table(dat$`State Code`))

# NOTE(review): `data` is first assigned further down in the visible code; if
# these lines run before that assignment the name refers to something else
# (presumably utils::data) — confirm the intended execution order.
head(data)
head(data)
tail(data)
class(dat)
sapply(data, class)
names(data)
# Arithmetic.Mean values for parameter "Bromine PM2.5 LC" in Wisconsin.
# Column names have their spaces replaced by dots so they can be used as bare
# names in filter()/select().
# The former intermediate step that kept only columns whose class() equalled
# "numeric" was dropped: it is redundant (only Arithmetic.Mean is selected
# right after) and it breaks when sapply(dat, class) cannot be simplified
# because a column carries more than one class.
data <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  filter(
    Parameter.Name == "Bromine PM2.5 LC",
    State.Name == "Wisconsin"
  ) %>%
  select(Arithmetic.Mean) %>%
  as.data.frame()

# Average of the selected values, then the number of missing cells (a
# non-zero count would make the mean above NA, since na.rm is not set).
mean(data$Arithmetic.Mean)
sum(is.na(data))
time points, monitoring sites, states 分别对应哪几个变量呢?
# Mean of Arithmetic.Mean per Parameter.Name over all rows of `dat`, sorted
# ascending by that mean (largest mean is the last row).
data <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  select(
    Arithmetic.Mean,
    State.Name,
    Site.Num,
    Parameter.Name
  ) %>%
  group_by(Parameter.Name) %>%
  summarise(Means = mean(Arithmetic.Mean)) %>%
  arrange(Means)
# Rows for parameter "Sulfate PM2.5 LC", reduced to the value plus the
# state/county/site identifier columns and sorted ascending by Arithmetic.Mean.
data <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  filter(Parameter.Name == "Sulfate PM2.5 LC") %>%
  select(
    Arithmetic.Mean,
    State.Code,
    Site.Num,
    Parameter.Name,
    County.Code
  ) %>%
  arrange(Arithmetic.Mean)
# Per-state mean of Arithmetic.Mean for parameter "EC PM2.5 LC TOR", keeping
# only the rows for California and Arizona.
data <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  filter(Parameter.Name == "EC PM2.5 LC TOR") %>%
  select(Arithmetic.Mean, State.Name) %>%
  group_by(State.Name) %>%
  summarise(Means = mean(Arithmetic.Mean)) %>%
  filter(State.Name %in% c("California", "Arizona")) %>%
  as.data.frame()

# Difference between the two state means (second row minus first row;
# presumably California minus Arizona given the grouped output order — confirm).
diff(data$Means)
# Median of Arithmetic.Mean for parameter "OC PM2.5 LC TOR" over rows whose
# Longitude is below -100.
data <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  filter(
    Parameter.Name == "OC PM2.5 LC TOR",
    Longitude < -100
  ) %>%
  summarise(Median = median(Arithmetic.Mean)) %>%
  as.data.frame()
第二个数据集合,我们先读入
# Load the sites workbook. Note that this overwrites `dat`.
library(readxl)
dat <- read_excel("/media/ghy/36D2072ED206F243/coursera/R_programing_development/course1/data/data/aqs_sites.xlsx")

# Basic structure: column names, first rows, column classes.
names(dat)
head(dat)
sapply(dat, class)

# Number of distinct site names, then distinct-value counts for every column.
length(unique(dat$"Local Site Name"))
sapply(dat, function(column) length(unique(column)))

# Columns whose name contains "Site", and column names shared with `dat1`.
names(dat)[grepl("Site", names(dat))]
intersect(names(dat), names(dat1))
# Count the rows of `dat` with Land.Use "RESIDENTIAL" and Location.Setting
# "SUBURBAN"; `data` ends up holding that single row count.
data <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  select(Site.Number, Land.Use, Location.Setting) %>%
  filter(Land.Use == "RESIDENTIAL") %>%
  filter(Location.Setting == "SUBURBAN") %>%
  nrow()
只有第一个数据集合才有 “EC PM2.5 LC TOR” 这个数据。这里很可能需要数据融合。
# Residential/suburban rows of `dat` with Longitude >= -100.
# Fix: the pipeline previously ended in a dangling `%>%`, which piped (across
# the commented-out line) into the `data1 <- ...` statement below and made
# both assignments invalid. The pipeline now ends at the last filter().
data <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  filter(Longitude >= -100) %>%
  filter(Land.Use == "RESIDENTIAL", Location.Setting == "SUBURBAN")
# filter(Local.Site.Name == "EC PM2.5 LC TOR")

# "EC PM2.5 LC TOR" rows of `dat1`, reduced to coordinates and value.
# NOTE(review): this needs Latitude and Longitude to be present in `dat1`; if
# `dat1` was narrowed by an earlier select() that dropped them, this select()
# fails — confirm which version of `dat1` is in the workspace.
data1 <- dat1 %>%
  setNames(gsub(" ", ".", names(.))) %>%
  filter(Parameter.Name == "EC PM2.5 LC TOR") %>%
  select(Latitude, Longitude, Arithmetic.Mean)

# Join on the columns the two tables have in common (merge() default), then
# average the matched values.
merge.Data <- merge(data, data1)
mean(merge.Data$Arithmetic.Mean)

# Diagnostics: column classes, how many cells per column contain "PM2.5",
# how many site names equal the parameter name, and the site numbers in data1.
sapply(dat, class)
sapply(dat, function(x) sum(grepl("PM2.5", x)))
(dat$`Local Site Name` == "EC PM2.5 LC TOR") %>% na.omit() %>% sum
unique(data1$Site.Num)
不对尝试另外一种思路
先融合数据
names(dat)

# Dot-named copy of `dat1`.
dat1.temp <- dat1 %>%
  setNames(gsub(" ", ".", names(.)))

# Dot-named copy of `dat`, with Site.Number renamed to Site.Num so that it
# matches the column name used in `dat1`.
dat.temp <- dat %>%
  setNames(gsub(" ", ".", names(.))) %>%
  rename(Site.Num = Site.Number)

# Join on all columns the two tables share (merge() default), then drop the
# temporary copies.
dat2 <- merge(dat.temp, dat1.temp)
rm(dat1.temp, dat.temp)
然后再回去做题
# From the merged table `dat2`: rows with Longitude >= -100, parameter
# "EC PM2.5 LC TOR", residential land use and suburban setting.
data <- dat2 %>%
  setNames(gsub(" ", ".", names(.))) %>%
  filter(
    Longitude >= -100,
    Parameter.Name == "EC PM2.5 LC TOR",
    Land.Use == "RESIDENTIAL",
    Location.Setting == "SUBURBAN"
  )
好了,这题目还是不对,但是我应该绕过去了。