Hsin-Yu Cheng
Update : July 03, 2015
install.packages("swirl")
library(swirl)
install_from_swirl("Getting and Cleaning Data")
swirl()
R.version.string at the R prompt.if(!require(dplyr)){
install.packages("dplyr")
library(dplyr)
}
library(tidyr)
library(ggplot2)
library(scales)
## set criteria applying to each code chunk.
knitr::opts_chunk$set(comment = NA, message = F)
# Load data
data("iris")
# Converts data to tbl class. tbl's are easier to examine than data frames. R displays only the data that fits onscreen.
tbl_df(iris)
Source: local data frame [150 x 5]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5.0 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
.. ... ... ... ... ...
str(iris)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
names <- gsub("\\.","\\_", names(iris))
names(iris) <- names
head(iris)
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
table(iris$Species)
setosa versicolor virginica
50 50 50
%>% to chain codes together and make them more readable.#select three variables
df1 <- iris %>%
select(Petal_Length, Petal_Width, Species)
head(df1)
Petal_Length Petal_Width Species
1 1.4 0.2 setosa
2 1.4 0.2 setosa
3 1.3 0.2 setosa
4 1.5 0.2 setosa
5 1.4 0.2 setosa
6 1.7 0.4 setosa
# remover a variable
df2 <- iris %>%
select(-Species)
head(df2)
Sepal_Length Sepal_Width Petal_Length Petal_Width
1 5.1 3.5 1.4 0.2
2 4.9 3.0 1.4 0.2
3 4.7 3.2 1.3 0.2
4 4.6 3.1 1.5 0.2
5 5.0 3.6 1.4 0.2
6 5.4 3.9 1.7 0.4
#select the second to fourth variables by using a colon so you do not need to list each variable.
df3 <- iris %>%
select(Sepal_Width:Petal_Width)
head(df3)
Sepal_Width Petal_Length Petal_Width
1 3.5 1.4 0.2
2 3.0 1.4 0.2
3 3.2 1.3 0.2
4 3.1 1.5 0.2
5 3.6 1.4 0.2
6 3.9 1.7 0.4
df4 <- iris %>%
group_by(Species) %>%
summarise(mean_Sepal_Length = mean(Sepal_Length),
mean_Sepal_Width = mean(Sepal_Width),
mean_Petal_Length = mean(Petal_Length),
mean_Petal_Width = mean(Petal_Width))
df4
Source: local data frame [3 x 5]
Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1 setosa 5.006 3.428 1.462
2 versicolor 5.936 2.770 4.260
3 virginica 6.588 2.974 5.552
Variables not shown: mean_Petal_Width (dbl)
gather <- df4 %>% gather(Part, Numbers, -Species) %>%
mutate(Species = as.character(Species),
Part = as.character(Part))
ggplot(gather , aes(x = Species, y = Numbers,fill = as.factor(Part))) +
geom_bar(position="dodge",stat="identity", color = 'black') +
scale_fill_brewer() +
theme_bw() +
guides(fill = guide_legend(title = "Part")) +
theme(legend.position = "top") +
geom_text(aes(label = Numbers), position = position_dodge(width = 0.9), vjust = -0.5, hjust = .5 ,size = 3)
df4 %>% arrange(mean_Sepal_Length)
Source: local data frame [3 x 5]
Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1 setosa 5.006 3.428 1.462
2 versicolor 5.936 2.770 4.260
3 virginica 6.588 2.974 5.552
Variables not shown: mean_Petal_Width (dbl)
## Order rows by values high to low.
df4 %>% arrange(desc(mean_Sepal_Length))
Source: local data frame [3 x 5]
Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1 virginica 6.588 2.974 5.552
2 versicolor 5.936 2.770 4.260
3 setosa 5.006 3.428 1.462
Variables not shown: mean_Petal_Width (dbl)
## a subset of rows that contain two species.
df4 %>% filter(Species == "virginica" | Species == "setosa")
Source: local data frame [2 x 5]
Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1 setosa 5.006 3.428 1.462
2 virginica 6.588 2.974 5.552
Variables not shown: mean_Petal_Width (dbl)
## rename a column name from Speices to New Speices.
df5 <- df4 %>%
filter(Species == "virginica" | Species == "setosa") %>%
arrange(desc(mean_Sepal_Length)) %>%
rename(New_Species = Species)
df5
Source: local data frame [2 x 5]
New_Species mean_Sepal_Length mean_Sepal_Width mean_Petal_Length
1 virginica 6.588 2.974 5.552
2 setosa 5.006 3.428 1.462
Variables not shown: mean_Petal_Width (dbl)
head(iris)
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
## create a new column that fill in the mean of Sepal Length for each species.
Sepal_Length <- iris %>%
group_by(Species) %>%
mutate(Average_Sepal_Length = mean(Sepal_Length),
Above_average = Sepal_Length > Average_Sepal_Length) %>%
select(Species, Sepal_Length, Average_Sepal_Length, Above_average)
head(Sepal_Length)
Source: local data frame [6 x 4]
Groups: Species
Species Sepal_Length Average_Sepal_Length Above_average
1 setosa 5.1 5.006 TRUE
2 setosa 4.9 5.006 FALSE
3 setosa 4.7 5.006 FALSE
4 setosa 4.6 5.006 FALSE
5 setosa 5.0 5.006 FALSE
6 setosa 5.4 5.006 TRUE
## see how many species is above average Sepal Length for each group.
Sepal_Length %>%
group_by(Species, Above_average) %>%
summarise(length(Species))
Source: local data frame [6 x 3]
Groups: Species
Species Above_average length(Species)
1 setosa FALSE 28
2 setosa TRUE 22
3 versicolor FALSE 26
4 versicolor TRUE 24
5 virginica FALSE 28
6 virginica TRUE 22