First set up your libraries.
# NOTE(review): hard-coded, machine-specific working directory. The read_tsv()
# and read_csv() calls visible in this file use absolute paths, so confirm
# whether anything still relies on this before keeping it.
setwd("C:/Users/tomas/Downloads/eda-course-materials")
library(dslabs)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(reshape2)
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
# Load the tab-separated pseudo-Facebook data set into `pf`.
pf <- read_tsv(
  file = "C://Users//tomas//Downloads//eda-course-materials//lesson3//pseudo_facebook.tsv"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## userid = col_double(),
## age = col_double(),
## dob_day = col_double(),
## dob_year = col_double(),
## dob_month = col_double(),
## gender = col_character(),
## tenure = col_double(),
## friend_count = col_double(),
## friendships_initiated = col_double(),
## likes = col_double(),
## likes_received = col_double(),
## mobile_likes = col_double(),
## mobile_likes_received = col_double(),
## www_likes = col_double(),
## www_likes_received = col_double()
## )
You can group_by() several variables at the same time. To show the extra grouping variable in a plot, map it to color inside aes().
# Mean and median friend count, plus group size, for every age x gender pair.
# Rows with a missing gender are removed *before* grouping so that no
# NA-gender groups are ever formed, and the result is explicitly ungrouped so
# later steps do not silently inherit the leftover `age` grouping.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n()
  ) %>%
  ungroup()
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
# Median friend count against age, drawn as one coloured line per gender.
pf.fc_by_age_gender %>%
  ggplot(mapping = aes(x = age, y = median_friend_count)) +
  geom_line(mapping = aes(colour = gender))
The dcast() function can create new variables from values in a dataframe.
dcast(DATAFRAME, VARIABLE TO KEEP ~ VARIABLE TO SPLIT, value.var = "VARIABLE WITH VALUES FOR THE NEW COLUMNS") turns a long data frame into a wide one: each distinct value of the split variable becomes its own column.
By dividing two variables we can create a ratio.
geom_hline(yintercept = VALUE) adds a horizontal reference line that crosses the y axis at VALUE.
# Reshape long -> wide: one row per age, one column per gender value, with
# the median friend count as the cell contents.
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = "median_friend_count"
)
head(pf.fc_by_age_gender.wide)
## age female male
## 1 13 148.0 55.0
## 2 14 224.0 92.5
## 3 15 276.0 106.5
## 4 16 258.5 136.0
## 5 17 245.5 125.0
## 6 18 243.0 122.0
# Ratio of the female to the male median friend count at each age; the
# semi-transparent dashed line at 1 marks where the two medians are equal.
pf.fc_by_age_gender.wide %>%
  ggplot(mapping = aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dashed", alpha = 0.5)
This method can be used to add as many variables as needed to a plot.
The function cut(VARIABLE, breaks=CUTS) can be used to create subgroups of a variable.
geom_line(stat = "summary", fun = FUNCTION) draws a line through a summary (for example the mean or median) of the y variable computed at each x value.
The function round(VARIABLE, DIGITS) rounds to DIGITS decimal places. To pool a variable into ranges of width W, use W * round(VARIABLE / W); grouping nearby x values this way smooths out a plot.
# Derive the join year by counting tenure back from 2014 (tenure / 365 turns
# it into years; assumes tenure is recorded in days - TODO confirm), then bin
# it. cut() uses right-closed intervals by default, so the buckets are
# (2004,2009], (2009,2011], (2011,2012], (2012,2013]; anything outside is NA.
pf <- pf %>%
  mutate(
    year_joined = floor(2014 - (tenure / 365)),
    year_joined.bucket = cut(
      year_joined,
      breaks = c(2004, 2009, 2011, 2012, 2013)
    )
  )
# Friend count against age for users with a known join-year bucket:
# one coloured median line per bucket plus a dashed line for the overall mean.
pf %>%
  subset(!is.na(year_joined.bucket)) %>%
  ggplot(mapping = aes(x = age, y = friend_count)) +
  geom_line(
    mapping = aes(colour = year_joined.bucket),
    stat = "summary",
    fun = median
  ) +
  geom_line(stat = "summary", fun = mean, linetype = "dashed")
# Friendships initiated per unit of tenure, smoothed separately for each
# join-year bucket. Rows with tenure < 1 are excluded so the ratio never
# divides by zero.
# round(tenure, 5) only rounds to 5 decimal places and therefore pools
# nothing; 5 * round(tenure / 5) snaps tenure to the nearest multiple of 5 so
# neighbouring values share one x position.
ggplot(data = subset(pf, tenure >= 1),
       aes(5 * round(tenure / 5), friendships_initiated / tenure)) +
  geom_smooth(aes(color = year_joined.bucket))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
To create a sample you first set a seed with set.seed(# OF SEED).
Then, with the function sample(VECTOR OR VARIABLE, # OF SAMPLES) you create a sample with a number of random elements from your variable.
The function transform(DATAFRAME, NEW VARIABLE=TRANSFORMATION OF OLD VARIABLES) can create new variables by using the existing ones in the dataframe.
To add points to a plot use + geom_point(aes(size = VARIABLE), pch = # FROM 0 TO 25).
# Load the comma-separated yogurt data set into `yo`.
yo <- read_csv(
  file = "C:/Users/tomas/Downloads/eda-course-materials/lesson5/yogurt.csv"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## obs = col_double(),
## id = col_double(),
## time = col_double(),
## strawberry = col_double(),
## blueberry = col_double(),
## pina.colada = col_double(),
## plain = col_double(),
## mixed.berry = col_double(),
## price = col_double()
## )
# Treat `id` as a categorical variable rather than a number.
yo[["id"]] <- factor(yo[["id"]])

# Fix the RNG seed so the same 32 ids are drawn on every run.
set.seed(16)
sample.ids <- sample(levels(yo[["id"]]), size = 32)

# Add the row-wise total of the five per-flavour columns.
yo <- transform(
  yo,
  all.purchases = (strawberry + blueberry + pina.colada + plain + mixed.berry)
)
# Price over time for the sampled ids, one facet per id in a 4-column grid;
# point size shows how many items were bought in that observation.
subset(yo, id %in% sample.ids) %>%
  ggplot(mapping = aes(x = time, y = price)) +
  geom_line() +
  geom_point(mapping = aes(size = all.purchases), shape = 21) +
  facet_wrap(facets = ~id, ncol = 4)
Sometimes the best way to explore a dataset is to let the relationships between the variables speak for themselves. A plot matrix lets us do this.
In the package GGally, the function ggpairs(DATAFRAME) creates a matrix of plots relating every pair of variables in the data frame.
sample.int(nrow(DATAFRAME), # OF SAMPLES) draws random row numbers; indexing the data frame with them, as in DATAFRAME[ROWS, ], samples whole rows, so all variables are sampled together.
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Fix the RNG seed so the same 1000 rows are drawn on every run.
set.seed(16)

# Keep only the columns that should appear in the plot matrix.
pf_subset <- select(
  pf,
  age, dob_year, gender, tenure, friend_count,
  friendships_initiated, likes, mobile_likes
)

# Plot matrix over a random sample of 1000 rows.
ggpairs(data = pf_subset[sample.int(n = nrow(pf_subset), size = 1000), ])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (stat_boxplot).
## Warning: Removed 4 rows containing missing values (stat_boxplot).
## Warning: Removed 4 rows containing missing values (stat_boxplot).
## Warning: Removed 4 rows containing missing values (stat_boxplot).
## Warning: Removed 4 rows containing missing values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.