Exploring many variables

Starting up

First set up your libraries.

# Work from the course-materials folder (the path is specific to this machine).
setwd(dir = "C:/Users/tomas/Downloads/eda-course-materials")

library(dslabs)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(reshape2)
library(quantreg)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
# Load the tab-separated pseudo-Facebook data set into `pf`.
# Forward slashes need no escaping in R strings, so the path uses single "/"
# separators, consistent with the other paths in this file.
pf <- read_tsv(
  "C:/Users/tomas/Downloads/eda-course-materials/lesson3/pseudo_facebook.tsv"
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   userid = col_double(),
##   age = col_double(),
##   dob_day = col_double(),
##   dob_year = col_double(),
##   dob_month = col_double(),
##   gender = col_character(),
##   tenure = col_double(),
##   friend_count = col_double(),
##   friendships_initiated = col_double(),
##   likes = col_double(),
##   likes_received = col_double(),
##   mobile_likes = col_double(),
##   mobile_likes_received = col_double(),
##   www_likes = col_double(),
##   www_likes_received = col_double()
## )

Adding a third variable

You can group_by() multiple variables at the same time, then add the extra variable to a plot by mapping it to color inside aes().

# Mean/median friend count and group size for every age-gender combination.
# Rows with a missing gender are dropped before grouping, and
# `.groups = "drop"` returns an ungrouped result so later steps do not inherit
# the leftover `age` grouping (it also silences summarise()'s regrouping
# message).
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n(),
    .groups = "drop"
  )

# One line per gender: median friend count across age.
ggplot(data = pf.fc_by_age_gender, aes(x = age, y = median_friend_count)) +
  geom_line(aes(color = gender))

Reshape data

The dcast() function reshapes a data frame, creating one new column for each value of a chosen variable.
dcast(DATAFRAME, VARIABLE TO KEEP ~ VARIABLE TO SPLIT, value.var = "VARIABLE WITH VALUES FOR THE NEW COLUMNS"). This way we can go from a long data frame to a wide one.
By dividing one of the new columns by another we can create a ratio.
geom_hline(yintercept = VALUE) adds a horizontal reference line that crosses the y axis at VALUE.

# Long -> wide: one row per age and one column per gender, each cell holding
# the median friend count.
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = "median_friend_count"
)

# Peek at the first rows of the reshaped table.
head(x = pf.fc_by_age_gender.wide)
##   age female  male
## 1  13  148.0  55.0
## 2  14  224.0  92.5
## 3  15  276.0 106.5
## 4  16  258.5 136.0
## 5  17  245.5 125.0
## 6  18  243.0 122.0
# Ratio of the female to the male median friend count by age; the dashed
# line at 1 marks where the two medians are equal.
ggplot(
  data = pf.fc_by_age_gender.wide,
  mapping = aes(x = age, y = female / male)
) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = "dashed", alpha = 0.5)

This method can be used to add as many variables as needed to a plot.
The function cut(VARIABLE, breaks=CUTS) can be used to create subgroups of a variable.
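geom_line(stat = "summary", fun = FUNCTION) adds a new line showing any summary function of the y variable.
Rounding a variable to the nearest multiple of a bin width, BIN * round(VARIABLE / BIN), pools nearby values together and smooths out a plot. Note that the second argument of round(VARIABLE, DIGITS) is the number of decimal places to keep, not a bin width.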

# Approximate the year each user joined: tenure divided by 365 is subtracted
# from 2014 and the result is floored.
pf$year_joined <- floor(2014 - pf$tenure / 365)

# Bin the join year into (2004, 2009], (2009, 2011], (2011, 2012] and
# (2012, 2013]; values outside these breaks become NA.
# NOTE(review): a year_joined of 2014 falls outside the last break -- confirm
# that leaving it out is intended.
pf$year_joined.bucket <- cut(
  x = pf$year_joined,
  breaks = c(2004, 2009, 2011, 2012, 2013)
)

# Median friend count by age for each join-year bucket (coloured lines), with
# the overall mean by age overlaid as a dashed line. Rows without a bucket
# are excluded.
pf %>%
  subset(!is.na(year_joined.bucket)) %>%
  ggplot(mapping = aes(x = age, y = friend_count)) +
  geom_line(
    mapping = aes(color = year_joined.bucket),
    stat = "summary",
    fun = median
  ) +
  geom_line(stat = "summary", fun = mean, linetype = "dashed")

# Friendships initiated per unit of tenure, smoothed per join-year bucket.
# The second argument of round() is `digits`, so `round(tenure, 5)` only
# rounded to 5 decimal places and did not bin anything. Binning tenure to the
# nearest multiple of 5 is written as `5 * round(tenure / 5)`.
# Rows with tenure below 1 are dropped so the ratio never divides by zero.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = 5 * round(tenure / 5), y = friendships_initiated / tenure)
) +
  geom_smooth(aes(color = year_joined.bucket))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Creating a sample

To create a reproducible sample you first set a seed with set.seed(SEED NUMBER), so the same random draw is produced on every run.
Then, with the function sample(VECTOR OR VARIABLE, # OF SAMPLES) you create a sample with a number of random elements from your variable.
The function transform(DATAFRAME, NEW VARIABLE=TRANSFORMATION OF OLD VARIABLES) can create new variables by using the existing ones in the dataframe.
To add points to a plot use + geom_point(aes(size = VARIABLE), pch = # FROM 0 TO 25).

# Read yogurt.csv (comma-separated) into `yo`.
yo <- read_csv(
  file = "C:/Users/tomas/Downloads/eda-course-materials/lesson5/yogurt.csv"
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   obs = col_double(),
##   id = col_double(),
##   time = col_double(),
##   strawberry = col_double(),
##   blueberry = col_double(),
##   pina.colada = col_double(),
##   plain = col_double(),
##   mixed.berry = col_double(),
##   price = col_double()
## )
# Store `id` as a factor so that its levels can be sampled below.
yo$id <- factor(x = yo$id)

# Fix the RNG seed so the same ids are drawn on every run.
set.seed(seed = 16)

# Draw 32 id levels at random (without replacement).
sample.ids <- sample(x = levels(yo$id), size = 32)

# Add `all.purchases`: the row-wise sum of the strawberry, blueberry,
# pina.colada, plain and mixed.berry columns.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)

# Price over time for the sampled ids, one panel per id laid out in 4
# columns; point size is mapped to all.purchases.
ggplot(
  data = subset(yo, id %in% sample.ids),
  mapping = aes(x = time, y = price)
) +
  geom_line() +
  geom_point(mapping = aes(size = all.purchases), shape = 21) +
  facet_wrap(facets = ~id, ncol = 4)

Sometimes the best way to explore a dataset is to let the relationships between the variables speak for themselves; a plot matrix lets us do this.
In the package GGally, the function ggpairs(DATAFRAME) creates a matrix of plots relating all of the variables in the dataframe to each other.
sample.int(nrow(DATAFRAME), # OF SAMPLES) draws random row numbers, which can be used to sample whole rows and so keep all of the variables together.

library(GGally) 
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Fix the RNG seed so the same rows are sampled on every run.
set.seed(seed = 16)

# Keep only the columns that should appear in the plot matrix.
pf_subset <- select(
  pf,
  age, dob_year, gender, tenure,
  friend_count, friendships_initiated, likes, mobile_likes
)

# Plot matrix for a random sample of 1000 rows.
ggpairs(data = pf_subset[sample.int(n = nrow(pf_subset), size = 1000), ])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (stat_boxplot).

## Warning: Removed 4 rows containing missing values (stat_boxplot).

## Warning: Removed 4 rows containing missing values (stat_boxplot).

## Warning: Removed 4 rows containing missing values (stat_boxplot).

## Warning: Removed 4 rows containing missing values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.