Data Analysis with R

Starting up

Setting the working directory and loading the libraries.

# Show which directory the session is currently running from.
print(getwd())

## [1] "C:/Users/tomas/Downloads/eda-course-materials"

# Make the course-materials folder the working directory for this session.
setwd(dir = "C://Users//tomas//Downloads//eda-course-materials")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(dslabs)
library(readr)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(reshape2)

Accessing the data set.

# Read the tab-separated pseudo-Facebook data file into `pf`.
pf <- read_tsv(
  file = "C://Users//tomas//Downloads//eda-course-materials//lesson3//pseudo_facebook.tsv"
)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   userid = col_double(),
##   age = col_double(),
##   dob_day = col_double(),
##   dob_year = col_double(),
##   dob_month = col_double(),
##   gender = col_character(),
##   tenure = col_double(),
##   friend_count = col_double(),
##   friendships_initiated = col_double(),
##   likes = col_double(),
##   likes_received = col_double(),
##   mobile_likes = col_double(),
##   mobile_likes_received = col_double(),
##   www_likes = col_double(),
##   www_likes_received = col_double()
## )

Scatterplots

To create a scatterplot, we add the x and y variables to qplot()

# Quick scatterplot: one point per row of pf, age on x and friend_count on y.
qplot(x = age, y = friend_count, data = pf)

ggplot() syntax is more specialized.
ggplot(data=DATAFRAME, aes(x=VARIABLE, y=VARIABLE)). aes stands for aesthetics; inside aes() you map variables to visual properties such as position, color, and size.
Then you can start adding layers with +, for example:
geom_point(), xlim(#,#)

# The same scatterplot built layer by layer; xlim() limits the x axis to
# ages 13-99, and rows outside that range are removed with a warning.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 99)

## Warning: Removed 4012 rows containing missing values (geom_point).

The alpha parameter inside geom_FUNCTION() sets the transparency. geom_jitter() also gives a better view of many points when they overlap.

# Jitter the points and draw them at 1/20 opacity to reduce overplotting,
# then put the y axis on a square-root scale.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 99) +
  scale_y_sqrt()

## Warning: Removed 4284 rows containing missing values (geom_point).

dplyr

Using the pipe (%>%) we can modify dataframes and create new ones.
group_by() groups the rows of the data frame by the values of a variable, summarise(NAME=FUNCTION) creates new summary variables by applying functions to each group, and arrange() sorts the resulting data frame by a variable.
geom_line connects the dots of geom_point.

# Per-age summary of friend_count: mean, median, and number of rows,
# sorted by age.
friends_by_age_group <- pf %>%
  group_by(age) %>%
  summarise(
    friends_mean = mean(friend_count),
    friends_median = median(as.numeric(friend_count)),
    count = n()
  ) %>%
  arrange(age)

## `summarise()` ungrouping output (override with `.groups` argument)

  # Print the first 20 rows of the per-age summary.
  friends_by_age_group %>% head(n = 20)

## # A tibble: 20 x 4
##      age friends_mean friends_median count
##    <dbl>        <dbl>          <dbl> <int>
##  1    13         165.           74     484
##  2    14         251.          132    1925
##  3    15         348.          161    2618
##  4    16         352.          172.   3086
##  5    17         350.          156    3283
##  6    18         331.          162    5196
##  7    19         334.          157    4391
##  8    20         283.          135    3769
##  9    21         236.          121    3671
## 10    22         211.          106    3032
## 11    23         203.           93    4404
## 12    24         186.           92    2827
## 13    25         131.           62    3641
## 14    26         144.           75    2815
## 15    27         134.           72    2240
## 16    28         126.           66    2364
## 17    29         121.           66    1936
## 18    30         115.           67.5  1716
## 19    31         118.           63    1694
## 20    32         114.           63    1443

  # Line plot of mean friend count by age, with x ticks every 5 years and the
  # axis limited to ages 13-99.
  ggplot(friends_by_age_group, aes(x = age, y = friends_mean)) +
    geom_line() +
    scale_x_continuous(
      breaks = seq(from = 0, to = 99, by = 5),
      limits = c(13, 99)
    )

## Warning: Removed 14 row(s) containing missing values (geom_path).

In order to overlay multiple plots together, you can add them as a layer.
stat="summary" computes a summary of y, such as the mean, at each value of x and adds it to the plot.
geom_quantile(quantiles=PROBABILITIES, method="rqss", lambda=0.1) adds quantile lines of the y data to the plot; method="rqss" smooths them, and lambda controls the amount of smoothing.

library(quantreg)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

# Jittered scatterplot of friend_count against age with three overlays:
#   * the mean friend_count at each age (solid line),
#   * the 10th and 90th percentiles (dashed blue), and
#   * the median (dark blue).
# The quantile curves are fitted with method = "rqss" and lambda = 0.1.
# coord_trans() sets the displayed range to ages 13-80 and 0-1000 friends.
ggplot(data = pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20, color = "orange") +
  coord_trans(xlim = c(13, 80), ylim = c(0, 1000)) +
  # `fun` takes the summary function itself (not a computed result); the
  # older `fun.y` argument name is ignored by current ggplot2, which then
  # silently falls back to its default summary.
  geom_line(stat = "summary", fun = mean) +
  geom_quantile(
    quantiles = c(0.1, 0.9), method = "rqss", lambda = 0.1,
    lty = "dashed", color = "blue"
  ) +
  geom_quantile(quantiles = 0.5, method = "rqss", lambda = 0.1, color = "darkblue")

## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)

To divide the age bins further, we can create a new data frame with age computed to the month, and measure the correlation with cor.test(VARIABLE 1, VARIABLE 2, method="pearson", "kendall", or "spearman").
geom_smooth() adds a smoothed line showing the trend of the data.

library(gridExtra)

# Fractional age: whole years plus a month-based fraction of a year
# (dob_month 12 adds 0, dob_month 1 adds 11/12).
pf$age_with_months <- pf$age + (12 - pf$dob_month) / 12

# Mean, median, and row count of friend_count for each fractional age,
# ordered by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

## `summarise()` ungrouping output (override with `.groups` argument)

# p1: mean friend count by fractional age (below 71), drawn as a line with a
# smoother layered on top.
p1 <- pf.fc_by_age_months %>%
  filter(age_with_months < 71) %>%
  ggplot(aes(x = age_with_months, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p2: jittered scatterplot of friend_count against whole-year age (below 71)
# with the per-age mean, the 10th/50th/90th percentile curves, and a
# smoother. coord_trans() sets the displayed y range to 0-500.
p2 <- ggplot(data = subset(pf, age < 71), aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20, color = "orange") +
  coord_trans(ylim = c(0, 500)) +
  # `fun` takes the summary function itself (not a computed result); the
  # older `fun.y` argument name is ignored by current ggplot2, which then
  # silently falls back to its default summary.
  geom_line(stat = "summary", fun = mean) +
  geom_quantile(
    quantiles = c(0.1, 0.9), method = "rqss", lambda = 0.1,
    lty = "dashed", color = "blue"
  ) +
  geom_quantile(quantiles = 0.5, method = "rqss", lambda = 0.1, color = "darkblue") +
  geom_smooth()

# Stack the by-month plot above the by-year plot in a single column.
grid.arrange(p1, p2, ncol = 1)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Pearson correlation test between age and friend_count.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737