What I learned in this course: scatter plots, conditional means, and the correlation coefficient.

library(ggplot2)
# Load the tab-separated pseudo-Facebook user data into `pf`.
pf <- read.delim("pseudo_facebook.tsv")

scatter plot

# Quick scatter plot of friend count against age (x and y passed by position).
qplot(age, friend_count, data = pf)

# qplot(x = age, y = friend_count, data = pf)  # naming x and y also works

ggplot syntax

# The same scatter plot written with the layered ggplot() syntax.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()

# ggplot(data = pf, mapping = aes(x = age, y = friend_count)) + geom_line()
# Summary statistics of age, printed below.
summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
# Restrict the x axis to ages 13 through 90; rows outside the range are dropped.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

overplotting

# Make points mostly transparent (alpha = 1/20) so dense regions stand out.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Jitter the points as well, since age is recorded in whole years.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
## Warning: Removed 5155 rows containing missing values (geom_point).

coord_trans solution

# Square-root transform of the y axis. Jitter is applied horizontally only
# (height = 0), so friend_count values themselves are not perturbed.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5200 rows containing missing values (geom_point).

Friendships initiated vs Age

# Raw scatter plot of friendships initiated against age.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point()

# Same plot with transparency, horizontal-only jitter and ages 13 through 90.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90)
## Warning: Removed 5172 rows containing missing values (geom_point).

# Friendships initiated against age on a square-root y axis.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5175 rows containing missing values (geom_point).

Conditional means

#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Step-by-step version: group users by age, then compute the mean, the median
# and the number of users for each age, sorted by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

# Pipeline version of the same computation; it overwrites the table above.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age, n = 20)
## # A tibble: 20 × 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
## 1     13          164.7500                74.0   484
## 2     14          251.3901               132.0  1925
## 3     15          347.6921               161.0  2618
## 4     16          351.9371               171.5  3086
## 5     17          350.3006               156.0  3283
## 6     18          331.1663               162.0  5196
## 7     19          333.6921               157.0  4391
## 8     20          283.4991               135.0  3769
## 9     21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
## 11    23          202.8426                93.0  4404
## 12    24          185.7121                92.0  2827
## 13    25          131.0211                62.0  3641
## 14    26          144.0082                75.0  2815
## 15    27          134.1473                72.0  2240
## 16    28          125.8354                66.0  2364
## 17    29          120.8182                66.0  1936
## 18    30          115.2080                67.5  1716
## 19    31          118.4599                63.0  1694
## 20    32          114.2800                63.0  1443

Plot average friend_count against age

# Mean friend count per age as a line, on a square-root y axis.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 23 rows containing missing values (geom_path).

# Mean friend count per age as a line, on a log10 y axis.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() +
  xlim(13, 90) +
  coord_trans(y = "log10")
## Warning: Removed 23 rows containing missing values (geom_path).

# Mean against median friend count, one point per age (drawn twice, as in the
# original notes).
ggplot(
  data = pf.fc_by_age,
  mapping = aes(x = friend_count_median, y = friend_count_mean)
) +
  geom_point()

ggplot(
  data = pf.fc_by_age,
  mapping = aes(x = friend_count_median, y = friend_count_mean)
) +
  geom_point()

# Mean friend count by age, as a line and then as points.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_point()

# Median friend count by age.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_median)) +
  geom_point()

Overlaying summaries with raw data

# Raw points (orange) with the per-age mean drawn on top as a line.
# NOTE(review): `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`;
# it is kept here to match the ggplot2 version this output was rendered with.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  xlim(13, 90) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    colour = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean)
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5173 rows containing missing values (geom_point).

adding quantiles

# Raw points plus the per-age mean (solid), the 10% and 90% quantiles (dashed
# blue) and the median (dashed red).
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  xlim(13, 90) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    colour = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "red"
  )
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5180 rows containing missing values (geom_point).

Limiting and zooming into the data

# Same overlay limited to ages 13 through 70 and y values 0 through 1000.
# xlim()/ylim() drop rows outside the limits before the summaries are computed
# (see the "Removed ... rows" warnings below).
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  xlim(13, 70) +
  ylim(0, 1000) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    colour = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "red"
  )
## Warning: Removed 8693 rows containing non-finite values (stat_summary).

## Warning: Removed 8693 rows containing non-finite values (stat_summary).

## Warning: Removed 8693 rows containing non-finite values (stat_summary).

## Warning: Removed 8693 rows containing non-finite values (stat_summary).
## Warning: Removed 9119 rows containing missing values (geom_point).

Correlation methods

# ?cor.test  # opens the documentation
# Pearson correlation between age and friend count over all users.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Same test, using with() so the columns can be named without the `pf$` prefix.
with(pf, cor.test(x = age, y = friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Pearson correlation restricted to users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245
# Spearman rank correlation for the same subset (users aged 70 or younger).
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "spearman")
)
## Warning in cor.test.default(age, friend_count, method = "spearman"): Cannot
## compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2552934

Create a scatterplot of likes_received (y)

vs. www_likes_received (x). Use any of the

techniques that you’ve learned so far to

modify the plot.

# likes_received against www_likes_received with mean and quantile overlays.
# NOTE(review): xlim(13, 70) and ylim(0, 1000) are the limits used for the age
# plots earlier in this file; confirm they are intended for like counts, since
# they discard most rows (see the warnings below).
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  xlim(13, 70) +
  ylim(0, 1000) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0),
    colour = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "red"
  )
## Warning: Removed 81243 rows containing non-finite values (stat_summary).

## Warning: Removed 81243 rows containing non-finite values (stat_summary).

## Warning: Removed 81243 rows containing non-finite values (stat_summary).

## Warning: Removed 81243 rows containing non-finite values (stat_summary).
## Warning: Removed 81753 rows containing missing values (geom_point).

# Pearson correlation between web likes received and total likes received.
cor.test(
  x = pf$www_likes_received,
  y = pf$likes_received,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902
# Same test written with with().
with(
  pf,
  cor.test(x = www_likes_received, y = likes_received, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902
# Pearson correlation of the like counts for users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(www_likes_received, likes_received, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 928.32, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9504075 0.9516488
## sample estimates:
##      cor 
## 0.951032
# Spearman rank correlation of the like counts for users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(www_likes_received, likes_received, method = "spearman")
)
## Warning in cor.test.default(www_likes_received, likes_received, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  www_likes_received and likes_received
## S = 9.6868e+12, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9229515

Simple plot (www_likes_received vs likes_received)

# Plain scatter plot of total likes against web likes.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point()

# Zoom both axes to the 95th percentile and add a red linear-model fit.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

# The strong correlation is not surprising, because one variable is a subset
# of the other.
with(
  pf,
  cor.test(x = www_likes_received, y = likes_received, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

more caution with correlation

#install.packages('alr3')
library(alr3)
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set (Month and Temp columns), open its help page and
# print the first 20 rows.
data("Mitchell")
?Mitchell
head(Mitchell, 20L)
##    Month     Temp
## 1      0 -5.18333
## 2      1 -1.65000
## 3      2  2.49444
## 4      3 10.40000
## 5      4 14.99440
## 6      5 21.71670
## 7      6 24.74440
## 8      7 24.07220
## 9      8 18.86110
## 10     9  9.16667
## 11    10  1.54444
## 12    11 -4.00556
## 13    12 -7.47778
## 14    13 -4.00556
## 15    14  3.00000
## 16    15 10.96110
## 17    16 18.02220
## 18    17 22.72780
## 19    18 26.26110
## 20    19 21.60560
# Temperature against month index.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

# qplot(Temp, Month, data = Mitchell)

# Pearson correlation between month index and temperature.
cor.test(x = Mitchell$Month, y = Mitchell$Temp, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making sense of data

# Put an x-axis break every 12 months so each year is marked; points first,
# then the same data as a line.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp * 0.1)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 204, by = 12))

ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp * 0.1)) +
  geom_line() +
  scale_x_continuous(breaks = seq(from = 0, to = 204, by = 12))

You could also get perspective on this data by overlaying each year’s data on top of each other, giving a clear, generally sinusoidal graph. You can do this by using R’s modulus operator %% in your code. Try running the code below!

Note: The nature of the data should suggest the shape of the graph.

# Fold every year onto the same 0-11 month axis with the modulus operator.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()

Understanding noise: age to age with months. Understanding the noise in the data.

# Age in fractional years: add (12 - dob_month) / 12 to the integer age
# (assuming dob_month is 1..12, this adds 11/12 for January down to 0 for
# December).
# The parentheses are essential. Without them `12 - dob_month / 12` divides
# only dob_month by 12 and inflates every age by roughly 11 to 12 years.
# Equivalent form: pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)
# pf$age_with_months <- pf$age_with_months * 12
# Mean, median and user count of friend_count for each fractional age, sorted
# by age_with_months.
pf.fc_by_age_wuth_months <- pf %>%
  group_by(age_with_months) %>%
  summarize(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

Compare age and age with months. By increasing the bin size, we can estimate the mean more precisely, but we potentially miss important features of the age relationship. This plot is an example of the bias-variance trade-off.

# p1: mean friend count by age measured in months (ages below 71).
p1 <- ggplot(
  data = subset(pf.fc_by_age_wuth_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p2: mean friend count by age in whole years (ages below 71).
p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()


# p3: mean friend count with age rounded to the nearest multiple of 5.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the three plots in a single column for comparison.
grid.arrange(p1, p2, p3, ncol = 1)

A flexible statistical model can smooth our estimates of conditional means. ggplot makes it easier to fit such models using geom_smooth().

# Rebuild p1 and p2 with a smoother (geom_smooth defaults) over each line.
p1 <- ggplot(
  data = subset(pf.fc_by_age_wuth_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

p2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# Show the two smoothed plots one above the other.
grid.arrange(p1, p2, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'