What I learned in this course: scatter plots, conditional means, and the correlation coefficient
library(ggplot2)
# Load the pseudo-Facebook user data from a tab-separated file.
pf <- read.delim("pseudo_facebook.tsv")
scatter plot
# Quick scatter plot of friend count against age.
# The fully named form qplot(x = age, y = friend_count, data = pf) draws the
# same plot.
qplot(age, friend_count, data = pf)
ggplot syntax
# The same scatter plot written with the full ggplot() grammar.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()
# Variant that joins the observations with a line instead:
# ggplot(pf, aes(x = age, y = friend_count)) + geom_line()

# Distribution summary of the age column.
summary(pf[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
# Same scatter plot with the x axis limited to ages 13 through 90.
ggplot(pf, aes(age, friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
overplotting
# Reduce overplotting with transparency: at alpha = 1/20 it takes 20
# overlapping points to reach full opacity.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Add random jitter on top of the transparency so that points sharing the
# same integer age no longer stack in vertical columns.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05) +
  xlim(13, 90)
## Warning: Removed 5155 rows containing missing values (geom_point).
coord_trans solution
# Square-root transform the y axis. Jitter is horizontal only (height = 0),
# which leaves the y values unchanged so none can be pushed below zero,
# where sqrt is undefined.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    position = position_jitter(height = 0),
    alpha = 0.05
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5200 rows containing missing values (geom_point).
Friendships initiated vs age
# Friendships initiated against age: the raw scatter plot first ...
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point()

# ... then with transparency, horizontal-only jitter and ages 13 through 90.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    position = position_jitter(height = 0),
    alpha = 0.05
  ) +
  xlim(13, 90)
## Warning: Removed 5172 rows containing missing values (geom_point).
# Same plot as above with a square-root transformed y axis.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    position = position_jitter(height = 0),
    alpha = 0.05
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5175 rows containing missing values (geom_point).
Conditional means
#install.packages('dplyr')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Step-by-step version: group the users by age, compute the mean and median
# friend count plus the group size for each age, then sort by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)
# The same per-age table built as a single %>% pipeline; this overwrites the
# pf.fc_by_age computed step by step above.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Show the first 20 age groups.
head(pf.fc_by_age, n = 20)
## # A tibble: 20 × 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
## 11 23 202.8426 93.0 4404
## 12 24 185.7121 92.0 2827
## 13 25 131.0211 62.0 3641
## 14 26 144.0082 75.0 2815
## 15 27 134.1473 72.0 2240
## 16 28 125.8354 66.0 2364
## 17 29 120.8182 66.0 1936
## 18 30 115.2080 67.5 1716
## 19 31 118.4599 63.0 1694
## 20 32 114.2800 63.0 1443
Plot average friend_count and the age
# Mean friend count per age as a line, on a square-root y axis.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line() +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 23 rows containing missing values (geom_path).
# Mean friend count per age as a line, on a log10 y axis.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line() +
  xlim(13, 90) +
  coord_trans(y = "log10")
## Warning: Removed 23 rows containing missing values (geom_path).
# Per-age mean against per-age median friend count (drawn twice, as in the
# original notebook).
ggplot(pf.fc_by_age, aes(x = friend_count_median, y = friend_count_mean)) +
  geom_point()
ggplot(pf.fc_by_age, aes(x = friend_count_median, y = friend_count_mean)) +
  geom_point()

# Mean friend count by age, as a line and then as points.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_point()

# Median friend count by age.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_median)) +
  geom_point()
Overlaying summaries with raw data
# Raw points (orange, transparent, horizontally jittered) with the per-age
# mean drawn on top as a line.
# NOTE(review): newer ggplot2 releases deprecate `fun.y` in favour of `fun`;
# confirm the installed version before changing it.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  xlim(13, 90) +
  geom_point(
    colour = "orange",
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean)
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5173 rows containing missing values (geom_point).
adding quantiles
# Raw points plus four per-age summary lines: the mean (solid), the 90th and
# 10th percentiles (dashed blue) and the median (dashed red).
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  xlim(13, 90) +
  geom_point(
    colour = "orange",
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "red"
  )
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5180 rows containing missing values (geom_point).
Limiting and zooming into the data
# Same overlay restricted to ages 13-70 and 0-1000 friendships initiated.
# xlim()/ylim() drop out-of-range rows before the summaries are computed,
# as the "Removed ... rows" warnings below show.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  xlim(13, 70) +
  ylim(0, 1000) +
  geom_point(
    colour = "orange",
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "red"
  )
## Warning: Removed 8693 rows containing non-finite values (stat_summary).
## Warning: Removed 8693 rows containing non-finite values (stat_summary).
## Warning: Removed 8693 rows containing non-finite values (stat_summary).
## Warning: Removed 8693 rows containing non-finite values (stat_summary).
## Warning: Removed 9119 rows containing missing values (geom_point).
Correlation methods
# Run ?cor.test to open the documentation.
# Pearson correlation between age and friend count over all users.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Same test through the formula interface, so the columns are looked up in pf.
cor.test(~ age + friend_count, data = pf, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Pearson correlation restricted to users aged 70 or younger.
cor.test(
  ~ age + friend_count,
  data = pf,
  subset = age <= 70,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
# Spearman rank correlation for users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "spearman")
)
## Warning in cor.test.default(age, friend_count, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2552934
# likes_received against www_likes_received with mean and quantile lines.
# NOTE(review): xlim(13, 70) and ylim(0, 1000) are the limits used for the
# age plots earlier in this file; applied to two like counts they discard
# most rows (see the warnings below). Confirm these limits are intended.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  xlim(13, 70) +
  ylim(0, 1000) +
  geom_point(
    colour = "orange",
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    linetype = 2, color = "red"
  )
## Warning: Removed 81243 rows containing non-finite values (stat_summary).
## Warning: Removed 81243 rows containing non-finite values (stat_summary).
## Warning: Removed 81243 rows containing non-finite values (stat_summary).
## Warning: Removed 81243 rows containing non-finite values (stat_summary).
## Warning: Removed 81753 rows containing missing values (geom_point).
# Pearson correlation between web likes received and total likes received.
cor.test(
  x = pf$www_likes_received,
  y = pf$likes_received,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
# Same test through the formula interface, so the columns are looked up in pf.
cor.test(~ www_likes_received + likes_received, data = pf, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
# Pearson correlation of the two like counts for users aged 70 or younger.
cor.test(
  ~ www_likes_received + likes_received,
  data = pf,
  subset = age <= 70,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 928.32, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9504075 0.9516488
## sample estimates:
## cor
## 0.951032
# Spearman rank correlation of the two like counts, users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(www_likes_received, likes_received, method = "spearman")
)
## Warning in cor.test.default(www_likes_received, likes_received, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: www_likes_received and likes_received
## S = 9.6868e+12, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9229515
Simple plot (www_likes_received vs likes_received)
# Raw scatter plot of total likes against web likes.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point()

# Same plot with both axes capped at their 95th percentile, plus a red
# linear-model fit line.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
# The strong correlation is not surprising: one variable is a subset of the
# other.
cor.test(~ www_likes_received + likes_received, data = pf, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
#install.packages('alr3')
library(alr3)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Mitchell data set, open its help page and show the first 20 rows.
data("Mitchell")
help("Mitchell")
head(Mitchell, 20L)
## Month Temp
## 1 0 -5.18333
## 2 1 -1.65000
## 3 2 2.49444
## 4 3 10.40000
## 5 4 14.99440
## 6 5 21.71670
## 7 6 24.74440
## 8 7 24.07220
## 9 8 18.86110
## 10 9 9.16667
## 11 10 1.54444
## 12 11 -4.00556
## 13 12 -7.47778
## 14 13 -4.00556
## 15 14 3.00000
## 16 15 10.96110
## 17 16 18.02220
## 18 17 22.72780
## 19 18 26.26110
## 20 19 21.60560
# Temperature against month index.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point()
# Quick-plot alternative with the axes swapped:
# qplot(Temp, Month, data = Mitchell)

# Pearson correlation between month index and temperature.
cor.test(x = Mitchell$Month, y = Mitchell$Temp, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
# Rescaled temperature with an x-axis break every 12 months, as points ...
ggplot(Mitchell, aes(x = Month, y = Temp * 0.1)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 204, by = 12))

# ... and as a connected line.
ggplot(Mitchell, aes(x = Month, y = Temp * 0.1)) +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 204, by = 12))

# Fold the months onto a single 12-month cycle (Month modulo 12).
ggplot(Mitchell, aes(x = (Month%%12), y = Temp)) +
  geom_point()
# Age in fractional years: add to the whole-year age a fraction of a year
# derived from the birth month. Assuming dob_month runs from 1 to 12, a
# January birthday adds 11/12 and a December birthday adds 0.
# The parentheses matter: `12 - dob_month / 12` divides only dob_month by 12
# and would inflate every age by roughly 11 to 12 years.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)
# Equivalent form: pf$age + (1 - pf$dob_month / 12)
# To express the result in months instead of years:
# pf$age_with_months <- pf$age_with_months * 12
# Mean and median friend count plus group size for each distinct
# age_with_months value, sorted by that value.
pf.fc_by_age_wuth_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
Compare age and age with months. By increasing the bin size, we can estimate the mean more precisely, but we may miss important features of the relationship between age and friend count. This plot is an example of the bias-variance trade-off.
# Three views of mean friend count for ages below 71, at different bin
# widths.
# p1: one bin per age_with_months value.
p1 <- ggplot(
  subset(pf.fc_by_age_wuth_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p2: one bin per whole year of age.
p2 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line()

# p3: ages rounded to the nearest multiple of 5, with the mean computed by
# the summary stat.
p3 <- ggplot(
  subset(pf, age < 71),
  aes(x = round(age/5)*5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Stack the three plots in a single column for comparison.
grid.arrange(grobs = list(p1, p2, p3), ncol = 1)
A flexible statistical model can smooth our estimates of conditional means. ggplot2 makes it easy to fit such models using geom_smooth().
# Rebuild p1 and p2 with a smoother (geom_smooth defaults) drawn over each
# line, then stack them in one column.
p1 <- ggplot(
  subset(pf.fc_by_age_wuth_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

p2 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

grid.arrange(p1, p2, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'