# Print the working directory so the relative data path below can be checked.
getwd()
## [1] "C:/Users/User/Desktop/LessonR/lesson4"
library(ggplot2)

# Read the tab-separated data file into the data frame `pf`.
pf <- read.delim("pseudo_facebook.tsv")

# Scatter plot of friend_count against age, first with named arguments ...
qplot(data = pf, x = age, y = friend_count)
# ... and once more with positional arguments.
qplot(age, friend_count, data = pf)

# The same scatter plot built with ggplot(), x axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Summary statistics of the age column.
summary(pf[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00

# Scatter plot with points drawn at 1/20 opacity to reduce overplotting.
# coord_trans() is called without arguments, i.e. with identity transforms.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans()
## Warning: Removed 4906 rows containing missing values (geom_point).
# Same transparent scatter plot, with the y axis on a square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).
# Friendships initiated versus age on a square-root y axis.
# Vertical jitter is set to 0 so points are only spread along the x axis.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_jitter(alpha = 1 / 10, position = position_jitter(height = 0)) +
  coord_trans(y = "sqrt")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Summarise friend_count per age: mean, median and number of rows in each
# group, ordered by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)
head(pf.fc_by_age)
## # A tibble: 6 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
### Alternate code: the same per-age summary written as a single pipeline
# Rebuild the per-age summary (mean, median, row count) with a %>% pipeline
# and show its first 20 rows.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age, n = 20)
## # A tibble: 20 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
## 11 23 202.8426 93.0 4404
## 12 24 185.7121 92.0 2827
## 13 25 131.0211 62.0 3641
## 14 26 144.0082 75.0 2815
## 15 27 134.1473 72.0 2240
## 16 28 125.8354 66.0 2364
## 17 29 120.8182 66.0 1936
## 18 30 115.2080 67.5 1716
## 19 31 118.4599 63.0 1694
## 20 32 114.2800 63.0 1443
# Line plot of mean friend count by age, from the per-age summary table.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
# Friend count versus age (orange, horizontally jittered points) with summary
# lines overlaid: the mean (black), the median (solid blue) and the 10th/90th
# percentiles (dashed blue). The y axis is square-root transformed and the
# view is limited to ages 13-70 and friend counts 0-1000.
#
# A plot holds a single coordinate system: adding coord_trans() after
# coord_cartesian() replaces it and silently discards the axis limits, so the
# limits are given to coord_trans() directly (xlim/ylim arguments, available
# from ggplot2 3.3.0). The summary function is passed as `fun`; `fun.y` was
# deprecated in ggplot2 3.3.0.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 0.05,
             position = position_jitter(height = 0),
             color = 'orange') +
  coord_trans(y = 'sqrt', xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_line(stat = 'summary', fun = mean) +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = .5),
            color = 'blue') +
  geom_line(stat = 'summary', fun = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue')
# Open the help page for cor.test().
?cor.test
## starting httpd help server ...
## done
# Pearson correlation test between age and friend count over all rows of pf.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Same Pearson test, evaluated inside pf via with() so that the columns can be
# referred to without the pf$ prefix.
with(
  pf,
  cor.test(age, friend_count, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Pearson correlation test restricted to rows with age <= 70.
# `method` has to be an argument of cor.test(): with() silently ignores extra
# arguments, so passing it there only worked because Pearson is the default.
with(subset(pf, age <= 70), cor.test(age, friend_count, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
# Spearman rank correlation test restricted to rows with age <= 70.
# `method` has to be an argument of cor.test(): with() silently ignores extra
# arguments, so the earlier call fell back to the default Pearson test.
with(subset(pf, age <= 70), cor.test(age, friend_count, method = "spearman"))
## NOTE: the output previously recorded here was "Pearson's product-moment
## correlation" (cor = -0.1717245), identical to the Pearson call above,
## because method = "spearman" never reached cor.test().
## Re-run this call to record the Spearman result.
# likes_received plotted against www_likes_received for all rows.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point()

# Same plot with both axes cut at the 95th percentile of their variable and a
# red linear-model fit added.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

# Correlation test between the two like counts (default method).
cor.test(x = pf$www_likes_received, y = pf$likes_received)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
library(alr3)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Mitchell data set (presumably provided by alr3, attached above),
# open its help page and show the first six rows.
data("Mitchell")
?Mitchell
head(Mitchell, n = 6)
## Month Temp
## 1 0 -5.18333
## 2 1 -1.65000
## 3 2 2.49444
## 4 3 10.40000
## 5 4 14.99440
## 6 5 21.71670
# Temp against Month, drawn with ggplot() and then with qplot().
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
qplot(Month, Temp, data = Mitchell)

# Same scatter plot with an x-axis break every 12 months.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))

# Overlay all years by plotting Temp against Month modulo 12.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()
# Mean friend count by age as a line, followed by the first ten summary rows.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
head(pf.fc_by_age, n = 10)
## # A tibble: 10 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
# Rows 17 to 19 of the per-age summary (ages 29-31 in the output below).
pf.fc_by_age[seq(17, 19), ]
## # A tibble: 3 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 29 120.8182 66.0 1936
## 2 30 115.2080 67.5 1716
## 3 31 118.4599 63.0 1694

# Fractional age: age plus (12 - dob_month) / 12.
# NOTE(review): assumes dob_month is the birth month as a number 1-12 - confirm.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)
### Programming Assignment
# Summarise friend_count per value of age_with_months: mean, median and number
# of rows in each group, ordered by age_with_months.
age_month_groups <- group_by(pf, age_with_months)
pf.fc_by_age_month <- arrange(
  summarise(
    age_month_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)
head(pf.fc_by_age_month)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
### Alternate code: the same per-month summary written as a single pipeline
# Build the age_with_months summary (mean, median, row count) with a %>%
# pipeline and show its first rows.
pf.fc_by_age_month_alt <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
head(pf.fc_by_age_month_alt)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
# Mean friend count by age_with_months, for values below 71.
ggplot(
  data = subset(pf.fc_by_age_month, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p1: mean friend count by whole-year age (< 71) with a smoother on top.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: the same for age_with_months (< 71).
p2 <- ggplot(
  data = subset(pf.fc_by_age_month, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# p3: mean friend count for rows with age < 71, with age rounded to the
# nearest multiple of 5 on the x axis.
# The summary function is passed as `fun`; `fun.y` was deprecated in
# ggplot2 3.3.0.
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
             data = subset(pf, age < 71)) +
  geom_line(stat = 'summary', fun = mean)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Stack the three plots vertically in a single column.
grid.arrange(grobs = list(p1, p2, p3), ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'