# Lesson 5: Explore Two Variables

# Print the working directory; the relative data path below is read from it.
getwd()
## [1] "C:/Users/User/Desktop/LessonR/lesson4"
library(ggplot2)

# Read the tab-separated file into the data frame `pf`.
pf <- read.delim("pseudo_facebook.tsv")

# Scatterplot of friend_count against age, with qplot() arguments named ...
qplot(data = pf, x = age, y = friend_count)

# ... and the same call with x and y matched by position.
qplot(age, friend_count, data = pf)

# The same scatterplot built from explicit layers, x axis limited to 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Summary statistics of the age column.
summary(pf[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
# Scatterplot with each point drawn at alpha = 1/20; coord_trans() is called
# without arguments, so neither axis is transformed here.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans()
## Warning: Removed 4906 rows containing missing values (geom_point).

# Same semi-transparent scatterplot, with the y axis on a square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

# friendships_initiated against age on a square-root y axis. Points are
# jittered with height = 0, i.e. no vertical displacement.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_jitter(alpha = 1 / 10, position = position_jitter(height = 0)) +
  coord_trans(y = "sqrt")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group the rows of pf by age, then compute the mean and median friend_count
# and the number of rows per age, with the result ordered by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- arrange(
  summarise(
    age_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)

head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196
### Alternate code: per-age friend_count summary written as one pipeline
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# Show the first 20 rows of the summary.
head(pf.fc_by_age, n = 20)
## # A tibble: 20 x 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
##  1    13          164.7500                74.0   484
##  2    14          251.3901               132.0  1925
##  3    15          347.6921               161.0  2618
##  4    16          351.9371               171.5  3086
##  5    17          350.3006               156.0  3283
##  6    18          331.1663               162.0  5196
##  7    19          333.6921               157.0  4391
##  8    20          283.4991               135.0  3769
##  9    21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
## 11    23          202.8426                93.0  4404
## 12    24          185.7121                92.0  2827
## 13    25          131.0211                62.0  3641
## 14    26          144.0082                75.0  2815
## 15    27          134.1473                72.0  2240
## 16    28          125.8354                66.0  2364
## 17    29          120.8182                66.0  1936
## 18    30          115.2080                67.5  1716
## 19    31          118.4599                63.0  1694
## 20    32          114.2800                63.0  1443
# Line plot of the per-age mean friend count.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

# Raw friend_count vs. age scatter (orange, jittered with height = 0) with
# summary lines computed at each age: the mean (default line style) and the
# 10th, 50th and 90th percentiles (blue; 10th and 90th dashed).
#
# A plot holds a single coordinate system: adding coord_trans() after
# coord_cartesian() replaced the latter and silently discarded its xlim/ylim
# zoom. The limits are therefore given to coord_trans() directly, so the plot
# is both square-root scaled on y and zoomed to ages 13-70 / counts 0-1000.
#
# NOTE(review): limx/limy are used to match the ggplot2 generation this script
# was written for (it also relies on `fun.y`). On ggplot2 >= 3.3.0 the
# corresponding names are xlim/ylim and `fun` -- confirm against the installed
# version before upgrading.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 0.05,
             position = position_jitter(height = 0),
             color = "orange") +
  coord_trans(y = "sqrt", limx = c(13, 70), limy = c(0, 1000)) +
  geom_line(stat = "summary", fun.y = mean) +
  geom_line(stat = "summary", fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = "blue") +
  geom_line(stat = "summary", fun.y = quantile, fun.args = list(probs = .5),
            color = "blue") +
  geom_line(stat = "summary", fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = "blue")

# Open the help page for cor.test().
help("cor.test")
## starting httpd help server ...
##  done
# Pearson correlation test between age and friend_count over all rows of pf.
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Pearson correlation of age and friend_count, evaluated inside pf via with()
# so the columns can be referenced without the `pf$` prefix.
with(pf, cor.test(x = age, y = friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Pearson correlation of age and friend_count for rows with age <= 70.
# `method` must be an argument of cor.test(); it was previously passed to
# with(), which ignores extra arguments, so the test only happened to be
# Pearson because that is cor.test()'s default.
with(subset(pf, age <= 70), cor.test(age, friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245
# Spearman rank correlation of age and friend_count for rows with age <= 70.
# `method` must be an argument of cor.test(); it was previously passed to
# with(), which ignores extra arguments, so the default Pearson test ran.
# NOTE: the output recorded below predates this fix (it is the Pearson result)
# and has to be regenerated by re-running the script.
with(subset(pf, age <= 70), cor.test(age, friend_count, method = "spearman"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245
# likes_received against www_likes_received, all rows.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point()

# Same plot with each axis limited to [0, 95th percentile] and a red linear
# fit. xlim()/ylim() remove the rows outside those limits, which is what the
# "Removed ... rows" warnings recorded below report for both layers.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

# Correlation test between the two like counts (cor.test()'s default method).
cor.test(x = pf$www_likes_received, y = pf$likes_received)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902
library(alr3)
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Mitchell data set, open its help page and preview the first rows.
data("Mitchell")
help("Mitchell")
head(Mitchell)
##   Month     Temp
## 1     0 -5.18333
## 2     1 -1.65000
## 3     2  2.49444
## 4     3 10.40000
## 5     4 14.99440
## 6     5 21.71670
# Temp against Month, drawn with ggplot() ...
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

# ... and the same scatterplot via qplot().
qplot(x = Month, y = Temp, data = Mitchell)

# Same scatterplot with an x-axis break every 12 months.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))

# Temp against Month modulo 12, overlaying every 12-month cycle.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()

# Line plot of the per-age mean friend count.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

# First 10 rows of the per-age summary.
head(pf.fc_by_age, n = 10)
## # A tibble: 10 x 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
##  1    13          164.7500                74.0   484
##  2    14          251.3901               132.0  1925
##  3    15          347.6921               161.0  2618
##  4    16          351.9371               171.5  3086
##  5    17          350.3006               156.0  3283
##  6    18          331.1663               162.0  5196
##  7    19          333.6921               157.0  4391
##  8    20          283.4991               135.0  3769
##  9    21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
# Rows 17 through 19 of the per-age summary.
pf.fc_by_age[seq(17, 19), ]
## # A tibble: 3 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    29          120.8182                66.0  1936
## 2    30          115.2080                67.5  1716
## 3    31          118.4599                63.0  1694
# Fractional age: age plus (12 - dob_month) / 12 of a year.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

# Programming Assignment

# Group pf by age_with_months, then compute the mean and median friend_count
# and the number of rows per group, ordered by age_with_months.
age_month_groups <- group_by(pf, age_with_months)
pf.fc_by_age_month <- arrange(
  summarise(
    age_month_groups,
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)

head(pf.fc_by_age_month)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1        13.16667          46.33333                30.5     6
## 2        13.25000         115.07143                23.5    14
## 3        13.33333         136.20000                44.0    25
## 4        13.41667         164.24242                72.0    33
## 5        13.50000         131.17778                66.0    45
## 6        13.58333         156.81481                64.0    54
### Alternate code: friend_count summary per age_with_months as one pipeline
pf.fc_by_age_month_alt <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

head(pf.fc_by_age_month_alt)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1        13.16667          46.33333                30.5     6
## 2        13.25000         115.07143                23.5    14
## 3        13.33333         136.20000                44.0    25
## 4        13.41667         164.24242                72.0    33
## 5        13.50000         131.17778                66.0    45
## 6        13.58333         156.81481                64.0    54
# Mean friend count against age_with_months, for age_with_months below 71.
ggplot(
  data = subset(pf.fc_by_age_month, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# p1: mean friend count per whole year of age (< 71), with a smoother on top.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p2: the same plot using age_with_months (< 71) on the x axis.
p2 <- ggplot(
  data = subset(pf.fc_by_age_month, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# p3: mean friend_count with age rounded to the nearest multiple of 5,
# computed from the raw rows of pf with age < 71.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the three plots in a single column.
grid.arrange(grobs = list(p1, p2, p3), ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'