Import csv file into RStudio
getwd()
## [1] "C:/Users/sebas/OneDrive/Desktop/BANA270_Midterm_Kathleen Sebastian"
setwd("C:/Users/sebas/OneDrive/Desktop/BANA270_Midterm_Kathleen Sebastian")
df <- read.csv("HighNote Data Midterm.csv")
head(df)
## ID age male friend_cnt avg_friend_age avg_friend_male friend_country_cnt
## 1 1 22 0 8 22.57143 0.4285714 1
## 2 2 35 0 2 28.00000 1.0000000 2
## 3 3 27 1 2 23.00000 1.0000000 1
## 4 4 21 0 28 22.94737 0.5000000 7
## 5 5 24 0 65 22.28302 0.9137931 9
## 6 6 21 1 12 25.00000 0.7777778 1
## subscriber_friend_cnt songsListened lovedTracks posts playlists shouts
## 1 0 9687 194 0 1 8
## 2 0 0 0 0 0 0
## 3 0 508 0 0 1 2
## 4 1 1357 32 0 0 1
## 5 0 89984 20 2 0 81
## 6 0 124547 10 0 1 2
## adopter tenure good_country
## 1 0 59 1
## 2 0 35 0
## 3 0 42 0
## 4 0 25 0
## 5 0 67 0
## 6 0 53 1
Install and/or load necessary packages for data analysis
library(ggplot2)
library(ggcorrplot)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MatchIt)
library(tidyverse)
## -- Attaching packages ------------------------------------------ tidyverse 1.3.0 --
## v tibble 2.1.3 v purrr 0.3.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(purrr)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggthemes)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(wesanderson)
library(e1071)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
Generate descriptive statisticsfor HighNote dataset by adopter variable
describe.by(df, group=df$adopter)
## Warning: describe.by is deprecated. Please use the describeBy function
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad
## ID 1 40300 20150.50 11633.75 20150.50 20150.50 14937.19
## age 2 40300 23.95 6.37 23.00 23.09 4.45
## male 3 40300 0.62 0.48 1.00 0.65 0.00
## friend_cnt 4 40300 18.49 57.48 7.00 10.28 7.41
## avg_friend_age 5 40300 24.01 5.10 23.00 23.40 3.95
## avg_friend_male 6 40300 0.62 0.32 0.67 0.65 0.35
## friend_country_cnt 7 40300 3.96 5.76 2.00 2.66 1.48
## subscriber_friend_cnt 8 40300 0.42 2.42 0.00 0.13 0.00
## songsListened 9 40300 17589.44 28416.02 7440.00 11817.64 10576.87
## lovedTracks 10 40300 86.82 263.58 14.00 36.35 20.76
## posts 11 40300 5.29 104.31 0.00 0.23 0.00
## playlists 12 40300 0.55 1.07 0.00 0.45 0.00
## shouts 13 40300 29.97 150.69 4.00 8.84 4.45
## adopter 14 40300 0.00 0.00 0.00 0.00 0.00
## tenure 15 40300 43.81 19.79 44.00 43.72 22.24
## good_country 16 40300 0.36 0.48 0.00 0.32 0.00
## min max range skew kurtosis se
## ID 1 40300 40299 0.00 -1.20 57.95
## age 8 79 71 1.97 6.80 0.03
## male 0 1 1 -0.50 -1.75 0.00
## friend_cnt 1 4957 4956 32.67 2087.42 0.29
## avg_friend_age 8 77 69 1.84 7.15 0.03
## avg_friend_male 0 1 1 -0.52 -0.72 0.00
## friend_country_cnt 0 129 129 4.74 38.29 0.03
## subscriber_friend_cnt 0 309 309 72.19 8024.62 0.01
## songsListened 0 1000000 1000000 6.05 105.85 141.55
## lovedTracks 0 12522 12522 13.12 335.93 1.31
## posts 0 12309 12309 73.92 7005.34 0.52
## playlists 0 98 98 28.21 1945.28 0.01
## shouts 0 7736 7736 22.53 779.12 0.75
## adopter 0 0 0 NaN NaN 0.00
## tenure 1 111 110 0.05 -0.70 0.10
## good_country 0 1 1 0.59 -1.65 0.00
## ------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad
## ID 1 3527 42064.00 1018.30 42064.00 42064.00 1307.65
## age 2 3527 25.98 6.84 24.00 25.05 4.45
## male 3 3527 0.73 0.44 1.00 0.79 0.00
## friend_cnt 4 3527 39.73 117.27 16.00 23.69 17.79
## avg_friend_age 5 3527 25.44 5.21 24.36 24.83 3.91
## avg_friend_male 6 3527 0.64 0.25 0.67 0.65 0.25
## friend_country_cnt 7 3527 7.19 8.86 4.00 5.36 4.45
## subscriber_friend_cnt 8 3527 1.64 5.85 0.00 0.84 0.00
## songsListened 9 3527 33758.04 43592.73 20908.00 25811.69 23276.82
## lovedTracks 10 3527 264.34 491.43 108.00 161.68 140.85
## posts 11 3527 21.20 221.99 0.00 1.44 0.00
## playlists 12 3527 0.90 2.56 1.00 0.59 1.48
## shouts 13 3527 99.44 1156.07 9.00 23.89 11.86
## adopter 14 3527 1.00 0.00 1.00 1.00 0.00
## tenure 15 3527 45.58 20.04 46.00 45.60 20.76
## good_country 16 3527 0.29 0.45 0.00 0.23 0.00
## min max range skew kurtosis se
## ID 40301 43827 3526 0.00 -1.20 17.15
## age 8 73 65 1.68 4.39 0.12
## male 0 1 1 -1.03 -0.94 0.01
## friend_cnt 1 5089 5088 26.04 1013.79 1.97
## avg_friend_age 12 62 50 1.68 5.05 0.09
## avg_friend_male 0 1 1 -0.54 -0.05 0.00
## friend_country_cnt 0 136 136 3.61 24.53 0.15
## subscriber_friend_cnt 0 287 287 34.05 1609.52 0.10
## songsListened 0 817290 817290 4.71 46.64 734.03
## lovedTracks 0 10220 10220 6.52 80.96 8.27
## posts 0 8506 8506 26.52 852.38 3.74
## playlists 0 118 118 28.84 1244.31 0.04
## shouts 0 65872 65872 52.52 2969.09 19.47
## adopter 1 1 0 NaN NaN 0.00
## tenure 0 111 111 0.02 -0.62 0.34
## good_country 0 1 1 0.94 -1.12 0.01
Conduct t-test to identify differences in means, selecting ONLY relevant variables from the dataset
lapply(df[,c('age','male' , 'friend_cnt' , 'avg_friend_male' ,'avg_friend_age',
'friend_country_cnt' , 'songsListened' , 'lovedTracks' ,
'posts' , 'playlists' ,'shouts' , 'tenure' ,'good_country', 'subscriber_friend_cnt')], function(a) t.test(a ~ df$adopter))
## $age
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -16.996, df = 4079.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.265768 -1.797097
## sample estimates:
## mean in group 0 mean in group 1
## 23.94844 25.97987
##
##
## $male
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -13.654, df = 4295, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.12278707 -0.09195413
## sample estimates:
## mean in group 0 mean in group 1
## 0.6218610 0.7292316
##
##
## $friend_cnt
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -10.646, df = 3675.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25.15422 -17.32999
## sample estimates:
## mean in group 0 mean in group 1
## 18.49166 39.73377
##
##
## $avg_friend_male
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -4.4426, df = 4591.6, p-value = 9.097e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02883955 -0.01117951
## sample estimates:
## mean in group 0 mean in group 1
## 0.6165888 0.6365983
##
##
## $avg_friend_age
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -15.658, df = 4140.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.608931 -1.250852
## sample estimates:
## mean in group 0 mean in group 1
## 24.01142 25.44131
##
##
## $friend_country_cnt
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -21.267, df = 3791.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.528795 -2.933081
## sample estimates:
## mean in group 0 mean in group 1
## 3.957891 7.188829
##
##
## $songsListened
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -21.629, df = 3792.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -17634.24 -14702.96
## sample estimates:
## mean in group 0 mean in group 1
## 17589.44 33758.04
##
##
## $lovedTracks
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -21.188, df = 3705.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -193.9447 -161.0917
## sample estimates:
## mean in group 0 mean in group 1
## 86.82263 264.34080
##
##
## $posts
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -4.2151, df = 3663.5, p-value = 2.557e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.30665 -8.50825
## sample estimates:
## mean in group 0 mean in group 1
## 5.293002 21.200454
##
##
## $playlists
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -8.0816, df = 3634.7, p-value = 8.619e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4367565 -0.2662138
## sample estimates:
## mean in group 0 mean in group 1
## 0.5492804 0.9007655
##
##
## $shouts
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -3.5659, df = 3536.5, p-value = 0.0003674
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -107.66170 -31.27249
## sample estimates:
## mean in group 0 mean in group 1
## 29.97266 99.43975
##
##
## $tenure
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -5.0434, df = 4150.6, p-value = 4.768e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.462620 -1.083959
## sample estimates:
## mean in group 0 mean in group 1
## 43.80993 45.58322
##
##
## $good_country
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = 8.8009, df = 4248.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05463587 0.08595434
## sample estimates:
## mean in group 0 mean in group 1
## 0.3577916 0.2874965
##
##
## $subscriber_friend_cnt
##
## Welch Two Sample t-test
##
## data: a by df$adopter
## t = -12.287, df = 3632.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.413899 -1.024766
## sample estimates:
## mean in group 0 mean in group 1
## 0.417469 1.636802
Generate relevant visualizations pertaining to the original HighNote dataset Correlation Matrix
corr <- round(cor(df), 1)
ggcorrplot(corr, hc.order = TRUE, type = "lower",
lab = TRUE)
Create new dataframe for visualizations purposes, converting adopter variable to a character
df1 <- df
df1$adopter <- as.character(df1$adopter)
VISUALIZATIONS BY DEMOGRAPHIC DATA age
ggplot(df1, aes(x=adopter, y=age, group=adopter, fill=adopter,)) + geom_boxplot() +
labs(title="Boxplot of Adopter vs. Non-Adopter by Age",x="Adopter", y = "Age") +
scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest1"))
male
ggplot(df1, aes(x=male, group=adopter, fill=adopter)) + geom_bar(aes(fill=adopter))+
labs(title="Barchart of Adopter vs. Non-Adopter by Gender",x="Gender") +
scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest1"))
good_country
ggplot(df1, aes(x=good_country, group=adopter, fill=adopter)) + geom_bar(aes(fill=adopter))+
labs(title="Barchart of Adopter vs. Non-Adopter by Good Country",x="Good Country") +
scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest1"))
VISUALIZATIONS BY PEER INFLUENCE friend_cnt
ggplot(df1, aes(x=age, y=friend_cnt, col=adopter)) + geom_point(alpha=0.5) +
scale_fill_manual(values=wes_palette(name="GrandBudapest1")) + labs(title="Scatter of Age vs. Friend Count by Adopter vs. Non Adopter")
avg_friend_age
ggplot(df1,aes(x=avg_friend_age,group=adopter,fill=adopter))+
geom_histogram(position="identity",alpha=0.8,binwidth=1.0)+labs(title="Histogram of Average Friend Age by Adopter vs. Non.Adopter")+
scale_fill_manual(values=wes_palette(name="GrandBudapest1"))
avg_friend_male
ggplot(df1,aes(x=avg_friend_age,group=adopter,fill=adopter))+
geom_histogram(position="identity",alpha=0.8,binwidth=1.0)+labs(title="Histogram of Average Friend Age by Adopter vs. Non.Adopter")+
scale_fill_manual(values=wes_palette(name="GrandBudapest1"))
friend_country_cnt
ggplot(df1, aes(x=adopter, y=friend_country_cnt)) +
geom_bar(aes(fill = adopter), position = "dodge", stat="identity") +
scale_fill_manual(values=wes_palette(name="GrandBudapest1")) +
labs(title="Barchart for Friend Country Count by Adopter vs. Non-Adopter")
subscriber_friend_cnt
avg_subs_friend_cnt <- df1 %>%
group_by(adopter)%>%
summarise(subscriber_friend_cnt=mean(subscriber_friend_cnt))
ggplot(avg_subs_friend_cnt, aes(x=avg_subs_friend_cnt$adopter, y=avg_subs_friend_cnt$subscriber_friend_cnt)) +
geom_bar(aes(fill = adopter), position = "dodge", stat="identity") +
scale_fill_manual(values=wes_palette(name="GrandBudapest1")) +
labs(title="Barchart for Average Subscriber Friend Count by Adopter vs. Non-Adopter",x="Adopter",y="Average Subscriber Friend Count")
Pre-analysis using non-matched data DUMMY VARIABLE CREATION: Group to create dataframe by subscriber variable count variable, where if subscriber_friend_cnt >- 1, then this is 1 or the treatment group, and 0 if subscriber subscriber_friend_cnt = 0 or the control group
df$subscriber_friend_cnt <- ifelse(df$subscriber_friend_cnt >=1,1,0)
with(df, t.test(subscriber_friend_cnt ~ adopter))
##
## Welch Two Sample t-test
##
## data: subscriber_friend_cnt by adopter
## t = -33.978, df = 3931.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3109641 -0.2770354
## sample estimates:
## mean in group 0 mean in group 1
## 0.2004715 0.4944712
Difference in means for pre-treatment covarities
df_cov <- c('age','male' , 'friend_cnt' , 'avg_friend_male' ,'avg_friend_age',
'friend_country_cnt' , 'songsListened' , 'lovedTracks' ,
'posts' , 'playlists' ,'shouts' , 'tenure' ,'good_country')
diff_in_means_covariteis <- df %>%
group_by(adopter) %>%
select(one_of(df_cov)) %>%
summarise_all(funs(mean(., na.rm = T)))
## Adding missing grouping variables: `adopter`
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
Test whether means are significant
with(df, t.test(age ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: age by subscriber_friend_cnt
## t = -20.841, df = 14645, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.778544 -1.472749
## sample estimates:
## mean in group 0 mean in group 1
## 23.74756 25.37321
with(df, t.test(friend_cnt ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: friend_cnt by subscriber_friend_cnt
## t = -33.707, df = 9903.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -46.12459 -41.05469
## sample estimates:
## mean in group 0 mean in group 1
## 10.43133 54.02097
with(df, t.test(male ~ subscriber_friend_cnt))
##
## Welch Two Sample t-test
##
## data: male by subscriber_friend_cnt
## t = -1.3459, df = 15986, p-value = 0.1784
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.018236129 0.003388028
## sample estimates:
## mean in group 0 mean in group 1
## 0.6288378 0.6362618
with(df, t.test(tenure ~ subscriber_friend_cnt))
##
## Welch Two Sample t-test
##
## data: tenure by subscriber_friend_cnt
## t = -14.696, df = 15805, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.792309 -2.899752
## sample estimates:
## mean in group 0 mean in group 1
## 43.20268 46.54871
with(df, t.test(good_country ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: good_country by subscriber_friend_cnt
## t = 2.0956, df = 16030, p-value = 0.03613
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0007383591 0.0220968020
## sample estimates:
## mean in group 0 mean in group 1
## 0.3546936 0.3432760
with(df, t.test(songsListened ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: songsListened by subscriber_friend_cnt
## t = -41.505, df = 11447, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -20037.04 -18229.80
## sample estimates:
## mean in group 0 mean in group 1
## 14602.22 33735.64
with(df, t.test(lovedTracks ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: lovedTracks by subscriber_friend_cnt
## t = -31.265, df = 10585, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -170.1918 -150.1102
## sample estimates:
## mean in group 0 mean in group 1
## 65.21365 225.36465
with(df, t.test(posts ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: posts by subscriber_friend_cnt
## t = -7.3649, df = 9933.6, p-value = 1.914e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -22.76492 -13.19424
## sample estimates:
## mean in group 0 mean in group 1
## 2.543377 20.522956
with(df, t.test(tenure ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: tenure by subscriber_friend_cnt
## t = -14.696, df = 15805, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.792309 -2.899752
## sample estimates:
## mean in group 0 mean in group 1
## 43.20268 46.54871
with(df, t.test(avg_friend_male ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: avg_friend_male by subscriber_friend_cnt
## t = -7.7114, df = 23020, p-value = 1.294e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02846397 -0.01692672
## sample estimates:
## mean in group 0 mean in group 1
## 0.6131124 0.6358077
with(df, t.test(avg_friend_age ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: avg_friend_age by subscriber_friend_cnt
## t = -27.658, df = 15667, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.744514 -1.513611
## sample estimates:
## mean in group 0 mean in group 1
## 23.76137 25.39043
with(df, t.test(friend_country_cnt ~ subscriber_friend_cnt ))
##
## Welch Two Sample t-test
##
## data: friend_country_cnt by subscriber_friend_cnt
## t = -65.05, df = 10372, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.861271 -6.459857
## sample estimates:
## mean in group 0 mean in group 1
## 2.725062 9.385626
Propensity Score Matching; given all means significant for all, use all features related to the outcome variable
hn_ps <- glm(subscriber_friend_cnt ~ age + male + good_country + avg_friend_age + avg_friend_male + friend_country_cnt + songsListened +
lovedTracks + posts + playlists + shouts + tenure + friend_cnt,
family = binomial(), data = df)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(hn_ps)
##
## Call:
## glm(formula = subscriber_friend_cnt ~ age + male + good_country +
## avg_friend_age + avg_friend_male + friend_country_cnt + songsListened +
## lovedTracks + posts + playlists + shouts + tenure + friend_cnt,
## family = binomial(), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.4206 -0.5671 -0.4220 -0.3001 2.5619
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.144e+00 7.703e-02 -66.782 < 2e-16 ***
## age 1.970e-02 2.808e-03 7.015 2.30e-12 ***
## male 4.311e-02 2.998e-02 1.438 0.150419
## good_country 3.201e-02 2.922e-02 1.096 0.273235
## avg_friend_age 7.955e-02 3.481e-03 22.850 < 2e-16 ***
## avg_friend_male 2.514e-01 5.029e-02 4.999 5.75e-07 ***
## friend_country_cnt 1.110e-01 4.765e-03 23.302 < 2e-16 ***
## songsListened 6.906e-06 5.156e-07 13.396 < 2e-16 ***
## lovedTracks 6.671e-04 5.645e-05 11.817 < 2e-16 ***
## posts 5.699e-04 2.682e-04 2.125 0.033613 *
## playlists 5.639e-03 1.190e-02 0.474 0.635530
## shouts -4.909e-05 3.707e-05 -1.324 0.185434
## tenure -2.571e-03 7.769e-04 -3.309 0.000935 ***
## friend_cnt 3.132e-02 1.034e-03 30.301 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 46640 on 43826 degrees of freedom
## Residual deviance: 34170 on 43813 degrees of freedom
## AIC: 34198
##
## Number of Fisher Scoring iterations: 7
Use model to calculate propensity score and generate new dataframe
ps_df <- data.frame(pr_score = predict(hn_ps, type = "response"),
subscriber_friend_cnt = hn_ps$model$subscriber_friend_cnt)
head(ps_df)
## pr_score subscriber_friend_cnt
## 1 0.08597334 0
## 2 0.14417767 0
## 3 0.08217010 0
## 4 0.23894067 1
## 5 0.69552208 0
## 6 0.22306633 0
Conduct examiniation of common support by creating a histogram to plot the respective propensity scores by treatment status
labs <- paste("HighNote User Type:", c("Treatment Group", "Control Group"))
ps_df %>%
mutate(adopter = ifelse(subscriber_friend_cnt == 1, labs[1], labs[2])) %>%
ggplot(aes(x = pr_score)) +
geom_histogram(fill = "plum", color = "black") +
facet_wrap(~adopter) +
xlab("Probability of User Being in Treatment Group") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Use matchit package to identify pairs of observations with similar propensity scores but are distinct in treatment status.
df_nomissingvals <- df %>% select(subscriber_friend_cnt, adopter, one_of(df_cov)) %>%
na.omit()
head(df_nomissingvals)
## subscriber_friend_cnt adopter age male friend_cnt avg_friend_male
## 1 0 0 22 0 8 0.4285714
## 2 0 0 35 0 2 1.0000000
## 3 0 0 27 1 2 1.0000000
## 4 1 0 21 0 28 0.5000000
## 5 0 0 24 0 65 0.9137931
## 6 0 0 21 1 12 0.7777778
## avg_friend_age friend_country_cnt songsListened lovedTracks posts playlists
## 1 22.57143 1 9687 194 0 1
## 2 28.00000 2 0 0 0 0
## 3 23.00000 1 508 0 0 1
## 4 22.94737 7 1357 32 0 0
## 5 22.28302 9 89984 20 2 0
## 6 25.00000 1 124547 10 0 1
## shouts tenure good_country
## 1 8 59 1
## 2 0 35 0
## 3 2 42 0
## 4 1 25 0
## 5 81 67 0
## 6 2 53 1
mod_match <- matchit(subscriber_friend_cnt ~ age + male + good_country + avg_friend_age + avg_friend_male + friend_country_cnt + songsListened +
lovedTracks + posts + playlists + shouts + tenure + friend_cnt,
method = "nearest", data = df_nomissingvals)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod_match)
##
## Call:
## matchit(formula = subscriber_friend_cnt ~ age + male + good_country +
## avg_friend_age + avg_friend_male + friend_country_cnt + songsListened +
## lovedTracks + posts + playlists + shouts + tenure + friend_cnt,
## data = df_nomissingvals, method = "nearest")
##
## Summary of balance for all data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.4635 0.1550 0.1436 0.3086 0.2506
## age 25.3732 23.7476 6.2245 1.6256 1.0000
## male 0.6363 0.6288 0.4831 0.0074 0.0000
## good_country 0.3433 0.3547 0.4784 -0.0114 0.0000
## avg_friend_age 25.3904 23.7614 5.0577 1.6291 1.5909
## avg_friend_male 0.6358 0.6131 0.3343 0.0227 0.0738
## friend_country_cnt 9.3856 2.7251 3.1024 6.6606 5.0000
## songsListened 33735.6404 14602.2205 23214.2898 19133.4199 15471.0000
## lovedTracks 225.3647 65.2137 181.4812 160.1510 65.0000
## posts 20.5230 2.5434 33.7947 17.9796 0.0000
## playlists 0.7441 0.5295 0.9673 0.2146 0.0000
## shouts 101.8195 16.4230 79.7381 85.3965 15.0000
## tenure 46.5487 43.2027 19.7212 3.3460 3.0000
## friend_cnt 54.0210 10.4313 15.2769 43.5896 22.0000
## eQQ Mean eQQ Max
## distance 0.3086 0.6840
## age 1.6296 5.0000
## male 0.0074 1.0000
## good_country 0.0114 1.0000
## avg_friend_age 1.6369 11.5000
## avg_friend_male 0.0958 0.3636
## friend_country_cnt 6.6598 95.0000
## songsListened 19126.1623 653702.0000
## lovedTracks 159.9562 6343.0000
## posts 17.8829 9535.0000
## playlists 0.2092 26.0000
## shouts 85.1764 59168.0000
## tenure 3.3473 10.0000
## friend_cnt 43.5838 4794.0000
##
##
## Summary of balance for matched data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.4635 0.3040 0.1913 0.1596 0.1077
## age 25.3732 26.3324 7.9056 -0.9592 1.0000
## male 0.6363 0.6576 0.4745 -0.0214 0.0000
## good_country 0.3433 0.3581 0.4795 -0.0149 0.0000
## avg_friend_age 25.3904 26.5572 6.7320 -1.1668 0.4376
## avg_friend_male 0.6358 0.6551 0.2643 -0.0193 0.0158
## friend_country_cnt 9.3856 5.0914 4.6473 4.2942 2.0000
## songsListened 33735.6404 27360.8630 33892.7804 6374.7775 4680.0000
## lovedTracks 225.3647 134.5440 299.1995 90.8206 38.0000
## posts 20.5230 6.2773 60.2598 14.2456 0.0000
## playlists 0.7441 0.6723 1.4015 0.0718 0.0000
## shouts 101.8195 37.2362 138.8781 64.5833 10.0000
## tenure 46.5487 47.7039 19.0357 -1.1551 1.0000
## friend_cnt 54.0210 21.4666 23.5251 32.5544 12.0000
## eQQ Mean eQQ Max
## distance 0.1596 0.4517
## age 0.9592 7.0000
## male 0.0214 1.0000
## good_country 0.0149 1.0000
## avg_friend_age 1.2763 14.0000
## avg_friend_male 0.0326 0.1602
## friend_country_cnt 4.2942 95.0000
## songsListened 6374.7775 566867.0000
## lovedTracks 90.8206 6180.0000
## posts 14.2456 9535.0000
## playlists 0.1035 22.0000
## shouts 64.5833 59168.0000
## tenure 1.2995 4.0000
## friend_cnt 32.5544 4794.0000
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance 48.2930 57.0083 48.2908 33.9658
## age 40.9972 0.0000 41.1419 -40.0000
## male -187.9614 0.0000 -187.6712 0.0000
## good_country -30.1771 0.0000 -30.3571 0.0000
## avg_friend_age 28.3760 72.4916 22.0309 -21.7391
## avg_friend_male 14.7957 78.6165 65.9532 55.9466
## friend_country_cnt 35.5279 60.0000 35.5203 0.0000
## songsListened 66.6825 69.7499 66.6699 13.2836
## lovedTracks 43.2906 41.5385 43.2216 2.5698
## posts 20.7676 0.0000 20.3394 0.0000
## playlists 66.5567 0.0000 50.5109 15.3846
## shouts 24.3724 33.3333 24.1770 0.0000
## tenure 65.4771 66.6667 61.1782 60.0000
## friend_cnt 25.3162 45.4545 25.3062 0.0000
##
## Sample sizes:
## Control Treated
## All 34004 9823
## Matched 9823 9823
## Unmatched 24181 0
## Discarded 0 0
plot(mod_match)
plot(mod_match,type="hist")
Generate a new dataframe containing matched values
matched_df <- match.data(mod_match)
head(matched_df)
## subscriber_friend_cnt adopter age male friend_cnt avg_friend_male
## 1 0 0 22 0 8 0.4285714
## 4 1 0 21 0 28 0.5000000
## 5 0 0 24 0 65 0.9137931
## 6 0 0 21 1 12 0.7777778
## 7 0 0 20 0 15 0.6363636
## 8 1 0 23 1 57 0.5208333
## avg_friend_age friend_country_cnt songsListened lovedTracks posts playlists
## 1 22.57143 1 9687 194 0 1
## 4 22.94737 7 1357 32 0 0
## 5 22.28302 9 89984 20 2 0
## 6 25.00000 1 124547 10 0 1
## 7 22.00000 1 24852 391 6 1
## 8 23.63636 14 99877 125 89 1
## shouts tenure good_country distance weights
## 1 8 59 1 0.08597334 1
## 4 1 25 0 0.23894067 1
## 5 81 67 0 0.69552208 1
## 6 2 53 1 0.22306633 1
## 7 67 56 1 0.12644080 1
## 8 44 71 0 0.79381453 1
dim(matched_df)
## [1] 19646 17
matcheddata_mean <- matched_df%>%
group_by(subscriber_friend_cnt) %>%
select(one_of(df_cov)) %>%
summarise_all(funs(mean))
## Adding missing grouping variables: `subscriber_friend_cnt`
View(matcheddata_mean)
Second method to assess the covariate balance in matched sample: difference in means test
matched_df%>%
group_by(adopter) %>%
select(one_of(df_cov)) %>%
summarise_all(funs(mean))
## Adding missing grouping variables: `adopter`
## # A tibble: 2 x 14
## adopter age male friend_cnt avg_friend_male avg_friend_age friend_country_~
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 25.7 0.637 35.7 0.646 25.9 6.96
## 2 1 26.6 0.714 51.4 0.645 26.2 9.04
## # ... with 7 more variables: songsListened <dbl>, lovedTracks <dbl>,
## # posts <dbl>, playlists <dbl>, shouts <dbl>, tenure <dbl>,
## # good_country <dbl>
lapply(df_cov, function(v) {
t.test(matched_df[, v] ~ matched_df$adopter)
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -5.704, df = 3497.9, p-value = 1.267e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.174752 -0.573742
## sample estimates:
## mean in group 0 mean in group 1
## 25.73723 26.61147
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -7.9836, df = 3551.6, p-value = 1.902e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09554116 -0.05786694
## sample estimates:
## mean in group 0 mean in group 1
## 0.6368115 0.7135156
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -5.793, df = 2920, p-value = 7.651e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -21.11279 -10.43479
## sample estimates:
## mean in group 0 mean in group 1
## 35.65863 51.43242
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = 0.085951, df = 3736.5, p-value = 0.9315
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.008801474 0.009608553
## sample estimates:
## mean in group 0 mean in group 1
## 0.6455297 0.6451262
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -2.5546, df = 3643, p-value = 0.01067
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.53089347 -0.06984082
## sample estimates:
## mean in group 0 mean in group 1
## 25.93412 26.23449
##
##
## [[6]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -10.471, df = 3133.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.466787 -1.688648
## sample estimates:
## mean in group 0 mean in group 1
## 6.963869 9.041586
##
##
## [[7]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -11.921, df = 3102.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -13660.864 -9801.889
## sample estimates:
## mean in group 0 mean in group 1
## 28997.48 40728.86
##
##
## [[8]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -15.111, df = 2982.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -192.5116 -148.2898
## sample estimates:
## mean in group 0 mean in group 1
## 157.4291 327.8298
##
##
## [[9]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -3.3168, df = 2905.4, p-value = 0.0009216
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -27.519865 -7.071076
## sample estimates:
## mean in group 0 mean in group 1
## 11.11385 28.40932
##
##
## [[10]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -5.3781, df = 2792.5, p-value = 8.15e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4249584 -0.1978778
## sample estimates:
## mean in group 0 mean in group 1
## 0.6670186 0.9784367
##
##
## [[11]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -2.7448, df = 2618.3, p-value = 0.006096
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -124.52903 -20.74582
## sample estimates:
## mean in group 0 mean in group 1
## 59.92592 132.56334
##
##
## [[12]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = -1.4857, df = 3410.5, p-value = 0.1374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.4296876 0.1970167
## sample estimates:
## mean in group 0 mean in group 1
## 47.04481 47.66115
##
##
## [[13]]
##
## Welch Two Sample t-test
##
## data: matched_df[, v] by matched_df$adopter
## t = 7.0252, df = 3536.2, p-value = 2.553e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.04887349 0.08671440
## sample estimates:
## mean in group 0 mean in group 1
## 0.3596692 0.2918752
Third method to assess the covariate balance in matched sample: estimating treatment efffect using OLS with or without covarities
lm_treat1 <- lm(adopter ~ subscriber_friend_cnt, data = matched_df)
summary(lm_treat1)
##
## Call:
## lm(formula = adopter ~ subscriber_friend_cnt, data = matched_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.17754 -0.17754 -0.08684 -0.08684 0.91316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.086837 0.003387 25.64 <2e-16 ***
## subscriber_friend_cnt 0.090705 0.004790 18.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3357 on 19644 degrees of freedom
## Multiple R-squared: 0.01793, Adjusted R-squared: 0.01788
## F-statistic: 358.7 on 1 and 19644 DF, p-value: < 2.2e-16
lm_treat2 <- lm(adopter ~ subscriber_friend_cnt + age + male + good_country + friend_cnt + avg_friend_age
+ avg_friend_male + friend_country_cnt + songsListened + lovedTracks + posts + playlists + shouts + tenure, data = matched_df)
summary(lm_treat2)
##
## Call:
## lm(formula = adopter ~ subscriber_friend_cnt + age + male + good_country +
## friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt +
## songsListened + lovedTracks + posts + playlists + shouts +
## tenure, data = matched_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27876 -0.15553 -0.10616 -0.05705 1.00012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.618e-02 1.338e-02 -2.705 0.006844 **
## subscriber_friend_cnt 7.663e-02 4.901e-03 15.635 < 2e-16 ***
## age 1.764e-03 4.596e-04 3.838 0.000125 ***
## male 3.070e-02 5.125e-03 5.991 2.12e-09 ***
## good_country -3.889e-02 5.014e-03 -7.756 9.22e-15 ***
## friend_cnt -1.620e-05 3.595e-05 -0.451 0.652317
## avg_friend_age 1.556e-03 5.799e-04 2.684 0.007282 **
## avg_friend_male 7.275e-03 9.820e-03 0.741 0.458791
## friend_country_cnt 1.055e-03 4.454e-04 2.368 0.017884 *
## songsListened 6.209e-07 6.610e-08 9.394 < 2e-16 ***
## lovedTracks 8.509e-05 5.971e-06 14.250 < 2e-16 ***
## posts 2.819e-05 1.355e-05 2.081 0.037461 *
## playlists 7.252e-03 1.407e-03 5.155 2.56e-07 ***
## shouts 1.295e-05 4.562e-06 2.838 0.004538 **
## tenure -3.098e-04 1.322e-04 -2.344 0.019095 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3304 on 19631 degrees of freedom
## Multiple R-squared: 0.04936, Adjusted R-squared: 0.04868
## F-statistic: 72.81 on 14 and 19631 DF, p-value: < 2.2e-16
Run logistic regression to test which variables (including subscriber friends) are significant for explaining the likelihood of becoming an adopter.
hn_lr <- glm(adopter ~ male + age + subscriber_friend_cnt + friend_cnt + avg_friend_age + friend_country_cnt + songsListened + lovedTracks + good_country + playlists + tenure + shouts + posts + avg_friend_male,
family = binomial(), data = df)
summary(hn_lr)
##
## Call:
## glm(formula = adopter ~ male + age + subscriber_friend_cnt +
## friend_cnt + avg_friend_age + friend_country_cnt + songsListened +
## lovedTracks + good_country + playlists + tenure + shouts +
## posts + avg_friend_male, family = binomial(), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6288 -0.3990 -0.3240 -0.2678 2.7604
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.213e+00 9.562e-02 -44.062 < 2e-16 ***
## male 4.139e-01 4.175e-02 9.914 < 2e-16 ***
## age 2.103e-02 3.517e-03 5.979 2.24e-09 ***
## subscriber_friend_cnt 9.719e-01 4.211e-02 23.080 < 2e-16 ***
## friend_cnt -4.584e-04 2.972e-04 -1.543 0.122942
## avg_friend_age 2.369e-02 4.637e-03 5.108 3.25e-07 ***
## friend_country_cnt 1.401e-02 3.646e-03 3.843 0.000122 ***
## songsListened 6.152e-06 5.212e-07 11.805 < 2e-16 ***
## lovedTracks 6.148e-04 4.828e-05 12.734 < 2e-16 ***
## good_country -3.939e-01 4.077e-02 -9.661 < 2e-16 ***
## playlists 6.467e-02 1.310e-02 4.938 7.89e-07 ***
## tenure -4.929e-03 1.024e-03 -4.812 1.49e-06 ***
## shouts 7.416e-05 6.476e-05 1.145 0.252113
## posts 1.074e-04 9.027e-05 1.189 0.234260
## avg_friend_male 1.047e-01 6.555e-02 1.597 0.110222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24537 on 43826 degrees of freedom
## Residual deviance: 22198 on 43812 degrees of freedom
## AIC: 22228
##
## Number of Fisher Scoring iterations: 5
Optimize model to include only significant variables
hn_lr_opt <- glm(adopter ~ male + age + subscriber_friend_cnt + avg_friend_age + friend_country_cnt + songsListened + lovedTracks + good_country + playlists + tenure,
family = binomial(), data = df)
summary(hn_lr_opt)
##
## Call:
## glm(formula = adopter ~ male + age + subscriber_friend_cnt +
## avg_friend_age + friend_country_cnt + songsListened + lovedTracks +
## good_country + playlists + tenure, family = binomial(), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6465 -0.3981 -0.3235 -0.2683 2.7675
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.163e+00 9.122e-02 -45.638 < 2e-16 ***
## male 4.091e-01 4.163e-02 9.828 < 2e-16 ***
## age 2.041e-02 3.506e-03 5.822 5.82e-09 ***
## subscriber_friend_cnt 9.794e-01 4.183e-02 23.414 < 2e-16 ***
## avg_friend_age 2.502e-02 4.555e-03 5.492 3.98e-08 ***
## friend_country_cnt 1.062e-02 2.499e-03 4.250 2.14e-05 ***
## songsListened 6.307e-06 5.167e-07 12.205 < 2e-16 ***
## lovedTracks 6.215e-04 4.818e-05 12.899 < 2e-16 ***
## good_country -3.963e-01 4.076e-02 -9.722 < 2e-16 ***
## playlists 6.465e-02 1.304e-02 4.957 7.14e-07 ***
## tenure -4.798e-03 1.023e-03 -4.691 2.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24537 on 43826 degrees of freedom
## Residual deviance: 22208 on 43816 degrees of freedom
## AIC: 22230
##
## Number of Fisher Scoring iterations: 5
Use exp function to enhance interpretability of coefficients
exp(hn_lr_opt$coefficients)
## (Intercept) male age
## 0.01555748 1.50543746 1.02062333
## subscriber_friend_cnt avg_friend_age friend_country_cnt
## 2.66293682 1.02533301 1.01067806
## songsListened lovedTracks good_country
## 1.00000631 1.00062174 0.67282465
## playlists tenure
## 1.06678846 0.99521304