High Note Propensity Score Matching

Import csv file into RStudio

getwd()

## [1] "C:/Users/sebas/OneDrive/Desktop/BANA270_Midterm_Kathleen Sebastian"

setwd("C:/Users/sebas/OneDrive/Desktop/BANA270_Midterm_Kathleen Sebastian")
df <- read.csv("HighNote Data Midterm.csv")
head(df)

##   ID age male friend_cnt avg_friend_age avg_friend_male friend_country_cnt
## 1  1  22    0          8       22.57143       0.4285714                  1
## 2  2  35    0          2       28.00000       1.0000000                  2
## 3  3  27    1          2       23.00000       1.0000000                  1
## 4  4  21    0         28       22.94737       0.5000000                  7
## 5  5  24    0         65       22.28302       0.9137931                  9
## 6  6  21    1         12       25.00000       0.7777778                  1
##   subscriber_friend_cnt songsListened lovedTracks posts playlists shouts
## 1                     0          9687         194     0         1      8
## 2                     0             0           0     0         0      0
## 3                     0           508           0     0         1      2
## 4                     1          1357          32     0         0      1
## 5                     0         89984          20     2         0     81
## 6                     0        124547          10     0         1      2
##   adopter tenure good_country
## 1       0     59            1
## 2       0     35            0
## 3       0     42            0
## 4       0     25            0
## 5       0     67            0
## 6       0     53            1

Install and/or load necessary packages for data analysis

library(ggplot2)
library(ggcorrplot)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(MatchIt)
library(tidyverse)

## -- Attaching packages ------------------------------------------ tidyverse 1.3.0 --

## v tibble  2.1.3     v purrr   0.3.3
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts --------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(purrr)
library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(ggthemes)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(wesanderson)
library(e1071)
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:e1071':
## 
##     impute

## The following object is masked from 'package:psych':
## 
##     describe

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

Generate descriptive statisticsfor HighNote dataset by adopter variable

describe.by(df, group=df$adopter)

## Warning: describe.by is deprecated. Please use the describeBy function

## 
##  Descriptive statistics by group 
## group: 0
##                       vars     n     mean       sd   median  trimmed      mad
## ID                       1 40300 20150.50 11633.75 20150.50 20150.50 14937.19
## age                      2 40300    23.95     6.37    23.00    23.09     4.45
## male                     3 40300     0.62     0.48     1.00     0.65     0.00
## friend_cnt               4 40300    18.49    57.48     7.00    10.28     7.41
## avg_friend_age           5 40300    24.01     5.10    23.00    23.40     3.95
## avg_friend_male          6 40300     0.62     0.32     0.67     0.65     0.35
## friend_country_cnt       7 40300     3.96     5.76     2.00     2.66     1.48
## subscriber_friend_cnt    8 40300     0.42     2.42     0.00     0.13     0.00
## songsListened            9 40300 17589.44 28416.02  7440.00 11817.64 10576.87
## lovedTracks             10 40300    86.82   263.58    14.00    36.35    20.76
## posts                   11 40300     5.29   104.31     0.00     0.23     0.00
## playlists               12 40300     0.55     1.07     0.00     0.45     0.00
## shouts                  13 40300    29.97   150.69     4.00     8.84     4.45
## adopter                 14 40300     0.00     0.00     0.00     0.00     0.00
## tenure                  15 40300    43.81    19.79    44.00    43.72    22.24
## good_country            16 40300     0.36     0.48     0.00     0.32     0.00
##                       min     max   range  skew kurtosis     se
## ID                      1   40300   40299  0.00    -1.20  57.95
## age                     8      79      71  1.97     6.80   0.03
## male                    0       1       1 -0.50    -1.75   0.00
## friend_cnt              1    4957    4956 32.67  2087.42   0.29
## avg_friend_age          8      77      69  1.84     7.15   0.03
## avg_friend_male         0       1       1 -0.52    -0.72   0.00
## friend_country_cnt      0     129     129  4.74    38.29   0.03
## subscriber_friend_cnt   0     309     309 72.19  8024.62   0.01
## songsListened           0 1000000 1000000  6.05   105.85 141.55
## lovedTracks             0   12522   12522 13.12   335.93   1.31
## posts                   0   12309   12309 73.92  7005.34   0.52
## playlists               0      98      98 28.21  1945.28   0.01
## shouts                  0    7736    7736 22.53   779.12   0.75
## adopter                 0       0       0   NaN      NaN   0.00
## tenure                  1     111     110  0.05    -0.70   0.10
## good_country            0       1       1  0.59    -1.65   0.00
## ------------------------------------------------------------ 
## group: 1
##                       vars    n     mean       sd   median  trimmed      mad
## ID                       1 3527 42064.00  1018.30 42064.00 42064.00  1307.65
## age                      2 3527    25.98     6.84    24.00    25.05     4.45
## male                     3 3527     0.73     0.44     1.00     0.79     0.00
## friend_cnt               4 3527    39.73   117.27    16.00    23.69    17.79
## avg_friend_age           5 3527    25.44     5.21    24.36    24.83     3.91
## avg_friend_male          6 3527     0.64     0.25     0.67     0.65     0.25
## friend_country_cnt       7 3527     7.19     8.86     4.00     5.36     4.45
## subscriber_friend_cnt    8 3527     1.64     5.85     0.00     0.84     0.00
## songsListened            9 3527 33758.04 43592.73 20908.00 25811.69 23276.82
## lovedTracks             10 3527   264.34   491.43   108.00   161.68   140.85
## posts                   11 3527    21.20   221.99     0.00     1.44     0.00
## playlists               12 3527     0.90     2.56     1.00     0.59     1.48
## shouts                  13 3527    99.44  1156.07     9.00    23.89    11.86
## adopter                 14 3527     1.00     0.00     1.00     1.00     0.00
## tenure                  15 3527    45.58    20.04    46.00    45.60    20.76
## good_country            16 3527     0.29     0.45     0.00     0.23     0.00
##                         min    max  range  skew kurtosis     se
## ID                    40301  43827   3526  0.00    -1.20  17.15
## age                       8     73     65  1.68     4.39   0.12
## male                      0      1      1 -1.03    -0.94   0.01
## friend_cnt                1   5089   5088 26.04  1013.79   1.97
## avg_friend_age           12     62     50  1.68     5.05   0.09
## avg_friend_male           0      1      1 -0.54    -0.05   0.00
## friend_country_cnt        0    136    136  3.61    24.53   0.15
## subscriber_friend_cnt     0    287    287 34.05  1609.52   0.10
## songsListened             0 817290 817290  4.71    46.64 734.03
## lovedTracks               0  10220  10220  6.52    80.96   8.27
## posts                     0   8506   8506 26.52   852.38   3.74
## playlists                 0    118    118 28.84  1244.31   0.04
## shouts                    0  65872  65872 52.52  2969.09  19.47
## adopter                   1      1      0   NaN      NaN   0.00
## tenure                    0    111    111  0.02    -0.62   0.34
## good_country              0      1      1  0.94    -1.12   0.01

Conduct t-test to identify differences in means, selecting ONLY relevant variables from the dataset

lapply(df[,c('age','male' , 'friend_cnt' , 'avg_friend_male' ,'avg_friend_age', 
                   'friend_country_cnt'  , 'songsListened' , 'lovedTracks' , 
                   'posts' , 'playlists' ,'shouts' , 'tenure' ,'good_country', 'subscriber_friend_cnt')], function(a) t.test(a ~ df$adopter))

## $age
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -16.996, df = 4079.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.265768 -1.797097
## sample estimates:
## mean in group 0 mean in group 1 
##        23.94844        25.97987 
## 
## 
## $male
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -13.654, df = 4295, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.12278707 -0.09195413
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6218610       0.7292316 
## 
## 
## $friend_cnt
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -10.646, df = 3675.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -25.15422 -17.32999
## sample estimates:
## mean in group 0 mean in group 1 
##        18.49166        39.73377 
## 
## 
## $avg_friend_male
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -4.4426, df = 4591.6, p-value = 9.097e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02883955 -0.01117951
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6165888       0.6365983 
## 
## 
## $avg_friend_age
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -15.658, df = 4140.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.608931 -1.250852
## sample estimates:
## mean in group 0 mean in group 1 
##        24.01142        25.44131 
## 
## 
## $friend_country_cnt
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -21.267, df = 3791.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.528795 -2.933081
## sample estimates:
## mean in group 0 mean in group 1 
##        3.957891        7.188829 
## 
## 
## $songsListened
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -21.629, df = 3792.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17634.24 -14702.96
## sample estimates:
## mean in group 0 mean in group 1 
##        17589.44        33758.04 
## 
## 
## $lovedTracks
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -21.188, df = 3705.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -193.9447 -161.0917
## sample estimates:
## mean in group 0 mean in group 1 
##        86.82263       264.34080 
## 
## 
## $posts
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -4.2151, df = 3663.5, p-value = 2.557e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -23.30665  -8.50825
## sample estimates:
## mean in group 0 mean in group 1 
##        5.293002       21.200454 
## 
## 
## $playlists
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -8.0816, df = 3634.7, p-value = 8.619e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4367565 -0.2662138
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5492804       0.9007655 
## 
## 
## $shouts
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -3.5659, df = 3536.5, p-value = 0.0003674
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -107.66170  -31.27249
## sample estimates:
## mean in group 0 mean in group 1 
##        29.97266        99.43975 
## 
## 
## $tenure
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -5.0434, df = 4150.6, p-value = 4.768e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.462620 -1.083959
## sample estimates:
## mean in group 0 mean in group 1 
##        43.80993        45.58322 
## 
## 
## $good_country
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = 8.8009, df = 4248.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.05463587 0.08595434
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3577916       0.2874965 
## 
## 
## $subscriber_friend_cnt
## 
##  Welch Two Sample t-test
## 
## data:  a by df$adopter
## t = -12.287, df = 3632.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.413899 -1.024766
## sample estimates:
## mean in group 0 mean in group 1 
##        0.417469        1.636802

Generate relevant visualizations pertaining to the original HighNote dataset Correlation Matrix

corr <- round(cor(df), 1)
ggcorrplot(corr, hc.order = TRUE, type = "lower",
           lab = TRUE)

Create new dataframe for visualizations purposes, converting adopter variable to a character

df1 <- df
df1$adopter <- as.character(df1$adopter)

VISUALIZATIONS BY DEMOGRAPHIC DATA age

ggplot(df1, aes(x=adopter, y=age, group=adopter, fill=adopter,)) + geom_boxplot() +
  labs(title="Boxplot of Adopter vs. Non-Adopter by Age",x="Adopter", y = "Age") + 
  scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest1"))

male

ggplot(df1, aes(x=male, group=adopter, fill=adopter)) + geom_bar(aes(fill=adopter))+
  labs(title="Barchart of Adopter vs. Non-Adopter by Gender",x="Gender") + 
  scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest1"))

good_country

ggplot(df1, aes(x=good_country, group=adopter, fill=adopter)) + geom_bar(aes(fill=adopter))+
  labs(title="Barchart of Adopter vs. Non-Adopter by Good Country",x="Good Country") + 
  scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest1"))

VISUALIZATIONS BY PEER INFLUENCE friend_cnt

ggplot(df1, aes(x=age, y=friend_cnt, col=adopter)) + geom_point(alpha=0.5) + 
  scale_fill_manual(values=wes_palette(name="GrandBudapest1")) + labs(title="Scatter of Age vs. Friend Count by Adopter vs. Non Adopter")

avg_friend_age

ggplot(df1,aes(x=avg_friend_age,group=adopter,fill=adopter))+
  geom_histogram(position="identity",alpha=0.8,binwidth=1.0)+labs(title="Histogram of Average Friend Age by Adopter vs. Non.Adopter")+
  scale_fill_manual(values=wes_palette(name="GrandBudapest1"))

avg_friend_male

ggplot(df1,aes(x=avg_friend_age,group=adopter,fill=adopter))+
  geom_histogram(position="identity",alpha=0.8,binwidth=1.0)+labs(title="Histogram of Average Friend Age by Adopter vs. Non.Adopter")+
  scale_fill_manual(values=wes_palette(name="GrandBudapest1"))

friend_country_cnt

ggplot(df1, aes(x=adopter, y=friend_country_cnt)) + 
  geom_bar(aes(fill = adopter), position = "dodge", stat="identity") + 
  scale_fill_manual(values=wes_palette(name="GrandBudapest1")) +
  labs(title="Barchart for Friend Country Count by Adopter vs. Non-Adopter")

subscriber_friend_cnt

avg_subs_friend_cnt <- df1 %>%
  group_by(adopter)%>%
  summarise(subscriber_friend_cnt=mean(subscriber_friend_cnt))

ggplot(avg_subs_friend_cnt, aes(x=avg_subs_friend_cnt$adopter, y=avg_subs_friend_cnt$subscriber_friend_cnt)) + 
         geom_bar(aes(fill = adopter), position = "dodge", stat="identity") + 
         scale_fill_manual(values=wes_palette(name="GrandBudapest1")) +
         labs(title="Barchart for Average Subscriber Friend Count by Adopter vs. Non-Adopter",x="Adopter",y="Average Subscriber Friend Count")

Pre-analysis using non-matched data DUMMY VARIABLE CREATION: Group to create dataframe by subscriber variable count variable, where if subscriber_friend_cnt >- 1, then this is 1 or the treatment group, and 0 if subscriber subscriber_friend_cnt = 0 or the control group

df$subscriber_friend_cnt <- ifelse(df$subscriber_friend_cnt >=1,1,0)
with(df, t.test(subscriber_friend_cnt ~ adopter))

## 
##  Welch Two Sample t-test
## 
## data:  subscriber_friend_cnt by adopter
## t = -33.978, df = 3931.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3109641 -0.2770354
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2004715       0.4944712

Difference in means for pre-treatment covarities

df_cov <- c('age','male' , 'friend_cnt' , 'avg_friend_male' ,'avg_friend_age', 
            'friend_country_cnt'  , 'songsListened' , 'lovedTracks' , 
            'posts' , 'playlists' ,'shouts' , 'tenure' ,'good_country')
diff_in_means_covariteis <- df %>%
  group_by(adopter) %>%
  select(one_of(df_cov)) %>%
  summarise_all(funs(mean(., na.rm = T)))

## Adding missing grouping variables: `adopter`

## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.

Test whether means are significant

with(df, t.test(age ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  age by subscriber_friend_cnt
## t = -20.841, df = 14645, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.778544 -1.472749
## sample estimates:
## mean in group 0 mean in group 1 
##        23.74756        25.37321

with(df, t.test(friend_cnt ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  friend_cnt by subscriber_friend_cnt
## t = -33.707, df = 9903.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -46.12459 -41.05469
## sample estimates:
## mean in group 0 mean in group 1 
##        10.43133        54.02097

with(df, t.test(male ~ subscriber_friend_cnt))

## 
##  Welch Two Sample t-test
## 
## data:  male by subscriber_friend_cnt
## t = -1.3459, df = 15986, p-value = 0.1784
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.018236129  0.003388028
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6288378       0.6362618

with(df, t.test(tenure ~ subscriber_friend_cnt))

## 
##  Welch Two Sample t-test
## 
## data:  tenure by subscriber_friend_cnt
## t = -14.696, df = 15805, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.792309 -2.899752
## sample estimates:
## mean in group 0 mean in group 1 
##        43.20268        46.54871

with(df, t.test(good_country ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  good_country by subscriber_friend_cnt
## t = 2.0956, df = 16030, p-value = 0.03613
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.0007383591 0.0220968020
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3546936       0.3432760

with(df, t.test(songsListened ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  songsListened by subscriber_friend_cnt
## t = -41.505, df = 11447, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -20037.04 -18229.80
## sample estimates:
## mean in group 0 mean in group 1 
##        14602.22        33735.64

with(df, t.test(lovedTracks ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  lovedTracks by subscriber_friend_cnt
## t = -31.265, df = 10585, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -170.1918 -150.1102
## sample estimates:
## mean in group 0 mean in group 1 
##        65.21365       225.36465

with(df, t.test(posts ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  posts by subscriber_friend_cnt
## t = -7.3649, df = 9933.6, p-value = 1.914e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -22.76492 -13.19424
## sample estimates:
## mean in group 0 mean in group 1 
##        2.543377       20.522956

with(df, t.test(tenure ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  tenure by subscriber_friend_cnt
## t = -14.696, df = 15805, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.792309 -2.899752
## sample estimates:
## mean in group 0 mean in group 1 
##        43.20268        46.54871

with(df, t.test(avg_friend_male ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  avg_friend_male by subscriber_friend_cnt
## t = -7.7114, df = 23020, p-value = 1.294e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02846397 -0.01692672
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6131124       0.6358077

with(df, t.test(avg_friend_age ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  avg_friend_age by subscriber_friend_cnt
## t = -27.658, df = 15667, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.744514 -1.513611
## sample estimates:
## mean in group 0 mean in group 1 
##        23.76137        25.39043

with(df, t.test(friend_country_cnt ~ subscriber_friend_cnt ))

## 
##  Welch Two Sample t-test
## 
## data:  friend_country_cnt by subscriber_friend_cnt
## t = -65.05, df = 10372, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.861271 -6.459857
## sample estimates:
## mean in group 0 mean in group 1 
##        2.725062        9.385626

Propensity Score Matching; given all means significant for all, use all features related to the outcome variable

hn_ps <- glm(subscriber_friend_cnt ~ age + male + good_country + avg_friend_age + avg_friend_male + friend_country_cnt + songsListened +
              lovedTracks + posts + playlists + shouts + tenure + friend_cnt,
            family = binomial(), data = df)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(hn_ps)

## 
## Call:
## glm(formula = subscriber_friend_cnt ~ age + male + good_country + 
##     avg_friend_age + avg_friend_male + friend_country_cnt + songsListened + 
##     lovedTracks + posts + playlists + shouts + tenure + friend_cnt, 
##     family = binomial(), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.4206  -0.5671  -0.4220  -0.3001   2.5619  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -5.144e+00  7.703e-02 -66.782  < 2e-16 ***
## age                 1.970e-02  2.808e-03   7.015 2.30e-12 ***
## male                4.311e-02  2.998e-02   1.438 0.150419    
## good_country        3.201e-02  2.922e-02   1.096 0.273235    
## avg_friend_age      7.955e-02  3.481e-03  22.850  < 2e-16 ***
## avg_friend_male     2.514e-01  5.029e-02   4.999 5.75e-07 ***
## friend_country_cnt  1.110e-01  4.765e-03  23.302  < 2e-16 ***
## songsListened       6.906e-06  5.156e-07  13.396  < 2e-16 ***
## lovedTracks         6.671e-04  5.645e-05  11.817  < 2e-16 ***
## posts               5.699e-04  2.682e-04   2.125 0.033613 *  
## playlists           5.639e-03  1.190e-02   0.474 0.635530    
## shouts             -4.909e-05  3.707e-05  -1.324 0.185434    
## tenure             -2.571e-03  7.769e-04  -3.309 0.000935 ***
## friend_cnt          3.132e-02  1.034e-03  30.301  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 46640  on 43826  degrees of freedom
## Residual deviance: 34170  on 43813  degrees of freedom
## AIC: 34198
## 
## Number of Fisher Scoring iterations: 7

Use model to calculate propensity score and generate new dataframe

ps_df <- data.frame(pr_score = predict(hn_ps, type = "response"),
                    subscriber_friend_cnt = hn_ps$model$subscriber_friend_cnt)
head(ps_df)

##     pr_score subscriber_friend_cnt
## 1 0.08597334                     0
## 2 0.14417767                     0
## 3 0.08217010                     0
## 4 0.23894067                     1
## 5 0.69552208                     0
## 6 0.22306633                     0

Conduct examiniation of common support by creating a histogram to plot the respective propensity scores by treatment status

labs <- paste("HighNote User Type:", c("Treatment Group", "Control Group"))
ps_df %>%
  mutate(adopter = ifelse(subscriber_friend_cnt == 1, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(fill = "plum", color = "black") +
  facet_wrap(~adopter) +
  xlab("Probability of User Being in Treatment Group") +
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Use matchit package to identify pairs of observations with similar propensity scores but are distinct in treatment status.

df_nomissingvals <- df %>% select(subscriber_friend_cnt, adopter, one_of(df_cov)) %>%
  na.omit()
head(df_nomissingvals)

##   subscriber_friend_cnt adopter age male friend_cnt avg_friend_male
## 1                     0       0  22    0          8       0.4285714
## 2                     0       0  35    0          2       1.0000000
## 3                     0       0  27    1          2       1.0000000
## 4                     1       0  21    0         28       0.5000000
## 5                     0       0  24    0         65       0.9137931
## 6                     0       0  21    1         12       0.7777778
##   avg_friend_age friend_country_cnt songsListened lovedTracks posts playlists
## 1       22.57143                  1          9687         194     0         1
## 2       28.00000                  2             0           0     0         0
## 3       23.00000                  1           508           0     0         1
## 4       22.94737                  7          1357          32     0         0
## 5       22.28302                  9         89984          20     2         0
## 6       25.00000                  1        124547          10     0         1
##   shouts tenure good_country
## 1      8     59            1
## 2      0     35            0
## 3      2     42            0
## 4      1     25            0
## 5     81     67            0
## 6      2     53            1

mod_match <- matchit(subscriber_friend_cnt ~ age + male + good_country + avg_friend_age + avg_friend_male + friend_country_cnt + songsListened +
                       lovedTracks + posts + playlists + shouts + tenure + friend_cnt,
                     method = "nearest", data = df_nomissingvals)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(mod_match)

## 
## Call:
## matchit(formula = subscriber_friend_cnt ~ age + male + good_country + 
##     avg_friend_age + avg_friend_male + friend_country_cnt + songsListened + 
##     lovedTracks + posts + playlists + shouts + tenure + friend_cnt, 
##     data = df_nomissingvals, method = "nearest")
## 
## Summary of balance for all data:
##                    Means Treated Means Control SD Control  Mean Diff    eQQ Med
## distance                  0.4635        0.1550     0.1436     0.3086     0.2506
## age                      25.3732       23.7476     6.2245     1.6256     1.0000
## male                      0.6363        0.6288     0.4831     0.0074     0.0000
## good_country              0.3433        0.3547     0.4784    -0.0114     0.0000
## avg_friend_age           25.3904       23.7614     5.0577     1.6291     1.5909
## avg_friend_male           0.6358        0.6131     0.3343     0.0227     0.0738
## friend_country_cnt        9.3856        2.7251     3.1024     6.6606     5.0000
## songsListened         33735.6404    14602.2205 23214.2898 19133.4199 15471.0000
## lovedTracks             225.3647       65.2137   181.4812   160.1510    65.0000
## posts                    20.5230        2.5434    33.7947    17.9796     0.0000
## playlists                 0.7441        0.5295     0.9673     0.2146     0.0000
## shouts                  101.8195       16.4230    79.7381    85.3965    15.0000
## tenure                   46.5487       43.2027    19.7212     3.3460     3.0000
## friend_cnt               54.0210       10.4313    15.2769    43.5896    22.0000
##                      eQQ Mean     eQQ Max
## distance               0.3086      0.6840
## age                    1.6296      5.0000
## male                   0.0074      1.0000
## good_country           0.0114      1.0000
## avg_friend_age         1.6369     11.5000
## avg_friend_male        0.0958      0.3636
## friend_country_cnt     6.6598     95.0000
## songsListened      19126.1623 653702.0000
## lovedTracks          159.9562   6343.0000
## posts                 17.8829   9535.0000
## playlists              0.2092     26.0000
## shouts                85.1764  59168.0000
## tenure                 3.3473     10.0000
## friend_cnt            43.5838   4794.0000
## 
## 
## Summary of balance for matched data:
##                    Means Treated Means Control SD Control Mean Diff   eQQ Med
## distance                  0.4635        0.3040     0.1913    0.1596    0.1077
## age                      25.3732       26.3324     7.9056   -0.9592    1.0000
## male                      0.6363        0.6576     0.4745   -0.0214    0.0000
## good_country              0.3433        0.3581     0.4795   -0.0149    0.0000
## avg_friend_age           25.3904       26.5572     6.7320   -1.1668    0.4376
## avg_friend_male           0.6358        0.6551     0.2643   -0.0193    0.0158
## friend_country_cnt        9.3856        5.0914     4.6473    4.2942    2.0000
## songsListened         33735.6404    27360.8630 33892.7804 6374.7775 4680.0000
## lovedTracks             225.3647      134.5440   299.1995   90.8206   38.0000
## posts                    20.5230        6.2773    60.2598   14.2456    0.0000
## playlists                 0.7441        0.6723     1.4015    0.0718    0.0000
## shouts                  101.8195       37.2362   138.8781   64.5833   10.0000
## tenure                   46.5487       47.7039    19.0357   -1.1551    1.0000
## friend_cnt               54.0210       21.4666    23.5251   32.5544   12.0000
##                     eQQ Mean     eQQ Max
## distance              0.1596      0.4517
## age                   0.9592      7.0000
## male                  0.0214      1.0000
## good_country          0.0149      1.0000
## avg_friend_age        1.2763     14.0000
## avg_friend_male       0.0326      0.1602
## friend_country_cnt    4.2942     95.0000
## songsListened      6374.7775 566867.0000
## lovedTracks          90.8206   6180.0000
## posts                14.2456   9535.0000
## playlists             0.1035     22.0000
## shouts               64.5833  59168.0000
## tenure                1.2995      4.0000
## friend_cnt           32.5544   4794.0000
## 
## Percent Balance Improvement:
##                    Mean Diff. eQQ Med  eQQ Mean  eQQ Max
## distance              48.2930 57.0083   48.2908  33.9658
## age                   40.9972  0.0000   41.1419 -40.0000
## male                -187.9614  0.0000 -187.6712   0.0000
## good_country         -30.1771  0.0000  -30.3571   0.0000
## avg_friend_age        28.3760 72.4916   22.0309 -21.7391
## avg_friend_male       14.7957 78.6165   65.9532  55.9466
## friend_country_cnt    35.5279 60.0000   35.5203   0.0000
## songsListened         66.6825 69.7499   66.6699  13.2836
## lovedTracks           43.2906 41.5385   43.2216   2.5698
## posts                 20.7676  0.0000   20.3394   0.0000
## playlists             66.5567  0.0000   50.5109  15.3846
## shouts                24.3724 33.3333   24.1770   0.0000
## tenure                65.4771 66.6667   61.1782  60.0000
## friend_cnt            25.3162 45.4545   25.3062   0.0000
## 
## Sample sizes:
##           Control Treated
## All         34004    9823
## Matched      9823    9823
## Unmatched   24181       0
## Discarded       0       0

plot(mod_match)

plot(mod_match,type="hist")

Generate a new dataframe containing matched values

matched_df <- match.data(mod_match)
head(matched_df)

##   subscriber_friend_cnt adopter age male friend_cnt avg_friend_male
## 1                     0       0  22    0          8       0.4285714
## 4                     1       0  21    0         28       0.5000000
## 5                     0       0  24    0         65       0.9137931
## 6                     0       0  21    1         12       0.7777778
## 7                     0       0  20    0         15       0.6363636
## 8                     1       0  23    1         57       0.5208333
##   avg_friend_age friend_country_cnt songsListened lovedTracks posts playlists
## 1       22.57143                  1          9687         194     0         1
## 4       22.94737                  7          1357          32     0         0
## 5       22.28302                  9         89984          20     2         0
## 6       25.00000                  1        124547          10     0         1
## 7       22.00000                  1         24852         391     6         1
## 8       23.63636                 14         99877         125    89         1
##   shouts tenure good_country   distance weights
## 1      8     59            1 0.08597334       1
## 4      1     25            0 0.23894067       1
## 5     81     67            0 0.69552208       1
## 6      2     53            1 0.22306633       1
## 7     67     56            1 0.12644080       1
## 8     44     71            0 0.79381453       1

dim(matched_df)

## [1] 19646    17

matcheddata_mean  <- matched_df%>%
  group_by(subscriber_friend_cnt) %>%
  select(one_of(df_cov)) %>%
  summarise_all(funs(mean))

## Adding missing grouping variables: `subscriber_friend_cnt`

View(matcheddata_mean)

Second method to assess the covariate balance in matched sample: difference in means test

matched_df%>%
  group_by(adopter) %>%
  select(one_of(df_cov)) %>%
  summarise_all(funs(mean))

## Adding missing grouping variables: `adopter`

## # A tibble: 2 x 14
##   adopter   age  male friend_cnt avg_friend_male avg_friend_age friend_country_~
##     <int> <dbl> <dbl>      <dbl>           <dbl>          <dbl>            <dbl>
## 1       0  25.7 0.637       35.7           0.646           25.9             6.96
## 2       1  26.6 0.714       51.4           0.645           26.2             9.04
## # ... with 7 more variables: songsListened <dbl>, lovedTracks <dbl>,
## #   posts <dbl>, playlists <dbl>, shouts <dbl>, tenure <dbl>,
## #   good_country <dbl>

lapply(df_cov, function(v) {
  t.test(matched_df[, v] ~ matched_df$adopter)
})

## [[1]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -5.704, df = 3497.9, p-value = 1.267e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.174752 -0.573742
## sample estimates:
## mean in group 0 mean in group 1 
##        25.73723        26.61147 
## 
## 
## [[2]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -7.9836, df = 3551.6, p-value = 1.902e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09554116 -0.05786694
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6368115       0.7135156 
## 
## 
## [[3]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -5.793, df = 2920, p-value = 7.651e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -21.11279 -10.43479
## sample estimates:
## mean in group 0 mean in group 1 
##        35.65863        51.43242 
## 
## 
## [[4]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = 0.085951, df = 3736.5, p-value = 0.9315
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.008801474  0.009608553
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6455297       0.6451262 
## 
## 
## [[5]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -2.5546, df = 3643, p-value = 0.01067
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.53089347 -0.06984082
## sample estimates:
## mean in group 0 mean in group 1 
##        25.93412        26.23449 
## 
## 
## [[6]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -10.471, df = 3133.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.466787 -1.688648
## sample estimates:
## mean in group 0 mean in group 1 
##        6.963869        9.041586 
## 
## 
## [[7]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -11.921, df = 3102.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -13660.864  -9801.889
## sample estimates:
## mean in group 0 mean in group 1 
##        28997.48        40728.86 
## 
## 
## [[8]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -15.111, df = 2982.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -192.5116 -148.2898
## sample estimates:
## mean in group 0 mean in group 1 
##        157.4291        327.8298 
## 
## 
## [[9]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -3.3168, df = 2905.4, p-value = 0.0009216
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -27.519865  -7.071076
## sample estimates:
## mean in group 0 mean in group 1 
##        11.11385        28.40932 
## 
## 
## [[10]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -5.3781, df = 2792.5, p-value = 8.15e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4249584 -0.1978778
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6670186       0.9784367 
## 
## 
## [[11]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -2.7448, df = 2618.3, p-value = 0.006096
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -124.52903  -20.74582
## sample estimates:
## mean in group 0 mean in group 1 
##        59.92592       132.56334 
## 
## 
## [[12]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = -1.4857, df = 3410.5, p-value = 0.1374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.4296876  0.1970167
## sample estimates:
## mean in group 0 mean in group 1 
##        47.04481        47.66115 
## 
## 
## [[13]]
## 
##  Welch Two Sample t-test
## 
## data:  matched_df[, v] by matched_df$adopter
## t = 7.0252, df = 3536.2, p-value = 2.553e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.04887349 0.08671440
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3596692       0.2918752

Third method to assess the covariate balance in matched sample: estimating treatment efffect using OLS with or without covarities

lm_treat1 <- lm(adopter ~ subscriber_friend_cnt, data = matched_df)
summary(lm_treat1)

## 
## Call:
## lm(formula = adopter ~ subscriber_friend_cnt, data = matched_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17754 -0.17754 -0.08684 -0.08684  0.91316 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.086837   0.003387   25.64   <2e-16 ***
## subscriber_friend_cnt 0.090705   0.004790   18.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3357 on 19644 degrees of freedom
## Multiple R-squared:  0.01793,    Adjusted R-squared:  0.01788 
## F-statistic: 358.7 on 1 and 19644 DF,  p-value: < 2.2e-16

lm_treat2 <- lm(adopter ~ subscriber_friend_cnt +  age + male + good_country + friend_cnt + avg_friend_age 
                + avg_friend_male + friend_country_cnt + songsListened + lovedTracks + posts + playlists + shouts + tenure, data = matched_df)
summary(lm_treat2)

## 
## Call:
## lm(formula = adopter ~ subscriber_friend_cnt + age + male + good_country + 
##     friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt + 
##     songsListened + lovedTracks + posts + playlists + shouts + 
##     tenure, data = matched_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.27876 -0.15553 -0.10616 -0.05705  1.00012 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -3.618e-02  1.338e-02  -2.705 0.006844 ** 
## subscriber_friend_cnt  7.663e-02  4.901e-03  15.635  < 2e-16 ***
## age                    1.764e-03  4.596e-04   3.838 0.000125 ***
## male                   3.070e-02  5.125e-03   5.991 2.12e-09 ***
## good_country          -3.889e-02  5.014e-03  -7.756 9.22e-15 ***
## friend_cnt            -1.620e-05  3.595e-05  -0.451 0.652317    
## avg_friend_age         1.556e-03  5.799e-04   2.684 0.007282 ** 
## avg_friend_male        7.275e-03  9.820e-03   0.741 0.458791    
## friend_country_cnt     1.055e-03  4.454e-04   2.368 0.017884 *  
## songsListened          6.209e-07  6.610e-08   9.394  < 2e-16 ***
## lovedTracks            8.509e-05  5.971e-06  14.250  < 2e-16 ***
## posts                  2.819e-05  1.355e-05   2.081 0.037461 *  
## playlists              7.252e-03  1.407e-03   5.155 2.56e-07 ***
## shouts                 1.295e-05  4.562e-06   2.838 0.004538 ** 
## tenure                -3.098e-04  1.322e-04  -2.344 0.019095 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3304 on 19631 degrees of freedom
## Multiple R-squared:  0.04936,    Adjusted R-squared:  0.04868 
## F-statistic: 72.81 on 14 and 19631 DF,  p-value: < 2.2e-16

Run logistic regression to test which variables (including subscriber friends) are significant for explaining the likelihood of becoming an adopter.

hn_lr <- glm(adopter ~ male + age + subscriber_friend_cnt + friend_cnt + avg_friend_age + friend_country_cnt + songsListened + lovedTracks + good_country + playlists + tenure + shouts + posts + avg_friend_male,
             family = binomial(), data = df)
summary(hn_lr)

## 
## Call:
## glm(formula = adopter ~ male + age + subscriber_friend_cnt + 
##     friend_cnt + avg_friend_age + friend_country_cnt + songsListened + 
##     lovedTracks + good_country + playlists + tenure + shouts + 
##     posts + avg_friend_male, family = binomial(), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6288  -0.3990  -0.3240  -0.2678   2.7604  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -4.213e+00  9.562e-02 -44.062  < 2e-16 ***
## male                   4.139e-01  4.175e-02   9.914  < 2e-16 ***
## age                    2.103e-02  3.517e-03   5.979 2.24e-09 ***
## subscriber_friend_cnt  9.719e-01  4.211e-02  23.080  < 2e-16 ***
## friend_cnt            -4.584e-04  2.972e-04  -1.543 0.122942    
## avg_friend_age         2.369e-02  4.637e-03   5.108 3.25e-07 ***
## friend_country_cnt     1.401e-02  3.646e-03   3.843 0.000122 ***
## songsListened          6.152e-06  5.212e-07  11.805  < 2e-16 ***
## lovedTracks            6.148e-04  4.828e-05  12.734  < 2e-16 ***
## good_country          -3.939e-01  4.077e-02  -9.661  < 2e-16 ***
## playlists              6.467e-02  1.310e-02   4.938 7.89e-07 ***
## tenure                -4.929e-03  1.024e-03  -4.812 1.49e-06 ***
## shouts                 7.416e-05  6.476e-05   1.145 0.252113    
## posts                  1.074e-04  9.027e-05   1.189 0.234260    
## avg_friend_male        1.047e-01  6.555e-02   1.597 0.110222    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24537  on 43826  degrees of freedom
## Residual deviance: 22198  on 43812  degrees of freedom
## AIC: 22228
## 
## Number of Fisher Scoring iterations: 5

Optimize model to include only significant variables

hn_lr_opt <- glm(adopter ~ male + age + subscriber_friend_cnt + avg_friend_age + friend_country_cnt + songsListened + lovedTracks + good_country + playlists + tenure,
                          family = binomial(), data = df)
summary(hn_lr_opt)

## 
## Call:
## glm(formula = adopter ~ male + age + subscriber_friend_cnt + 
##     avg_friend_age + friend_country_cnt + songsListened + lovedTracks + 
##     good_country + playlists + tenure, family = binomial(), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6465  -0.3981  -0.3235  -0.2683   2.7675  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -4.163e+00  9.122e-02 -45.638  < 2e-16 ***
## male                   4.091e-01  4.163e-02   9.828  < 2e-16 ***
## age                    2.041e-02  3.506e-03   5.822 5.82e-09 ***
## subscriber_friend_cnt  9.794e-01  4.183e-02  23.414  < 2e-16 ***
## avg_friend_age         2.502e-02  4.555e-03   5.492 3.98e-08 ***
## friend_country_cnt     1.062e-02  2.499e-03   4.250 2.14e-05 ***
## songsListened          6.307e-06  5.167e-07  12.205  < 2e-16 ***
## lovedTracks            6.215e-04  4.818e-05  12.899  < 2e-16 ***
## good_country          -3.963e-01  4.076e-02  -9.722  < 2e-16 ***
## playlists              6.465e-02  1.304e-02   4.957 7.14e-07 ***
## tenure                -4.798e-03  1.023e-03  -4.691 2.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24537  on 43826  degrees of freedom
## Residual deviance: 22208  on 43816  degrees of freedom
## AIC: 22230
## 
## Number of Fisher Scoring iterations: 5

Use exp function to enhance interpretability of coefficients

exp(hn_lr_opt$coefficients)

##           (Intercept)                  male                   age 
##            0.01555748            1.50543746            1.02062333 
## subscriber_friend_cnt        avg_friend_age    friend_country_cnt 
##            2.66293682            1.02533301            1.01067806 
##         songsListened           lovedTracks          good_country 
##            1.00000631            1.00062174            0.67282465 
##             playlists                tenure 
##            1.06678846            0.99521304

High Note Propensity Score Matching

Kathleen S.

Winter Quarter