####################
# Import Libraries #
####################
library(Amelia)
library(grid)
library(gridExtra)
library(tidyverse)
library(lubridate)
library(plotly)
library(factoextra)
library(reshape)
##############################
# Graphics Themes for GGplot #
##############################
my_theme <- function(base_size = 10, base_family = "sans"){
theme_minimal(base_size = base_size, base_family = base_family) +
theme(
axis.text = element_text(size = 10),
axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
axis.title = element_text(size = 12),
panel.grid.major = element_line(color = "grey"),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "#ffffef"),
strip.background = element_rect(fill = "#ffbb00", color = "black", size =0.5),
strip.text = element_text(face = "bold", size = 10, color = "black"),
legend.position = "bottom",
legend.justification = "center",
legend.background = element_blank(),
panel.border = element_rect(color = "grey30", fill = NA, size = 0.5)
)
}
theme_set(my_theme())
mycolors=c("#f32440","#ffd700","#ff8c00","#c9e101","#c100e6","#39d3d6",
"#e84412", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#D1E5F0",
"#999999", "#E69F00", "#56B4E9")
Channel KLMN is based in Los Angeles and airs a weekly national political talk show creatively called US Politics This Week. Their only TV commercial advertising US Politics This Week features the Mayor of Los Angeles.
The goal of this analysis is to verify whether a lower fraction of the people who saw the commercial with their local Mayor watched US Politics This Week, compared to the people who saw the commercial with the Mayor of Los Angeles.
The data set comes from Channel KLMN and consists of two files, test_data.csv and viewer_data.csv. The first has 9 features and the second has 4 features.
####################
# Read the DataSet #
####################
setwd("C:/Wecsley/USA/Samba_TV")
test_data <- read.csv("test_data.csv")
viewer_data <- read.csv("viewer_data.csv")
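Note: the str() output further below shows the character columns (tv_make, tv_provider, gender, city) as factors. On R 4.0 and newer, read.csv() no longer converts strings to factors by default, so reproducing those factor columns may require an explicit argument. A minimal sketch, assuming a recent R version:
############################################
# Optional: force factor columns on R >= 4 #
############################################
# Assumption: running on R >= 4.0, where strings are read as character by default.
test_data   <- read.csv("test_data.csv",   stringsAsFactors = TRUE)
viewer_data <- read.csv("viewer_data.csv", stringsAsFactors = TRUE)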
The test_data information.
##############################
# test_data data information #
##############################
head(test_data)
| viewer_id | date | tv_make | tv_size | uhd_capable | tv_provider | total_time_watched | watched | test |
|---|---|---|---|---|---|---|---|---|
| 24726768 | 2018-01-16 | Sony | 70 | 0 | Comcast | 10.75 | 0 | 1 |
| 25001464 | 2018-01-18 | Sony | 32 | 0 | | 2.75 | 0 | 0 |
| 28291998 | 2018-01-18 | Sony | 50 | 1 | Dish Network | 20.00 | 0 | 0 |
| 17057157 | 2018-01-19 | Sony | 32 | 0 | Comcast | 1.50 | 0 | 0 |
| 29504447 | 2018-01-17 | Sony | 32 | 0 | Comcast | 17.50 | 0 | 0 |
| 15529477 | 2018-01-18 | LG | 32 | 0 | Cox | 14.75 | 0 | 1 |
dim(test_data)
## [1] 418026 9
names(test_data)
## [1] "viewer_id" "date" "tv_make"
## [4] "tv_size" "uhd_capable" "tv_provider"
## [7] "total_time_watched" "watched" "test"
The viewer_data information.
################################
# viewer_data data information #
################################
head(viewer_data)
| viewer_id | gender | age | city |
|---|---|---|---|
| 1918165 | Female | 39 | Dallas |
| 27662619 | Female | 28 | New York |
| 5493662 | Female | 53 | Detroit |
| 14441247 | Male | 41 | New York |
| 25595927 | Male | 53 | Seattle |
| 16827252 | Male | 37 | New York |
dim(viewer_data)
## [1] 417464 4
names(viewer_data)
## [1] "viewer_id" "gender" "age" "city"
Attribute information: test_data
viewer_id: the ID of the viewer
date: the date the viewer saw a commercial for “US Politics This Week”
tv_make: the make (i.e., brand) of TV
tv_size: the size of the TV in inches (approximately measured as the diagonal of the screen)
uhd_capable: whether the TV is (1) or is not (0) capable of displaying Ultra-High-Definition television content
tv_provider: the cable or satellite TV provider
total_time_watched: the total amount of TV watched (in hours) on the day in the date column
watched: whether the viewer watched (1) “US Politics This Week” or not (0)
test: viewers are split into test (1) and control (0) groups; test viewers saw the new commercial with their local Mayor while control viewers saw the old commercial with the Mayor of Los Angeles
Attribute information: viewer_data
viewer_id: the ID of the viewer (same ID as in the test_data.csv file)
gender: the viewer’s gender
age: the viewer’s age
city: the viewer’s city
First, we clean and explore the data in order to better understand the variables.
####################
# Features Summary #
####################
Hmisc::describe(test_data)
## test_data
##
## 9 Variables 418026 Observations
## --------------------------------------------------------------------------------
## viewer_id
## n missing distinct Info Mean Gmd .05 .10
## 418026 0 418020 1 18355853 14390488 45765 81659
## .25 .50 .75 .90 .95
## 7526723 18383268 29163528 35670978 37848442
##
## lowest : 10000 10002 10003 10005 10006
## highest: 39999726 39999736 39999781 39999808 39999921
## --------------------------------------------------------------------------------
## date
## n missing distinct
## 418026 0 5
##
## Value 2018-01-15 2018-01-16 2018-01-17 2018-01-18 2018-01-19
## Frequency 86641 82989 83145 82544 82707
## Proportion 0.207 0.199 0.199 0.197 0.198
## --------------------------------------------------------------------------------
## tv_make
## n missing distinct
## 418026 0 4
##
## Value LG Philips Sony Toshiba
## Frequency 41769 41836 271922 62499
## Proportion 0.10 0.10 0.65 0.15
## --------------------------------------------------------------------------------
## tv_size
## n missing distinct Info Mean Gmd
## 418026 0 8 0.984 51.87 13.97
##
## Value 32 40 43 50 55 60 65 70
## Frequency 52286 52388 52190 52086 52254 52240 52305 52277
## Proportion 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## --------------------------------------------------------------------------------
## uhd_capable
## n missing distinct Info Sum Mean Gmd
## 418026 0 2 0.482 83964 0.2009 0.321
##
## --------------------------------------------------------------------------------
## tv_provider
## n missing distinct
## 418026 0 6
##
## Value Comcast Cox
## Frequency 52720 109796 72809
## Proportion 0.126 0.263 0.174
##
## Value DirecTV Dish Network Time Warner Cable
## Frequency 36738 36249 109714
## Proportion 0.088 0.087 0.262
## --------------------------------------------------------------------------------
## total_time_watched
## n missing distinct Info Mean Gmd .05 .10
## 418026 0 95 1 10.04 7.074 1.25 2.00
## .25 .50 .75 .90 .95
## 5.00 9.50 14.50 19.25 21.25
##
## lowest : 0.25 0.50 0.75 1.00 1.25, highest: 22.75 23.00 23.25 23.50 23.75
## --------------------------------------------------------------------------------
## watched
## n missing distinct Info Sum Mean Gmd
## 418026 0 2 0.155 22802 0.05455 0.1031
##
## --------------------------------------------------------------------------------
## test
## n missing distinct Info Sum Mean Gmd
## 418026 0 2 0.75 204327 0.4888 0.4997
##
## --------------------------------------------------------------------------------
#############################
# Graphic for Missing Data #
#############################
missmap(test_data)
####################
# Features Summary #
####################
Hmisc::describe(viewer_data)
## viewer_data
##
## 4 Variables 417464 Observations
## --------------------------------------------------------------------------------
## viewer_id
## n missing distinct Info Mean Gmd .05 .10
## 417464 0 417464 1 18379082 14381963 45723 81560
## .25 .50 .75 .90 .95
## 7571130 18415881 29179002 35677581 37850556
##
## lowest : 10000 10002 10003 10005 10006
## highest: 39999726 39999736 39999781 39999808 39999921
## --------------------------------------------------------------------------------
## gender
## n missing distinct
## 417464 0 2
##
## Value Female Male
## Frequency 209416 208048
## Proportion 0.502 0.498
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 417464 0 55 0.999 40.47 13.62 24 26
## .25 .50 .75 .90 .95
## 31 39 48 57 63
##
## lowest : 18 19 20 21 22, highest: 68 71 72 73 74
## --------------------------------------------------------------------------------
## city
## n missing distinct
## 417464 0 15
##
## Atlanta (24326, 0.058), Boston (24418, 0.058), Chicago (33043, 0.079), Dallas
## (26681, 0.064), Detroit (17931, 0.043), Houston (24391, 0.058), Los Angeles
## (52513, 0.126), Miami (17285, 0.041), Minneapolis (17295, 0.041), New York
## (69893, 0.167), Philadelphia (28756, 0.069), Phoenix (19332, 0.046), San
## Francisco (24300, 0.058), Seattle (18696, 0.045), Tampa (18604, 0.045)
## --------------------------------------------------------------------------------
#############################
# Graphic for Missing Data #
#############################
missmap(viewer_data)
As we can see, there are no NA values in either data set. Note, however, that the tv_provider summary above includes a blank level with about 52,700 entries, which missmap() does not flag as missing. Let's now merge the two data sets by viewer_id.
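A quick sanity check on those blank tv_provider entries, a minimal sketch (missmap() only looks for NA, so empty strings pass through it silently):
##################################
# Check blank tv_provider values #
##################################
# The count should match the blank-level frequency (~52,720) in the summary above.
sum(test_data$tv_provider == "")
mean(test_data$tv_provider == "")   # share of rows with no provider recorded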
######################################
# Join the two Data Set by Viewer ID #
######################################
full_data <- merge(test_data, viewer_data,
by = "viewer_id") %>%
na.omit()
#########################
# Structure of the Data #
#########################
head(full_data)
| viewer_id | date | tv_make | tv_size | uhd_capable | tv_provider | total_time_watched | watched | test | gender | age | city |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 10000 | 2018-01-15 | Sony | 55 | 0 | Time Warner Cable | 5.25 | 0 | 0 | Female | 31 | Los Angeles |
| 10002 | 2018-01-17 | Sony | 60 | 1 | Time Warner Cable | 23.25 | 0 | 0 | Male | 41 | Los Angeles |
| 10003 | 2018-01-19 | LG | 70 | 1 | Cox | 9.25 | 0 | 0 | Female | 29 | Los Angeles |
| 10005 | 2018-01-15 | Sony | 43 | 0 | Comcast | 2.75 | 0 | 0 | Male | 53 | Los Angeles |
| 10006 | 2018-01-16 | LG | 55 | 0 | Time Warner Cable | 20.00 | 1 | 0 | Female | 58 | Los Angeles |
| 10007 | 2018-01-16 | Sony | 70 | 0 | Time Warner Cable | 16.50 | 0 | 0 | Male | 38 | Los Angeles |
####################
# Features Summary #
####################
Hmisc::describe(full_data %>%
select(tv_make, tv_provider, total_time_watched,
gender, age, city))
## full_data %>% select(tv_make, tv_provider, total_time_watched, gender, age, city)
##
## 6 Variables 417470 Observations
## --------------------------------------------------------------------------------
## tv_make
## n missing distinct
## 417470 0 4
##
## Value LG Philips Sony Toshiba
## Frequency 41706 41774 271574 62416
## Proportion 0.100 0.100 0.651 0.150
## --------------------------------------------------------------------------------
## tv_provider
## n missing distinct
## 417470 0 6
##
## Value Comcast Cox
## Frequency 52644 109641 72700
## Proportion 0.126 0.263 0.174
##
## Value DirecTV Dish Network Time Warner Cable
## Frequency 36701 36213 109571
## Proportion 0.088 0.087 0.262
## --------------------------------------------------------------------------------
## total_time_watched
## n missing distinct Info Mean Gmd .05 .10
## 417470 0 95 1 10.04 7.075 1.25 2.00
## .25 .50 .75 .90 .95
## 5.00 9.50 14.50 19.25 21.25
##
## lowest : 0.25 0.50 0.75 1.00 1.25, highest: 22.75 23.00 23.25 23.50 23.75
## --------------------------------------------------------------------------------
## gender
## n missing distinct
## 417470 0 2
##
## Value Female Male
## Frequency 209420 208050
## Proportion 0.502 0.498
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 417470 0 55 0.999 40.47 13.62 24 26
## .25 .50 .75 .90 .95
## 31 39 48 57 63
##
## lowest : 18 19 20 21 22, highest: 68 71 72 73 74
## --------------------------------------------------------------------------------
## city
## n missing distinct
## 417470 0 15
##
## Atlanta (24327, 0.058), Boston (24418, 0.058), Chicago (33045, 0.079), Dallas
## (26681, 0.064), Detroit (17931, 0.043), Houston (24391, 0.058), Los Angeles
## (52513, 0.126), Miami (17285, 0.041), Minneapolis (17296, 0.041), New York
## (69894, 0.167), Philadelphia (28756, 0.069), Phoenix (19332, 0.046), San
## Francisco (24300, 0.058), Seattle (18696, 0.045), Tampa (18605, 0.045)
## --------------------------------------------------------------------------------
Let's convert the date feature: it represents a calendar date, but read.csv imported it as a factor.
####################
# Convert the Date #
####################
full_data$date <- as.Date(full_data$date, "%Y-%m-%d")
str(full_data)
## 'data.frame': 417470 obs. of 12 variables:
## $ viewer_id : int 10000 10002 10003 10005 10006 10007 10012 10014 10015 10016 ...
## $ date : Date, format: "2018-01-15" "2018-01-17" ...
## $ tv_make : Factor w/ 4 levels "LG","Philips",..: 3 3 1 3 1 3 3 3 1 2 ...
## $ tv_size : int 55 60 70 43 55 70 65 40 50 70 ...
## $ uhd_capable : int 0 1 1 0 0 0 0 0 0 0 ...
## $ tv_provider : Factor w/ 6 levels "","Comcast","Cox",..: 6 6 3 2 6 6 2 3 5 1 ...
## $ total_time_watched: num 5.25 23.25 9.25 2.75 20 ...
## $ watched : int 0 0 0 0 1 0 0 0 0 0 ...
## $ test : int 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 1 2 1 2 2 1 1 2 ...
## $ age : int 31 41 29 53 58 38 62 38 42 54 ...
## $ city : Factor w/ 15 levels "Atlanta","Boston",..: 7 7 7 7 7 7 7 7 7 7 ...
Let's reproduce the first result found by Channel KLMN's data scientist: a large fraction of the people who saw the commercial for US Politics This Week and then watched the show are from Los Angeles, while a much lower fraction of viewers from other cities who saw the commercial watched the show.
#####################
# Grouping by City #
#####################
full_data_frac <- full_data %>%
select(watched, city) %>%
group_by(city) %>%
summarise(Total = sum(watched)) %>%
mutate(Proportion = Total/sum(Total)) %>%
arrange(-Total)
full_data_frac
| city | Total | Proportion |
|---|---|---|
| Los Angeles | 5412 | 0.2373476 |
| New York | 3599 | 0.1578370 |
| Chicago | 1672 | 0.0733269 |
| Dallas | 1315 | 0.0576704 |
| Houston | 1268 | 0.0556092 |
| Boston | 1267 | 0.0555653 |
| Atlanta | 1230 | 0.0539426 |
| San Francisco | 1210 | 0.0530655 |
| Phoenix | 1016 | 0.0445575 |
| Tampa | 958 | 0.0420139 |
| Minneapolis | 913 | 0.0400403 |
| Detroit | 899 | 0.0394264 |
| Miami | 881 | 0.0386370 |
| Philadelphia | 675 | 0.0296027 |
| Seattle | 487 | 0.0213578 |
###########
# Graphic #
###########
plot_full_data_frac <- full_data_frac %>%
ggplot(aes(x = city,
y = Proportion,
fill = city)) +
my_theme() +
geom_col(position = "dodge") +
scale_y_continuous(limits = c(0, 0.3))
ggplotly(plot_full_data_frac)
As we can see, Los Angeles accounts for the largest fraction of people who saw the commercial for US Politics This Week and watched the show, at \(23.7\%\); New York is second at \(15.8\%\). In other words, the Los Angeles fraction is about \(50\%\) higher than the New York fraction.
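A quick arithmetic check of that ratio, using the full_data_frac table built above:
##################################
# LA share relative to NY share  #
##################################
# Roughly 1.5, i.e. about 50% higher.
with(full_data_frac,
Proportion[city == "Los Angeles"] / Proportion[city == "New York"])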
Now let's reproduce the negative result found by Channel KLMN's data scientist, i.e., let's investigate whether the commercials with local Mayors are really driving a lower fraction of people to watch the show.
#############################
# Grouping by City and Test #
#############################
full_data_test <- full_data %>%
select(watched, city, test) %>%
group_by(city, test) %>%
summarise(Total_Watched = sum(watched)) %>%
mutate(Proportion_Watched = Total_Watched/sum(Total_Watched)) %>%
arrange(-Total_Watched)
full_data_test
| city | test | Total_Watched | Proportion_Watched |
|---|---|---|---|
| Los Angeles | 0 | 5412 | 1.0000000 |
| New York | 0 | 1808 | 0.5023618 |
| New York | 1 | 1791 | 0.4976382 |
| Chicago | 1 | 878 | 0.5251196 |
| Chicago | 0 | 794 | 0.4748804 |
| Boston | 1 | 662 | 0.5224941 |
| Houston | 1 | 662 | 0.5220820 |
| Dallas | 1 | 660 | 0.5019011 |
| Dallas | 0 | 655 | 0.4980989 |
| Atlanta | 1 | 641 | 0.5211382 |
| San Francisco | 1 | 626 | 0.5173554 |
| Houston | 0 | 606 | 0.4779180 |
| Boston | 0 | 605 | 0.4775059 |
| Atlanta | 0 | 589 | 0.4788618 |
| San Francisco | 0 | 584 | 0.4826446 |
| Philadelphia | 1 | 575 | 0.8518519 |
| Phoenix | 1 | 516 | 0.5078740 |
| Tampa | 1 | 501 | 0.5229645 |
| Phoenix | 0 | 500 | 0.4921260 |
| Detroit | 1 | 489 | 0.5439377 |
| Minneapolis | 1 | 477 | 0.5224535 |
| Miami | 1 | 469 | 0.5323496 |
| Tampa | 0 | 457 | 0.4770355 |
| Minneapolis | 0 | 436 | 0.4775465 |
| Miami | 0 | 412 | 0.4676504 |
| Detroit | 0 | 410 | 0.4560623 |
| Seattle | 1 | 407 | 0.8357290 |
| Philadelphia | 0 | 100 | 0.1481481 |
| Seattle | 0 | 80 | 0.1642710 |
###########
# Graphic #
###########
plot_full_data_test <- full_data_test %>%
ggplot(aes(x = city,
y = Proportion_Watched,
fill = city)) +   # fill aesthetic added so scale_fill_manual() has an effect
my_theme() +
geom_col(position = "dodge") +
scale_fill_manual(values=mycolors)+
facet_grid(test~.)
ggplotly(plot_full_data_test)
As we can see, in almost every city the show watchers are split roughly evenly between those who saw the new commercial with their local Mayor and those who saw the old commercial with the Mayor of Los Angeles. Only two cities (Philadelphia and Seattle) show a sizable imbalance between the two groups.
So these plots indicate that the commercials with local Mayors are not driving a lower fraction of people to watch the show in most cities.
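To put a number on that visual impression, here is a minimal sketch (not part of the original analysis, and assuming tidyr >= 1.0 for pivot_wider) that spreads the table above and computes the test-minus-control gap per city; Los Angeles comes out as NA because it has no test group:
#########################################
# Per-city gap between test and control #
#########################################
full_data_test %>%
ungroup() %>%
select(city, test, Proportion_Watched) %>%
tidyr::pivot_wider(names_from = test,
values_from = Proportion_Watched,
names_prefix = "test_") %>%
mutate(gap = test_1 - test_0) %>%
arrange(desc(abs(gap)))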
Let's group by test (0 or 1) to verify whether the difference between these two groups is statistically significant. Our hypotheses are:
\(H_0:\) the proportion of group 0 equals \(50\%\)
\(H_a:\) the proportion of group 0 is less than \(50\%\)
If \(H_a\) is true, then among the people who watched the show, the fraction who saw the old commercial with the Mayor of Los Angeles is smaller than the fraction who saw the new commercial with their local Mayor. For this test we have to exclude Los Angeles from the data set: for Los Angeles viewers the local Mayor is the Mayor of Los Angeles, so all of them are in the control group and their proportion is trivially equal to 1.
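For reference, prop.test() performs a chi-square test; for a single proportion without the continuity correction its statistic is simply the square of the familiar z statistic,
\[
z = \frac{\hat{p} - p_0}{\sqrt{p_0(1 - p_0)/n}}, \qquad X^2 = z^2, \qquad p_0 = 0.5,
\]
where \(\hat{p}\) is the observed share of group 0 among show watchers and \(n\) is the total number of watchers outside Los Angeles.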
##################################################
# Grouping by City and Test without Los Angeles #
##################################################
full_data_hyp <- subset(full_data, city!="Los Angeles") %>%
select(watched, city, test) %>%
group_by(test) %>%
summarise(Total = sum(watched)) %>%
mutate(Proportion_Watched = Total/sum(Total)) %>%
arrange(-Total)
full_data_hyp
| test | Total | Proportion_Watched |
|---|---|---|
| 1 | 9354 | 0.5378953 |
| 0 | 8036 | 0.4621047 |
###########
# Graphic #
###########
plot_full_data_hyp <- full_data_hyp %>%
ggplot(aes(x = as.factor(test),
y = Proportion_Watched,
fill = as.factor(test))) +
my_theme() +
geom_col(position = "dodge") +
scale_fill_manual(values=mycolors)
ggplotly(plot_full_data_hyp)
##########
# Test Z #
##########
prop.test(full_data_hyp$Total[2], sum(full_data_hyp$Total), p = 0.5,
alternative = "less",
correct = TRUE)
##
## 1-sample proportions test with continuity correction
##
## data: full_data_hyp$Total[2] out of sum(full_data_hyp$Total), null probability 0.5
## X-squared = 99.741, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is less than 0.5
## 95 percent confidence interval:
## 0.0000000 0.4683575
## sample estimates:
## p
## 0.4621047
As we can see, the fraction of show watchers who saw the old commercial with the Mayor of Los Angeles is significantly below \(50\%\). In other words, outside Los Angeles the new commercial with the local Mayor is associated with a higher, not lower, fraction of people watching the show.
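As a cross-check, here is a minimal sketch computing the z statistic by hand from the counts in full_data_hyp (no continuity correction, so the squared value differs slightly from the X-squared of 99.741 reported above):
##################################
# Manual z-statistic cross-check #
##################################
n_watchers <- sum(full_data_hyp$Total)              # 9354 + 8036 = 17390
p_hat      <- full_data_hyp$Total[2] / n_watchers   # control-group share, ~0.462
z          <- (p_hat - 0.5) / sqrt(0.5 * 0.5 / n_watchers)
c(z = z, z_squared = z^2, one_sided_p = pnorm(z))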
City x test
full_data$test <- as.factor(full_data$test)
a1 <- full_data %>% ggplot(aes(x=city,fill=test))+
geom_bar(position="fill",color="black",alpha=0.8,show.legend = T)+
scale_fill_manual(values=mycolors)+
coord_flip()+
ggtitle("city")
a1
Watched x test
a2 <- full_data %>% ggplot(aes(x=watched,fill=test))+
geom_bar(position="fill",color="black",alpha=0.8,show.legend = T)+
scale_fill_manual(values=mycolors)+
coord_flip()+
ggtitle("watched")
a2
uhd_capable x test
a3 <- full_data %>% ggplot(aes(x=uhd_capable,fill=test))+
geom_bar(position="fill",color="black",alpha=0.8,show.legend = T)+
scale_fill_manual(values=mycolors)+
coord_flip()+
ggtitle("uhd_capable")
a3
tv_provider x test
a4 <- full_data %>% ggplot(aes(x=tv_provider,fill=test))+
geom_bar(position="fill",color="black",alpha=0.8,show.legend = T)+
scale_fill_manual(values=mycolors)+
coord_flip()+
ggtitle("tv_provider")
a4
Gender x test
a5 <- full_data %>% ggplot(aes(x=gender,fill=test))+
geom_bar(position="fill",color="black",alpha=0.8,show.legend = T)+
scale_fill_manual(values=mycolors)+
coord_flip()+
ggtitle("gender")
a5
Gender x city
a6 <- full_data %>% ggplot(aes(x=gender,fill=city))+
geom_bar(position="fill",color="black",alpha=0.8,show.legend = T)+
scale_fill_manual(values=mycolors)+
coord_flip()+
ggtitle("city")
a6
total_time_watched x test
full_data %>% gather(total_time_watched,
key="Features",value="Value") %>%
ggplot(aes(x=test,y=Value,fill=test))+
geom_boxplot(alpha=0.8,color="black")+
coord_flip()+facet_wrap(~Features,ncol=1,scales="free")+
scale_fill_manual(values=mycolors)
total_time_watched x city
full_data %>% gather(total_time_watched,
key="Features",value="Value") %>%
ggplot(aes(x=city,y=Value,fill=city))+
geom_boxplot(alpha=0.8,color="black")+
coord_flip()+facet_wrap(~Features,ncol=1,scales="free")+
scale_fill_manual(values=mycolors)
With this pivot table we can explore the distribution of any combination of variables. It is fully interactive: we can drag variables to rows or columns, choose categories, apply filters, and pick which metric or chart to display.
########################
# Import the Libraries #
########################
library(rpivotTable)
library(data.table)
rpivotTable(setDT(full_data),
cols= "age",
rows = "city",
rendererName = "Heatmap")