# Reading data
data <- read.csv("Project DataSet.csv")
#view(head(data,50))
# Keep only rows whose page is one of the listed page types.
# NOTE(review): "NA" below is the literal string; a genuinely missing page
# value (NA) never matches %in%, so such rows are dropped -- confirm intent.
data <- data %>%
  filter(page %in% c("advice", "profiles", "jobs", "NA", "companies", "tags",
                     "coaching", "coaches", "user"))
# Drop article_name and company_name by name (they were positions 6 and 8),
# so the script does not silently drop the wrong columns if the CSV layout
# changes.
data <- data[setdiff(names(data), c("article_name", "company_name"))]
# Missing-value count per column; vapply() guarantees an integer result.
vapply(data, function(x) sum(is.na(x)), integer(1))
## date uv sessions page_views
## 0 0 0 0
## page channel job_views job_apply_clicks
## 0 0 0 0
## job_apply_success
## 0
# Per-column summary of the filtered data (quantiles for numeric columns,
# most frequent levels for factor columns).
summary(data)
## date uv sessions
## 7/31/19: 9517 Min. : 1.00 Min. : 1.00
## 6/4/19 : 9331 1st Qu.: 1.00 1st Qu.: 1.00
## 7/23/19: 9209 Median : 3.00 Median : 3.00
## 6/3/19 : 9069 Mean : 22.73 Mean : 23.72
## 7/24/19: 9059 3rd Qu.: 8.00 3rd Qu.: 9.00
## 7/30/19: 8982 Max. :25103.00 Max. :27557.00
## (Other):418314
## page_views page channel
## Min. : 1.00 advice :403435 SEO :243420
## 1st Qu.: 2.00 jobs : 37439 Direct :124193
## Median : 6.00 profiles : 27221 Referral : 35868
## Mean : 42.42 companies: 3010 Email : 17235
## 3rd Qu.: 17.00 tags : 840 Linkedin Organic: 14270
## Max. :278491.00 coaching : 715 Brandblock : 12851
## (Other) : 821 (Other) : 25644
## job_views job_apply_clicks job_apply_success
## Min. : 0.0000 Min. : 0.00000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.00000 Median : 0.0000
## Mean : 0.2693 Mean : 0.03869 Mean : 0.0277
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :69.0000 Max. :25.00000 Max. :47.0000
##
# Inspect each column's type and its first few values.
str(data)
## 'data.frame': 473481 obs. of 9 variables:
## $ date : Factor w/ 61 levels "6/1/19","6/10/19",..: 59 49 48 48 28 9 14 55 10 28 ...
## $ uv : int 9733 955 56 381 82 413 747 65 25103 37 ...
## $ sessions : int 10799 973 56 402 82 417 760 65 27557 37 ...
## $ page_views : int 28660 1017 63 475 94 497 820 102 70215 51 ...
## $ page : Factor w/ 225 levels "...","...how-to-get-over-vacation-guilt-and-actually-enjoy-your-...",..: 37 37 37 37 37 37 37 37 37 37 ...
## $ channel : Factor w/ 16 levels "Brandblock","Direct",..: 13 13 13 13 13 13 13 13 13 13 ...
## $ job_views : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_apply_clicks : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_apply_success: int 0 0 0 0 0 0 0 0 0 0 ...
# Coerce the page column to a factor.
# NOTE(review): capFirst() is applied to data$page a few lines further down and
# returns a character vector, so this factor conversion does not persist.
data[["page"]] <- as.factor(data[["page"]])
#' Capitalise the first character of each string.
#'
#' @param s A character vector; factors and other atomic vectors are coerced
#'   with as.character().
#' @return A character vector the same length as `s` in which only the first
#'   character of each element is upper-cased. Missing values stay NA (the
#'   previous paste()-based version turned them into the text "NANA").
capFirst <- function(s) {
  s <- as.character(s)
  out <- paste0(toupper(substring(s, 1, 1)), substring(s, 2))
  # paste0() renders NA as the string "NA", so restore real missing values.
  out[is.na(s)] <- NA_character_
  out
}
# Upper-case the first letter of every page and channel label.
data[c("page", "channel")] <- lapply(data[c("page", "channel")], capFirst)
# Structure of data after relabelling: page and channel are now character
# vectors because capFirst() returns character.
str(data)
## 'data.frame': 473481 obs. of 9 variables:
## $ date : Factor w/ 61 levels "6/1/19","6/10/19",..: 59 49 48 48 28 9 14 55 10 28 ...
## $ uv : int 9733 955 56 381 82 413 747 65 25103 37 ...
## $ sessions : int 10799 973 56 402 82 417 760 65 27557 37 ...
## $ page_views : int 28660 1017 63 475 94 497 820 102 70215 51 ...
## $ page : chr "Advice" "Advice" "Advice" "Advice" ...
## $ channel : chr "SEO" "SEO" "SEO" "SEO" ...
## $ job_views : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_apply_clicks : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_apply_success: int 0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary after relabelling; character columns report only their
# length and class.
summary(data)
## date uv sessions
## 7/31/19: 9517 Min. : 1.00 Min. : 1.00
## 6/4/19 : 9331 1st Qu.: 1.00 1st Qu.: 1.00
## 7/23/19: 9209 Median : 3.00 Median : 3.00
## 6/3/19 : 9069 Mean : 22.73 Mean : 23.72
## 7/24/19: 9059 3rd Qu.: 8.00 3rd Qu.: 9.00
## 7/30/19: 8982 Max. :25103.00 Max. :27557.00
## (Other):418314
## page_views page channel
## Min. : 1.00 Length:473481 Length:473481
## 1st Qu.: 2.00 Class :character Class :character
## Median : 6.00 Mode :character Mode :character
## Mean : 42.42
## 3rd Qu.: 17.00
## Max. :278491.00
##
## job_views job_apply_clicks job_apply_success
## Min. : 0.0000 Min. : 0.00000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.00000 Median : 0.0000
## Mean : 0.2693 Mean : 0.03869 Mean : 0.0277
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :69.0000 Max. :25.00000 Max. :47.0000
##
# Compact preview: one line per column with its type and leading values.
glimpse(data)
## Observations: 473,481
## Variables: 9
## $ date <fct> 7/7/19, 7/26/19, 7/25/19, 7/25/19, 6/7/19, 6/1…
## $ uv <int> 9733, 955, 56, 381, 82, 413, 747, 65, 25103, 3…
## $ sessions <int> 10799, 973, 56, 402, 82, 417, 760, 65, 27557, …
## $ page_views <int> 28660, 1017, 63, 475, 94, 497, 820, 102, 70215…
## $ page <chr> "Advice", "Advice", "Advice", "Advice", "Advic…
## $ channel <chr> "SEO", "SEO", "SEO", "SEO", "SEO", "SEO", "SEO…
## $ job_views <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ job_apply_clicks <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ job_apply_success <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Parse the month/day/two-digit-year labels (e.g. "7/31/19") into Date objects.
# as.character() comes first so that a factor column is parsed from its labels.
data[["date"]] <- as.Date(as.character(data[["date"]]), format = "%m/%d/%y")
#data$date = as.Date(data$date)
head(data[["date"]])
## [1] "2019-07-07" "2019-07-26" "2019-07-25" "2019-07-25" "2019-06-07"
## [6] "2019-06-17"
# The company_name column is left as a factor for efficiency. It can be converted to character as needed.
# The columns uv, sessions, page_views, job_views, job_apply_clicks and job_apply_success are already of type "int". If we wanted to convert all of them to numeric at once, we could follow the steps below:
# Number of missing values in each column of the cleaned data.
vapply(data, function(col) sum(is.na(col)), integer(1))
## date uv sessions page_views
## 0 0 0 0
## page channel job_views job_apply_clicks
## 0 0 0 0
## job_apply_success
## 0
# Sessions by Page ----
# Total sessions per page type, bars ordered from largest to smallest.
options(scipen = 999)  # show large numbers in full rather than scientific notation
data %>%
  group_by(page) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(reorder(page, -sessions), sessions, fill = page)) +
  geom_col(width = 0.3) +
  # vjust is a fixed setting, not a data mapping, so it sits outside aes()
  geom_text(aes(label = format(sessions, big.mark = ",")), vjust = 0) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Sessions by Page",
    x = "Page",
    y = "Sessions",
    fill = "Page"
  ) +
  theme_economist() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_economist()
# Sessions by Channel ----
# Total sessions per channel as horizontal bars, largest at the top.
options(scipen = 999)  # show large numbers in full rather than scientific notation
data %>%
  group_by(channel) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(reorder(channel, sessions), sessions, fill = channel)) +
  geom_col(width = 0.3) +
  # hjust is a fixed setting, not a data mapping, so it sits outside aes()
  geom_text(aes(label = format(sessions, big.mark = ",")), hjust = 0.35) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Sessions by Channel",
    x = "Channel",
    y = "Sessions",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete() +
  coord_flip()
# Sessions by day per channel ----
options(scipen = 999)  # show large numbers in full rather than scientific notation
head(data)
# Daily session totals per channel: one facet per channel (free axes) with a
# linear trend line. date is already a Date, so no as.Date() is needed.
data %>%
  group_by(date, channel) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(x = date, y = sessions, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # The legend belongs to the colour aesthetic, so title it via `colour`
  # (labs(fill = ...) had no effect because nothing is mapped to fill).
  labs(
    title = "Sessions by Day",
    x = "Date",
    y = "Sessions",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# Sessions by day ----
# Total sessions per calendar day with a fitted linear trend. The unused fill
# scale and the redundant as.Date() (date is already a Date) were removed.
data %>%
  group_by(date) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(x = date, y = sessions)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Sessions by Day",
    x = "Date",
    y = "Sessions"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# Unique visitors by Channel ----
# Total unique visitors per channel as horizontal bars, legend hidden.
options(scipen = 999)  # show large numbers in full rather than scientific notation
data %>%
  group_by(channel) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(reorder(channel, UV), UV, fill = channel)) +
  geom_col(width = 0.3) +
  # vjust is a fixed setting, not a data mapping, so it sits outside aes()
  geom_text(aes(label = format(UV, big.mark = ",")), vjust = 0) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Unique Visitors by Channel",
    x = "Channel",
    y = "Unique Visitors",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(
    plot.title = element_text(hjust = 0.5),
    # the documented value is lower-case "none"; "None" is not a valid position
    legend.position = "none"
  ) +
  scale_fill_discrete() +
  coord_flip()
# Unique visitors by Page ----
# Total unique visitors per page type, bars ordered from largest to smallest.
options(scipen = 999)  # show large numbers in full rather than scientific notation
data %>%
  group_by(page) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(reorder(page, -UV), UV, fill = page)) +
  geom_col(width = 0.3) +
  # vjust/hjust are fixed settings, not data mappings, so they sit outside aes()
  geom_text(aes(label = format(UV, big.mark = ",")), vjust = 0, hjust = -0.00001) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Unique Visitors by Page",
    x = "Page",
    y = "Unique Visitors",
    fill = "Page"
  ) +
  theme_economist() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_economist()
# Unique visitors by day per channel ----
options(scipen = 999)  # show large numbers in full rather than scientific notation
# Daily unique-visitor totals per channel: one facet per channel (free axes)
# with a linear trend line. date is already a Date, so no as.Date() is needed.
data %>%
  group_by(date, channel) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(x = date, y = UV, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # The legend belongs to the colour aesthetic, so title it via `colour`
  # (labs(fill = ...) had no effect because nothing is mapped to fill).
  labs(
    title = "Unique Visitors by Day",
    x = "Date",
    y = "Unique Visitors",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# Unique visitors by day ----
# Total unique visitors per calendar day with a fitted linear trend. The unused
# fill scale and the redundant as.Date() (date is already a Date) were removed.
data %>%
  group_by(date) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(x = date, y = UV)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Unique Visitors by Day",
    x = "Date",
    y = "Unique Visitors"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# Page Views by Channel ----
# Total page views per channel as horizontal bars, largest at the top.
options(scipen = 999)  # show large numbers in full rather than scientific notation
data %>%
  group_by(channel) %>%
  summarise(page_views = sum(page_views)) %>%
  ggplot(aes(reorder(channel, page_views), page_views, fill = channel)) +
  geom_col(width = 0.3) +
  # hjust is a fixed setting, not a data mapping, so it sits outside aes()
  geom_text(aes(label = format(page_views, big.mark = ",")), hjust = 0.43) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Page Views by Channel",
    x = "Channel",
    y = "Page Views",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete() +
  coord_flip()
# Page Views by day ----
options(scipen = 999)  # show large numbers in full rather than scientific notation
# Total page views per calendar day with a fitted linear trend. The unused fill
# scale and the redundant as.Date() (date is already a Date) were removed.
data %>%
  group_by(date) %>%
  summarise(page_views = sum(page_views)) %>%
  ggplot(aes(x = date, y = page_views)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Page Views by Day",
    x = "Date",
    y = "Page Views"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# Page Views by day per channel ----
# Daily page-view totals per channel: one facet per channel (free axes) with a
# linear trend line. date is already a Date, so no as.Date() is needed.
data %>%
  group_by(date, channel) %>%
  summarise(page_views = sum(page_views)) %>%
  ggplot(aes(x = date, y = page_views, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # The legend belongs to the colour aesthetic, so title it via `colour`
  # (labs(fill = ...) had no effect because nothing is mapped to fill).
  labs(
    title = "Page Views by Day",
    x = "Date",
    y = "Page Views",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# Job Views by Channel ----
# Total job views per channel as horizontal bars, largest at the top.
options(scipen = 999)  # show large numbers in full rather than scientific notation
data %>%
  group_by(channel) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(aes(reorder(channel, job_views), job_views, fill = channel)) +
  geom_col(width = 0.3) +
  # hjust is a fixed setting, not a data mapping, so it sits outside aes()
  geom_text(aes(label = format(job_views, big.mark = ",")), hjust = 0.43) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Job Views by Channel",
    x = "Channel",
    y = "Job Views",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete() +
  coord_flip()
# Job Views by Page ----
# Total job views per page type, bars ordered from largest to smallest.
data %>%
  group_by(page) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(aes(reorder(page, -job_views), job_views, fill = page)) +
  geom_col(width = 0.3) +
  # vjust is a fixed setting, not a data mapping, so it sits outside aes()
  geom_text(aes(label = format(job_views, big.mark = ",")), vjust = 0) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Job Views by Page",
    x = "Page",
    y = "Job Views",
    fill = "Page"
  ) +
  theme_economist() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_economist()
# Job Views by day ----
options(scipen = 999)  # show large numbers in full rather than scientific notation
# Total job views per calendar day with a fitted linear trend. The unused fill
# scale and the redundant as.Date() (date is already a Date) were removed.
data %>%
  group_by(date) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(aes(x = date, y = job_views)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Job Views by Day",
    x = "Date",
    y = "Job Views"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# Job Views by day per channel ----
# Daily job-view totals per channel: one facet per channel (free axes) with a
# linear trend line. date is already a Date, so no as.Date() is needed.
data %>%
  group_by(date, channel) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(aes(x = date, y = job_views, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # The legend belongs to the colour aesthetic, so title it via `colour`
  # (labs(fill = ...) had no effect because nothing is mapped to fill).
  labs(
    title = "Job Views by Day",
    x = "Date",
    y = "Job Views",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 30)
  )
## `geom_smooth()` using formula 'y ~ x'
# proj_data <- read_csv("Project DataSet.csv")
# #View(proj_data)
# The scatter and box plots that follow read from this copy of the cleaned data.
proj_dat_cleaned <- data
# Reusable theme tweak: no grid lines or panel background, black axis lines,
# white legend keys and 15-point text.
cleanup <- theme(
  text = element_text(size = 15),
  legend.key = element_rect(fill = "white"),
  axis.line.x = element_line(color = "black"),
  axis.line.y = element_line(color = "black"),
  panel.background = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank()
)
options(scipen = 999)  # show large numbers in full rather than scientific notation
# NOTE(review): par(mfrow) configures base-graphics layouts; confirm whether it
# is expected to have any effect on the ggplot2 figures drawn below.
par(mfrow = c(2, 2))
# Sessions against unique visitors, with a linear fit and 95% confidence band.
# alpha and size are fixed settings, so they are passed outside aes(); mapping
# them inside aes() created one-value scales (and legends that then had to be
# switched off) instead of applying the literal 0.2 / 2.
ggplot(proj_dat_cleaned, aes(uv, sessions)) +
  geom_point(alpha = 0.2, size = 2) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Scatterplot of Unique visitors and Session",
    x = "Unique Visitors",
    y = "Sessions"
  ) +
  theme_economist() +
  scale_colour_economist() +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Page views against unique visitors, points coloured by page type, with one
# linear fit per page type (the fill mapping on the smoother groups by page).
# alpha and size are fixed settings, so they are passed outside aes().
ggplot(proj_dat_cleaned, aes(uv, page_views)) +
  geom_point(aes(colour = page), alpha = 0.2, size = 2) +
  stat_smooth(aes(fill = page), method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Scatterplot of Unique Visitor and page views by Page ",
    x = "Unique Visitors",
    y = "Page Views"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Page views against unique visitors, points coloured by channel, with a single
# overall linear fit. alpha and size are fixed settings, so they are passed
# outside aes().
ggplot(proj_dat_cleaned, aes(uv, page_views)) +
  geom_point(aes(colour = channel), alpha = 0.2, size = 2) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Unique Visitors & page views by channel ",
    x = "Unique Visitors",
    y = "Page Views"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Job views against unique visitors with a linear fit.
# Adding channel/page is not useful
# alpha and size are fixed settings, so they are passed outside aes().
ggplot(proj_dat_cleaned, aes(uv, job_views)) +
  geom_point(alpha = 0.2, size = 2) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Scatterplot of Unique visitors and Job views",
    x = "Unique Visitors",
    y = "Job Views"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Job apply clicks against job views, points coloured by channel, with a linear fit.
# Columns are referenced by bare name inside aes(); the former
# proj_dat_cleaned$... form triggered ggplot2's "Use of `df$x` is discouraged"
# warnings. The guides() call was dropped: no alpha/size aesthetic is mapped.
ggplot(proj_dat_cleaned, aes(job_views, job_apply_clicks)) +
  geom_point(aes(colour = channel)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job views and Job apply click by Channel",
    x = "Job Views",
    y = "Job apply Click"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Job apply clicks against job views, points coloured by page type, with a linear fit.
# Columns are referenced by bare name inside aes(); the former
# proj_dat_cleaned$... form triggered ggplot2's "Use of `df$x` is discouraged"
# warnings. The guides() call was dropped: no alpha/size aesthetic is mapped.
ggplot(proj_dat_cleaned, aes(job_views, job_apply_clicks)) +
  geom_point(aes(colour = page)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job views and Job apply click by Page",
    x = "Job Views",
    y = "Job apply Click"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Job apply success against job apply clicks, points coloured by channel, with a linear fit.
# Columns are referenced by bare name inside aes(); the former
# proj_dat_cleaned$... form triggered ggplot2's "Use of `df$x` is discouraged"
# warnings. The guides() call was dropped: no alpha/size aesthetic is mapped.
ggplot(proj_dat_cleaned, aes(job_apply_clicks, job_apply_success)) +
  geom_point(aes(colour = channel)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job apply click & Job Apply success ",
    x = "Job Apply Click",
    y = "Job Apply Success"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Job apply success against job apply clicks, points coloured by page type, with a linear fit.
# Columns are referenced by bare name inside aes(); the former
# proj_dat_cleaned$... form triggered ggplot2's "Use of `df$x` is discouraged"
# warnings. guides() and scale_fill_discrete() were dropped: no alpha, size or
# fill aesthetic is mapped in this plot.
ggplot(proj_dat_cleaned, aes(job_apply_clicks, job_apply_success)) +
  geom_point(aes(colour = page)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job Apply clicks & Job apply Success by Page",
    x = "Job Apply click",
    y = "Job Apply Success"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'
# Box plot of job_apply_success against job_apply_clicks, filled by page type.
# Bare column names replace proj_dat_cleaned$..., which ggplot2 warns against
# and which also leaked the data-frame name into the axis and legend titles.
ggplot(proj_dat_cleaned, aes(job_apply_clicks, job_apply_success, fill = page)) +
  geom_boxplot()
#boxplot(proj_dat_cleaned$job_apply_clicks ~ proj_dat_cleaned$company_name, proj_dat_cleaned, xlab = "Company", ylab = "Job Apply Click")
# Distribution of job views for each page type; outliers are drawn as large
# blue points. Bare column names replace proj_dat_cleaned$..., which ggplot2
# warns against.
ggplot(proj_dat_cleaned, aes(x = page, y = job_views)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 4)
# Restrict the correlation and regression work to these six numeric measures.
variables <- c("uv", "sessions", "page_views",
               "job_views", "job_apply_clicks", "job_apply_success")
data2 <- data[, variables]
summary(data2)
## uv sessions page_views
## Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.: 2.00
## Median : 3.00 Median : 3.00 Median : 6.00
## Mean : 22.73 Mean : 23.72 Mean : 42.42
## 3rd Qu.: 8.00 3rd Qu.: 9.00 3rd Qu.: 17.00
## Max. :25103.00 Max. :27557.00 Max. :278491.00
## job_views job_apply_clicks job_apply_success
## Min. : 0.0000 Min. : 0.00000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.00000 Median : 0.0000
## Mean : 0.2693 Mean : 0.03869 Mean : 0.0277
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :69.0000 Max. :25.00000 Max. :47.0000
# Summary of the full cleaned data frame; date is a Date by now, so it is
# summarised as a range of days.
summary(data)
## date uv sessions
## Min. :2019-06-01 Min. : 1.00 Min. : 1.00
## 1st Qu.:2019-06-16 1st Qu.: 1.00 1st Qu.: 1.00
## Median :2019-07-01 Median : 3.00 Median : 3.00
## Mean :2019-07-01 Mean : 22.73 Mean : 23.72
## 3rd Qu.:2019-07-17 3rd Qu.: 8.00 3rd Qu.: 9.00
## Max. :2019-07-31 Max. :25103.00 Max. :27557.00
## page_views page channel
## Min. : 1.00 Length:473481 Length:473481
## 1st Qu.: 2.00 Class :character Class :character
## Median : 6.00 Mode :character Mode :character
## Mean : 42.42
## 3rd Qu.: 17.00
## Max. :278491.00
## job_views job_apply_clicks job_apply_success
## Min. : 0.0000 Min. : 0.00000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.00000 Median : 0.0000
## Mean : 0.2693 Mean : 0.03869 Mean : 0.0277
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :69.0000 Max. :25.00000 Max. :47.0000
# Pairwise Pearson correlations between the six measures, rounded to two decimals
data2 %>%
  cor(use = "pairwise.complete.obs", method = "pearson") %>%
  round(2)
## uv sessions page_views job_views job_apply_clicks
## uv 1.00 1.00 0.70 -0.02 -0.01
## sessions 1.00 1.00 0.70 -0.02 -0.01
## page_views 0.70 0.70 1.00 -0.01 0.00
## job_views -0.02 -0.02 -0.01 1.00 0.30
## job_apply_clicks -0.01 -0.01 0.00 0.30 1.00
## job_apply_success -0.01 -0.01 0.00 0.04 0.11
## job_apply_success
## uv -0.01
## sessions -0.01
## page_views 0.00
## job_views 0.04
## job_apply_clicks 0.11
## job_apply_success 1.00
# Pearson correlation test: job_views vs. unique visitors. In the recorded run
# r is about -0.02 -- significant at this sample size, negligible in magnitude.
cor.test(data2$job_views,data2$uv, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data2$job_views and data2$uv
## t = -16.16, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02632471 -0.02063110
## sample estimates:
## cor
## -0.02347809
# Pearson correlation test: job_views vs. sessions (recorded r is about -0.02).
cor.test(data2$job_views,data2$sessions, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data2$job_views and data2$sessions
## t = -15.831, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02584677 -0.02015304
## sample estimates:
## cor
## -0.02300009
# Pearson correlation test: job_views vs. page_views (recorded r is about -0.007).
cor.test(data2$job_views,data2$page_views, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data2$job_views and data2$page_views
## t = -5.0372, df = 473479, p-value = 0.0000004726
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.010168415 -0.004471972
## sample estimates:
## cor
## -0.007320253
# Pearson correlation test: job_views vs. job_apply_clicks (recorded r is about
# 0.30, the strongest association with job_views among the measures tested).
cor.test(data2$job_views,data2$job_apply_clicks, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data2$job_views and data2$job_apply_clicks
## t = 216.71, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2978043 0.3029870
## sample estimates:
## cor
## 0.3003979
# Pearson correlation test: job_views vs. job_apply_success (recorded r is about 0.04).
cor.test(data2$job_views,data2$job_apply_success, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data2$job_views and data2$job_apply_success
## t = 29.396, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03983768 0.04552405
## sample estimates:
## cor
## 0.04268121
# Create a correlation plot for the dataset
library(corrplot)
## corrplot 0.84 loaded
# cordata is a copy of data2 that exists only as input to corrplot
cordata <- data2
#colnames(cordata) = c("job_views", "uv", "sessions", "page_views", "job_apply_clicks", "job_apply_success")
corrplot(corr = cor(cordata), method = "circle")
# Model 1: job_views regressed on unique visitors only
Model1 <- lm(formula = job_views ~ uv, data = data)
summary(Model1)
##
## Call:
## lm(formula = job_views ~ uv, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.271 -0.271 -0.271 -0.268 68.729
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.270856335 0.001219002 222.19 <0.0000000000000002 ***
## uv -0.000069067 0.000004274 -16.16 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8361 on 473479 degrees of freedom
## Multiple R-squared: 0.0005512, Adjusted R-squared: 0.0005491
## F-statistic: 261.1 on 1 and 473479 DF, p-value: < 0.00000000000000022
# Adjusted R-squared of Model 1, kept for the model comparison later on
summary(Model1)[["adj.r.squared"]]
## [1] 0.00054911
# Model 3: adds page_views to the unique-visitors predictor
Model3 <- lm(formula = job_views ~ uv + page_views, data = data)
summary(Model3)
##
## Call:
## lm(formula = job_views ~ uv + page_views, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.569 -0.271 -0.271 -0.268 68.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.271065513 0.001219136 222.34 <0.0000000000000002 ***
## uv -0.000105903 0.000005986 -17.69 <0.0000000000000002 ***
## page_views 0.000014806 0.000001684 8.79 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8361 on 473478 degrees of freedom
## Multiple R-squared: 0.0007143, Adjusted R-squared: 0.0007101
## F-statistic: 169.2 on 2 and 473478 DF, p-value: < 0.00000000000000022
# Adjusted R-squared of Model 3
summary(Model3)[["adj.r.squared"]]
## [1] 0.0007100619
# Model 4: adds job_apply_clicks to uv and page_views
Model4 <- lm(formula = job_views ~ uv + page_views + job_apply_clicks, data = data)
summary(Model4)
##
## Call:
## lm(formula = job_views ~ uv + page_views + job_apply_clicks,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.607 -0.235 -0.234 -0.232 68.764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.234659330 0.001174985 199.713 <0.0000000000000002 ***
## uv -0.000092970 0.000005710 -16.283 <0.0000000000000002 ***
## page_views 0.000013418 0.000001607 8.351 <0.0000000000000002 ***
## job_apply_clicks 0.934894531 0.004316780 216.572 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7975 on 473477 degrees of freedom
## Multiple R-squared: 0.09078, Adjusted R-squared: 0.09078
## F-statistic: 1.576e+04 on 3 and 473477 DF, p-value: < 0.00000000000000022
# Adjusted R-squared of Model 4
summary(Model4)[["adj.r.squared"]]
## [1] 0.09077727
# Model 5: adds job_apply_success to the Model 4 predictors
Model5 <- lm(
  formula = job_views ~ uv + page_views + job_apply_clicks + job_apply_success,
  data = data
)
summary(Model5)
##
## Call:
## lm(formula = job_views ~ uv + page_views + job_apply_clicks +
## job_apply_success, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.528 -0.234 -0.234 -0.231 68.765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.233959164 0.001179894 198.288 < 0.0000000000000002
## uv -0.000092751 0.000005710 -16.245 < 0.0000000000000002
## page_views 0.000013398 0.000001607 8.339 < 0.0000000000000002
## job_apply_clicks 0.931748250 0.004343826 214.499 < 0.0000000000000002
## job_apply_success 0.029520116 0.004556603 6.479 0.0000000000927
##
## (Intercept) ***
## uv ***
## page_views ***
## job_apply_clicks ***
## job_apply_success ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7975 on 473476 degrees of freedom
## Multiple R-squared: 0.09086, Adjusted R-squared: 0.09086
## F-statistic: 1.183e+04 on 4 and 473476 DF, p-value: < 0.00000000000000022
# Adjusted R-squared of Model 5
summary(Model5)[["adj.r.squared"]]
## [1] 0.09085594
# To test for multicollinearity: print the model summary together with the
# correlation matrix of the coefficient estimates. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
summary(Model5, correlation = TRUE)
##
## Call:
## lm(formula = job_views ~ uv + page_views + job_apply_clicks +
## job_apply_success, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.528 -0.234 -0.234 -0.231 68.765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.233959164 0.001179894 198.288 < 0.0000000000000002
## uv -0.000092751 0.000005710 -16.245 < 0.0000000000000002
## page_views 0.000013398 0.000001607 8.339 < 0.0000000000000002
## job_apply_clicks 0.931748250 0.004343826 214.499 < 0.0000000000000002
## job_apply_success 0.029520116 0.004556603 6.479 0.0000000000927
##
## (Intercept) ***
## uv ***
## page_views ***
## job_apply_clicks ***
## job_apply_success ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7975 on 473476 degrees of freedom
## Multiple R-squared: 0.09086, Adjusted R-squared: 0.09086
## F-statistic: 1.183e+04 on 4 and 473476 DF, p-value: < 0.00000000000000022
##
## Correlation of Coefficients:
## (Intercept) uv page_views job_apply_clicks
## uv -0.07
## page_views 0.02 -0.70
## job_apply_clicks -0.13 0.01 0.00
## job_apply_success -0.09 0.01 0.00 -0.11
# Pair-wise correlation between coefficients is one indicator of multicollinearity. The coefficient correlations above show that the variables are not highly correlated, as no value is close to 1 in magnitude (the largest is -0.70, between uv and page_views).
# Multicollinearity is also suspected when R-squared is very high (i.e. more than 0.90) while the individual coefficients are not significant according to their p-values. In our model R-squared is 0.09, the F-statistic (1.183e+04) is significant and its p-value is less than 0.05. The individual p-values of the variables are significant as well. Hence there is no sign of multicollinearity.
# Comparing the models
anova(Model1, Model3, Model4, Model5)
# Looking at the adjusted R-squared, Model4 provides the main increase (from about 0.0007 for Model3 to about 0.0908) and the improvement is significant, whereas Model5 adds only about 0.0001. Model 4 is the best model.
library(readr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ tibble 2.1.3 ✓ purrr 0.3.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ tibble 2.1.3 ✓ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x Hmisc::src() masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Re-read the raw CSV with readr::read_csv(); the recorded column specification
# shows all 11 original columns, including article_name and company_name.
proj_data <- read_csv("Project DataSet.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## uv = col_double(),
## sessions = col_double(),
## page_views = col_double(),
## page = col_character(),
## article_name = col_character(),
## channel = col_character(),
## company_name = col_character(),
## job_views = col_double(),
## job_apply_clicks = col_double(),
## job_apply_success = col_double()
## )
#View(proj_data)
### /* ********************** Start of pre-analysis ******************************* */
proj_data %>% count(gsub('[%]*[0-9]*',"",page),sort = TRUE)
proj_data %>% count(channel,sort = TRUE)
proj_data %>% count(company_name,sort = TRUE)
proj_data %>% count(job_views,sort = TRUE)
proj_data %>% count(job_apply_clicks,sort = TRUE)
proj_data %>% count(job_apply_success,sort = TRUE)
channl_job_suc <- proj_data %>% count(channel,job_apply_success,sort = TRUE)
#SEO job_apply_success
channl_job_suc[channl_job_suc$n < 12,]
# Pairwise Pearson correlations (coefficients, n and p-values) between all
# numeric columns of the raw data.
proj_data %>%
  select_if(is.numeric) %>%
  as.matrix() %>%
  rcorr(type = "pearson")
## uv sessions page_views job_views job_apply_clicks
## uv 1.00 1.00 0.67 -0.01 -0.01
## sessions 1.00 1.00 0.67 -0.01 -0.01
## page_views 0.67 0.67 1.00 0.00 0.00
## job_views -0.01 -0.01 0.00 1.00 0.10
## job_apply_clicks -0.01 -0.01 0.00 0.10 1.00
## job_apply_success -0.01 -0.01 0.00 0.01 0.12
## job_apply_success
## uv -0.01
## sessions -0.01
## page_views 0.00
## job_views 0.01
## job_apply_clicks 0.12
## job_apply_success 1.00
##
## n= 500000
##
##
## P
## uv sessions page_views job_views job_apply_clicks
## uv 0.0000 0.0000 0.0000 0.0000
## sessions 0.0000 0.0000 0.0000 0.0000
## page_views 0.0000 0.0000 0.6467 0.0006
## job_views 0.0000 0.0000 0.6467 0.0000
## job_apply_clicks 0.0000 0.0000 0.0006 0.0000
## job_apply_success 0.0000 0.0000 0.0055 0.0000 0.0000
## job_apply_success
## uv 0.0000
## sessions 0.0000
## page_views 0.0055
## job_views 0.0000
## job_apply_clicks 0.0000
## job_apply_success
#boxplot(proj_data)
#cor(proj_data[sapply(proj_data,is.numeric)])
#is.numeric(proj_data[])
# Number of missing values in each column of the raw data.
vapply(proj_data, function(column) sum(is.na(column)), integer(1))
## date uv sessions page_views
## 0 0 0 0
## page article_name channel company_name
## 17810 98111 0 387475
## job_views job_apply_clicks job_apply_success
## 0 0 0
#head(proj_data[is.na(proj_data$page),])
#proj_data$page[is.na(proj_data$page),]
####################### Cleaned dataset ####################
# Page types retained for the analysis.
# NOTE(review): the missing-value counts above report real NA values in `page`,
# and `%in%` does not match those against the string "NA" - confirm whether the
# rows with a missing page were meant to be kept.
page_type <- c("advice", "profiles", "jobs", "NA", "companies", "tags",
               "coaching", "coaches", "user")
# Drop columns 6 and 8 (article_name and company_name in the parsed column
# specification), then keep only the rows whose page is one of the listed types.
proj_dat_cleaned <- proj_data[-c(6, 8)] %>%
  filter(page %in% page_type)
#proj_dat_cleaned$weekday_page <- paste(weekdays(proj_dat_cleaned$date, abbreviate = TRUE),"-",proj_dat_cleaned$page)
#proj_dat_cleaned$weekday_channel <- paste(weekdays(proj_dat_cleaned$date, abbreviate = TRUE),"-",proj_dat_cleaned$channel)
#proj_dat_cln_grp -> proj_dat_cleaned %>% group_by(weekday_page)
#boxplot(proj_dat_cleaned[2:4])
### /* ********************** End of pre-analysis ******************************* */
# /* *********************** Start of k-means clustering (By Page and Channel on unique visitors and job views) ******************* */
# Treat outliers in unique visitors (uv) before aggregating and scaling.
# Values below Q1 - 1.5 * IQR are replaced by the 5th percentile and values
# above Q3 + 1.5 * IQR are replaced by the 95th percentile.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned elsewhere in a session.
qnt <- quantile(proj_dat_cleaned$uv, probs = c(0.25, 0.75), na.rm = TRUE)
caps <- quantile(proj_dat_cleaned$uv, probs = c(0.05, 0.95), na.rm = TRUE)
H <- 1.5 * IQR(proj_dat_cleaned$uv, na.rm = TRUE)
# which() drops NA comparisons so that only genuinely out-of-range rows are
# touched; `[[` strips the percentile name from the replacement value.
proj_dat_cleaned$uv[which(proj_dat_cleaned$uv < (qnt[[1]] - H))] <- caps[[1]]
proj_dat_cleaned$uv[which(proj_dat_cleaned$uv > (qnt[[2]] + H))] <- caps[[2]]
# Mean unique visitors and mean job views per page type.
# The summary function is passed directly (extra arguments are forwarded to it)
# instead of through the deprecated funs() helper, so no deprecation warning is
# raised.
proj_dat_cln_grp_pg <- proj_dat_cleaned %>%
  group_by(page) %>%
  summarise_at(vars(uv, job_views), mean, na.rm = TRUE) %>%
  as.data.frame()
# Mean unique visitors and mean job views per channel.
proj_dat_cln_grp_chnl <- proj_dat_cleaned %>%
  group_by(channel) %>%
  summarise_at(vars(uv, job_views), mean, na.rm = TRUE) %>%
  as.data.frame()
# Move the grouping key into the row names so the remaining columns are purely
# numeric while each row keeps its page/channel label.
rownames(proj_dat_cln_grp_pg) <- proj_dat_cln_grp_pg$page
rownames(proj_dat_cln_grp_chnl) <- proj_dat_cln_grp_chnl$channel
# Keep the two measures by name rather than by position.
proj_dat_cln_grp_chnl <- proj_dat_cln_grp_chnl[c("uv", "job_views")]
proj_dat_cln_grp_pg <- proj_dat_cln_grp_pg[c("uv", "job_views")]
# Standardize the page-level and channel-level means column by column
# (subtract the column mean, divide by the column standard deviation) so that
# unique visitors and job views are on a comparable scale.
k_clus_proj_dat <- scale(proj_dat_cln_grp_pg, center = TRUE, scale = TRUE)
k_clus_proj_dat_chnl <- scale(proj_dat_cln_grp_chnl, center = TRUE, scale = TRUE)
#distance <- get_dist(k_clus_proj_dat)
#fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
#distance
# Clusters for dataset grouped by page
# k-means is distance based, so it is run on the standardized matrix
# (k_clus_proj_dat) rather than on the raw means. On the raw scale the uv means
# are far larger than the job_views means and would dominate the distances.
set.seed(080620)
# Total within-cluster sum of squares of a k-means solution.
#   k    : number of clusters.
#   data : numeric matrix or data frame to cluster; defaults to the
#          standardized page-level means.
# Returns a single number (the `tot.withinss` component of the kmeans fit).
wss <- function(k, data = k_clus_proj_dat) {
  kmeans(data, centers = k, nstart = 25)$tot.withinss
}
# Compute and plot wss for k = 1 to k = 7 (elbow plot)
k.values <- 1:7
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
k3 <- kmeans(k_clus_proj_dat, centers = 3, nstart = 25)
# The printed cluster centres are in standardized units.
# Console output is not pasted here; re-run the script to regenerate it.
k3
# Cluster means expressed in the original units, for interpretation.
aggregate(proj_dat_cln_grp_pg, by = list(cluster = k3$cluster), FUN = mean)
fviz_cluster(k3, data = k_clus_proj_dat)
# Clusters for dataset grouped by channel
# As for pages, k-means is run on the standardized matrix
# (k_clus_proj_dat_chnl) so that uv and job_views contribute comparably to the
# distances instead of uv dominating on the raw scale.
set.seed(070620)
# Total within-cluster sum of squares of a k-means solution.
#   k    : number of clusters.
#   data : numeric matrix or data frame to cluster; defaults to the
#          standardized channel-level means.
# Returns a single number (the `tot.withinss` component of the kmeans fit).
wss_chnl <- function(k, data = k_clus_proj_dat_chnl) {
  kmeans(data, centers = k, nstart = 25)$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15 (elbow plot)
k.values_chnl <- 1:15
wss_values_chnl <- map_dbl(k.values_chnl, wss_chnl)
plot(k.values_chnl, wss_values_chnl,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
k3_chnl <- kmeans(k_clus_proj_dat_chnl, centers = 3, nstart = 25)
# The printed cluster centres are in standardized units.
# Console output is not pasted here; re-run the script to regenerate it.
k3_chnl
# Cluster means expressed in the original units, for interpretation.
aggregate(proj_dat_cln_grp_chnl, by = list(cluster = k3_chnl$cluster), FUN = mean)
fviz_cluster(k3_chnl, data = k_clus_proj_dat_chnl)
# /********************* End of k-means clustering *************************** */