Reading data & summary

# Load the raw export
data <- read.csv("Project DataSet.csv")
# view(head(data, 50))

# Keep only the rows whose page type is one of the values listed below.
# NOTE(review): "NA" is matched as a literal string here, not as a missing
# value -- confirm that this is what is intended.
data <- data %>%
  filter(
    page %in% c(
      "advice", "profiles", "jobs", "NA", "companies",
      "tags", "coaching", "coaches", "user"
    )
  )
# Drop the 6th and 8th columns by position
data <- data[-c(6, 8)]
# Number of missing values in each remaining column
vapply(data, function(col) sum(is.na(col)), integer(1))
##              date                uv          sessions        page_views 
##                 0                 0                 0                 0 
##              page           channel         job_views  job_apply_clicks 
##                 0                 0                 0                 0 
## job_apply_success 
##                 0
# Per-column summary statistics of the filtered data frame
summary(data)
##       date              uv              sessions       
##  7/31/19:  9517   Min.   :    1.00   Min.   :    1.00  
##  6/4/19 :  9331   1st Qu.:    1.00   1st Qu.:    1.00  
##  7/23/19:  9209   Median :    3.00   Median :    3.00  
##  6/3/19 :  9069   Mean   :   22.73   Mean   :   23.72  
##  7/24/19:  9059   3rd Qu.:    8.00   3rd Qu.:    9.00  
##  7/30/19:  8982   Max.   :25103.00   Max.   :27557.00  
##  (Other):418314                                        
##    page_views               page                    channel      
##  Min.   :     1.00   advice   :403435   SEO             :243420  
##  1st Qu.:     2.00   jobs     : 37439   Direct          :124193  
##  Median :     6.00   profiles : 27221   Referral        : 35868  
##  Mean   :    42.42   companies:  3010   Email           : 17235  
##  3rd Qu.:    17.00   tags     :   840   Linkedin Organic: 14270  
##  Max.   :278491.00   coaching :   715   Brandblock      : 12851  
##                      (Other)  :   821   (Other)         : 25644  
##    job_views       job_apply_clicks   job_apply_success
##  Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 0.0000  
##  Median : 0.0000   Median : 0.00000   Median : 0.0000  
##  Mean   : 0.2693   Mean   : 0.03869   Mean   : 0.0277  
##  3rd Qu.: 0.0000   3rd Qu.: 0.00000   3rd Qu.: 0.0000  
##  Max.   :69.0000   Max.   :25.00000   Max.   :47.0000  
## 
# Show the storage type of every column
str(data)
## 'data.frame':    473481 obs. of  9 variables:
##  $ date             : Factor w/ 61 levels "6/1/19","6/10/19",..: 59 49 48 48 28 9 14 55 10 28 ...
##  $ uv               : int  9733 955 56 381 82 413 747 65 25103 37 ...
##  $ sessions         : int  10799 973 56 402 82 417 760 65 27557 37 ...
##  $ page_views       : int  28660 1017 63 475 94 497 820 102 70215 51 ...
##  $ page             : Factor w/ 225 levels "...","...how-to-get-over-vacation-guilt-and-actually-enjoy-your-...",..: 37 37 37 37 37 37 37 37 37 37 ...
##  $ channel          : Factor w/ 16 levels "Brandblock","Direct",..: 13 13 13 13 13 13 13 13 13 13 ...
##  $ job_views        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ job_apply_clicks : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ job_apply_success: int  0 0 0 0 0 0 0 0 0 0 ...
# Make sure the page column is stored as a factor
data$page <- as.factor(data$page)

# Upper-case the first character of every element of `s`.
#
# `s` may be a character vector or a factor (it is converted with
# as.character()). Returns a character vector of the same length. Missing
# values stay NA instead of being pasted into the literal string "NANA", and
# empty strings are returned unchanged.
capFirst <- function(s) {
  s <- as.character(s)
  capitalised <- paste0(toupper(substring(s, 1, 1)), substring(s, 2))
  # paste0() would otherwise turn NA into text, so restore the missing values
  capitalised[is.na(s)] <- NA_character_
  capitalised
}


# Capitalise the first letter of every page and channel label; both columns
# come back from capFirst() as character vectors.
data[c("page", "channel")] <- lapply(data[c("page", "channel")], capFirst)
# Summary of data
str(data)
## 'data.frame':    473481 obs. of  9 variables:
##  $ date             : Factor w/ 61 levels "6/1/19","6/10/19",..: 59 49 48 48 28 9 14 55 10 28 ...
##  $ uv               : int  9733 955 56 381 82 413 747 65 25103 37 ...
##  $ sessions         : int  10799 973 56 402 82 417 760 65 27557 37 ...
##  $ page_views       : int  28660 1017 63 475 94 497 820 102 70215 51 ...
##  $ page             : chr  "Advice" "Advice" "Advice" "Advice" ...
##  $ channel          : chr  "SEO" "SEO" "SEO" "SEO" ...
##  $ job_views        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ job_apply_clicks : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ job_apply_success: int  0 0 0 0 0 0 0 0 0 0 ...
# Summary statistics after the page/channel labels were capitalised
summary(data)
##       date              uv              sessions       
##  7/31/19:  9517   Min.   :    1.00   Min.   :    1.00  
##  6/4/19 :  9331   1st Qu.:    1.00   1st Qu.:    1.00  
##  7/23/19:  9209   Median :    3.00   Median :    3.00  
##  6/3/19 :  9069   Mean   :   22.73   Mean   :   23.72  
##  7/24/19:  9059   3rd Qu.:    8.00   3rd Qu.:    9.00  
##  7/30/19:  8982   Max.   :25103.00   Max.   :27557.00  
##  (Other):418314                                        
##    page_views            page             channel         
##  Min.   :     1.00   Length:473481      Length:473481     
##  1st Qu.:     2.00   Class :character   Class :character  
##  Median :     6.00   Mode  :character   Mode  :character  
##  Mean   :    42.42                                        
##  3rd Qu.:    17.00                                        
##  Max.   :278491.00                                        
##                                                           
##    job_views       job_apply_clicks   job_apply_success
##  Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 0.0000  
##  Median : 0.0000   Median : 0.00000   Median : 0.0000  
##  Mean   : 0.2693   Mean   : 0.03869   Mean   : 0.0277  
##  3rd Qu.: 0.0000   3rd Qu.: 0.00000   3rd Qu.: 0.0000  
##  Max.   :69.0000   Max.   :25.00000   Max.   :47.0000  
## 
# Compact column-by-column preview of the data frame
glimpse(data)
## Observations: 473,481
## Variables: 9
## $ date              <fct> 7/7/19, 7/26/19, 7/25/19, 7/25/19, 6/7/19, 6/1…
## $ uv                <int> 9733, 955, 56, 381, 82, 413, 747, 65, 25103, 3…
## $ sessions          <int> 10799, 973, 56, 402, 82, 417, 760, 65, 27557, …
## $ page_views        <int> 28660, 1017, 63, 475, 94, 497, 820, 102, 70215…
## $ page              <chr> "Advice", "Advice", "Advice", "Advice", "Advic…
## $ channel           <chr> "SEO", "SEO", "SEO", "SEO", "SEO", "SEO", "SEO…
## $ job_views         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ job_apply_clicks  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ job_apply_success <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Data Cleaning

# Parse the month/day/two-digit-year strings into Date values
data$date <- as.Date(
  as.character(data$date),
  format = "%m/%d/%y"
)
# data$date = as.Date(data$date)
head(data$date)
## [1] "2019-07-07" "2019-07-26" "2019-07-25" "2019-07-25" "2019-06-07"
## [6] "2019-06-17"

The company_name is left as a factor for efficiency. It can be converted into character as needed.
The columns uv, sessions, page_views, job_views, job_apply_clicks and job_apply_success are already stored as integers ("int"), so no numeric conversion is needed. Before moving on, we re-check that none of the columns contains missing values:

# Number of missing values in each column after cleaning
vapply(data, function(col) sum(is.na(col)), integer(1))
##              date                uv          sessions        page_views 
##                 0                 0                 0                 0 
##              page           channel         job_views  job_apply_clicks 
##                 0                 0                 0                 0 
## job_apply_success 
##                 0

# Sessions by Page

options(scipen = 999)  # favour fixed over scientific notation when printing

# Total sessions per page type, drawn as labelled bars from largest to smallest
data %>%
  group_by(page) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(x = reorder(page, -sessions), y = sessions, fill = page)) +
  geom_col(width = 0.3) +
  geom_text(aes(label = format(sessions, big.mark = ","), vjust = 0)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_economist() +
  labs(title = "Sessions by Page", x = "Page", y = "Sessions", fill = "Page") +
  theme_economist() +
  theme(plot.title = element_text(hjust = 0.5))

# Sessions by Channel

options(scipen = 999)  # favour fixed over scientific notation when printing

# Total sessions per channel as horizontal bars (axes are flipped at the end)
data %>%
  group_by(channel) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(x = reorder(channel, sessions), y = sessions, fill = channel)) +
  geom_col(width = 0.3) +
  geom_text(aes(label = format(sessions, big.mark = ","), hjust = 0.35)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(
    title = "Sessions by Channel",
    x = "Channel",
    y = "Sessions",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

# Sessions by day per channel

options(scipen = 999)  # favour fixed over scientific notation when printing

head(data)
# Daily session totals per channel: one free-scaled facet per channel, each
# with a linear trend line.
data %>%
  group_by(date, channel) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(x = date, y = sessions, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # channel is mapped to colour (not fill), so the legend title goes on colour
  labs(
    title = "Sessions by Day",
    x = "Date",
    y = "Sessions",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

# Sessions by Day: daily session totals with a linear trend line
data %>%
  group_by(date) %>%
  summarise(sessions = sum(sessions)) %>%
  ggplot(aes(x = as.Date(date), y = sessions)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(title = "Sessions by Day", x = "Date", y = "Sessions") +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

# UVs by Channel

options(scipen = 999)  # favour fixed over scientific notation when printing

# Total unique visitors per channel as horizontal bars; the legend is hidden
# because the channel names already label the bars.
data %>%
  group_by(channel) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(x = reorder(channel, UV), y = UV, fill = channel)) +
  geom_col(width = 0.3) +
  geom_text(aes(label = format(UV, big.mark = ","), vjust = 0)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(
    title = "Unique Visitors by Channel",
    x = "Channel",
    y = "Unique Visitors",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(
    plot.title = element_text(hjust = 0.5),
    # the documented value is lower-case "none"
    legend.position = "none"
  ) +
  coord_flip()

# UVs by Page

options(scipen = 999)  # favour fixed over scientific notation when printing

# Total unique visitors per page type, bars ordered from largest to smallest
data %>%
  group_by(page) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(x = reorder(page, -UV), y = UV, fill = page)) +
  geom_col(width = 0.3) +
  geom_text(
    aes(label = format(UV, big.mark = ","), vjust = 0, hjust = -0.00001)
  ) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_economist() +
  labs(
    title = "Unique Visitors by Page",
    x = "Page",
    y = "Unique Visitors",
    fill = "Page"
  ) +
  theme_economist() +
  theme(plot.title = element_text(hjust = 0.5))

# UVs by day per channel

options(scipen = 999)  # favour fixed over scientific notation when printing

# Daily unique-visitor totals per channel: one free-scaled facet per channel,
# each with a linear trend line.
data %>%
  group_by(date, channel) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(x = date, y = UV, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # channel is mapped to colour (not fill), so the legend title goes on colour
  labs(
    title = "Unique Visitors by Day",
    x = "Date",
    y = "Unique Visitors",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

# UVs by Day: daily unique-visitor totals with a linear trend line
data %>%
  group_by(date) %>%
  summarise(UV = sum(uv)) %>%
  ggplot(aes(x = as.Date(date), y = UV)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(title = "Unique Visitors by Day", x = "Date", y = "Unique Visitors") +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

# Page Views by Channel

options(scipen = 999)  # favour fixed over scientific notation when printing

# Total page views per channel as horizontal bars (axes are flipped at the end)
data %>%
  group_by(channel) %>%
  summarise(page_views = sum(page_views)) %>%
  ggplot(
    aes(x = reorder(channel, page_views), y = page_views, fill = channel)
  ) +
  geom_col(width = 0.3) +
  geom_text(aes(label = format(page_views, big.mark = ","), hjust = 0.43)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(
    title = "Page Views by Channel",
    x = "Channel",
    y = "Page Views",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

# Page Views by day (overall, then per channel)

options(scipen = 999)  # favour fixed over scientific notation when printing


# Page Views by Day: daily page-view totals with a linear trend line
data %>%
  group_by(date) %>%
  summarise(page_views = sum(page_views)) %>%
  ggplot(aes(x = as.Date(date), y = page_views)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(title = "Page Views by Day", x = "Date", y = "Page Views") +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

# Page Views by day per channel: one free-scaled facet per channel, each with
# a linear trend line.
data %>%
  group_by(date, channel) %>%
  summarise(page_views = sum(page_views)) %>%
  ggplot(aes(x = date, y = page_views, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # channel is mapped to colour (not fill), so the legend title goes on colour
  labs(
    title = "Page Views by Day",
    x = "Date",
    y = "Page Views",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

# Job Views by Channel

options(scipen = 999)  # favour fixed over scientific notation when printing
# Total job views per channel as horizontal bars (axes are flipped at the end)
data %>%
  group_by(channel) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(
    aes(x = reorder(channel, job_views), y = job_views, fill = channel)
  ) +
  geom_col(width = 0.3) +
  geom_text(aes(label = format(job_views, big.mark = ","), hjust = 0.43)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(
    title = "Job Views by Channel",
    x = "Channel",
    y = "Job Views",
    fill = "Channel"
  ) +
  theme_gdocs() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

# Job Views by Page: totals per page type, bars ordered largest to smallest
data %>%
  group_by(page) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(aes(x = reorder(page, -job_views), y = job_views, fill = page)) +
  geom_col(width = 0.3) +
  geom_text(aes(label = format(job_views, big.mark = ","), vjust = 0)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_economist() +
  labs(
    title = "Job Views by Page",
    x = "Page",
    y = "Job Views",
    fill = "Page"
  ) +
  theme_economist() +
  theme(plot.title = element_text(hjust = 0.5))

# Job Views by day (overall, then per channel)

options(scipen = 999)  # favour fixed over scientific notation when printing
# Job Views by Day: daily job-view totals with a linear trend line
data %>%
  group_by(date) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(aes(x = as.Date(date), y = job_views)) +
  geom_line(color = "steel blue") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_discrete() +
  labs(title = "Job Views by Day", x = "Date", y = "Job Views") +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

# Job Views by day per channel: one free-scaled facet per channel, each with
# a linear trend line.
data %>%
  group_by(date, channel) %>%
  summarise(job_views = sum(job_views)) %>%
  ggplot(aes(x = date, y = job_views, colour = channel)) +
  geom_line() +
  facet_wrap(~channel, scales = "free") +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::comma) +
  # channel is mapped to colour (not fill), so the legend title goes on colour
  labs(
    title = "Job Views by Day",
    x = "Date",
    y = "Job Views",
    colour = "Channel"
  ) +
  theme_solarized() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(
      face = "bold", color = "#993333", size = 8, angle = 30
    )
  )
## `geom_smooth()` using formula 'y ~ x'

SCATTERPLOTS

# proj_data <- read_csv("Project DataSet.csv")
# #View(proj_data)

# Work on a copy of the cleaned data for the scatterplots
proj_dat_cleaned <- data

# Shared theme adjustments for the scatterplots: no grid lines or panel
# background, black axis lines, white legend keys and size-15 text.
cleanup <- theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line.x = element_line(color = "black"),
  axis.line.y = element_line(color = "black"),
  legend.key = element_rect(fill = "white"),
  text = element_text(size = 15)
)

options(scipen = 999)  # favour fixed over scientific notation when printing

# The former par(mfrow = c(2, 2)) call was removed: ggplot2 draws with grid
# graphics and ignores it, while base-graphics plots drawn later would have
# inherited the 2 x 2 layout.


# Sessions and unique visitors
# alpha and size are fixed values, so they are set outside aes(); inside aes()
# they would be treated as data and rescaled instead of applied literally.
ggplot(proj_dat_cleaned, aes(uv, sessions)) +
  geom_point(alpha = 0.2, size = 2) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Scatterplot of Unique visitors and Session",
    x = "Unique Visitors",
    y = "Sessions"
  ) +
  theme_economist() +
  scale_colour_economist() +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Unique visitors vs page views, coloured by page type
# alpha and size are fixed values, so they are set outside aes(); only the
# page type is mapped. The smoother is fitted per page (fill = page).
ggplot(proj_dat_cleaned, aes(uv, page_views)) +
  geom_point(aes(colour = page), alpha = 0.2, size = 2) +
  stat_smooth(
    aes(fill = page),
    method = "lm", level = 0.95, colour = "red"
  ) +
  labs(
    title = "Scatterplot of Unique Visitor and page views by Page ",
    x = "Unique Visitors",
    y = "Page Views"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Unique visitors vs page views, coloured by channel
# alpha and size are fixed values, so they are set outside aes(); only the
# channel is mapped.
ggplot(proj_dat_cleaned, aes(uv, page_views)) +
  geom_point(aes(colour = channel), alpha = 0.2, size = 2) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Unique Visitors & page views by channel  ",
    x = "Unique Visitors",
    y = "Page Views"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Unique visitors vs job views
# Adding channel/page is not useful
# alpha and size are fixed values, so they are set outside aes().
ggplot(proj_dat_cleaned, aes(uv, job_views)) +
  geom_point(alpha = 0.2, size = 2) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Scatterplot of Unique visitors and Job views",
    x = "Unique Visitors",
    y = "Job Views"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Job views vs job-apply clicks, coloured by channel
# Columns are given by bare name inside aes(); writing `df$col` there makes
# ggplot2 emit "Use of `df$col` is discouraged" warnings.
ggplot(proj_dat_cleaned, aes(job_views, job_apply_clicks)) +
  geom_point(aes(colour = channel)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job views and Job apply click by Channel",
    x = "Job Views",
    y = "Job apply Click"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Job views vs job-apply clicks, coloured by page type
# Columns are given by bare name inside aes(); writing `df$col` there makes
# ggplot2 emit "Use of `df$col` is discouraged" warnings.
ggplot(proj_dat_cleaned, aes(job_views, job_apply_clicks)) +
  geom_point(aes(colour = page)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job views and Job apply click by Page",
    x = "Job Views",
    y = "Job apply Click"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Job-apply clicks vs successful applications, coloured by channel
# Columns are given by bare name inside aes(); writing `df$col` there makes
# ggplot2 emit "Use of `df$col` is discouraged" warnings.
ggplot(proj_dat_cleaned, aes(job_apply_clicks, job_apply_success)) +
  geom_point(aes(colour = channel)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job apply click & Job Apply success ",
    x = "Job Apply Click",
    y = "Job Apply Success"
  ) +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Job-apply clicks vs successful applications, coloured by page type
# Columns are given by bare name inside aes(); writing `df$col` there makes
# ggplot2 emit "Use of `df$col` is discouraged" warnings.
ggplot(proj_dat_cleaned, aes(job_apply_clicks, job_apply_success)) +
  geom_point(aes(colour = page)) +
  stat_smooth(method = "lm", level = 0.95, colour = "red") +
  labs(
    title = "Job Apply clicks & Job apply Success by Page",
    x = "Job Apply click",
    y = "Job Apply Success"
  ) +
  scale_fill_discrete() +
  cleanup
## `geom_smooth()` using formula 'y ~ x'

# Box plots of job-apply clicks vs successful applications, filled by page
# type. Bare column names replace `df$col` inside aes(), which also gives the
# axes and legend the plain column names as default titles.
ggplot(
  proj_dat_cleaned,
  aes(job_apply_clicks, job_apply_success, fill = page)
) +
  geom_boxplot()

#boxplot(proj_dat_cleaned$job_apply_clicks ~ proj_dat_cleaned$company_name, proj_dat_cleaned, xlab = "Company", ylab = "Job Apply Click")

# Distribution of job views for each page type; outliers are drawn as large
# blue points. Bare column names replace `df$col` inside aes().
ggplot(proj_dat_cleaned, aes(x = page, y = job_views)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 4)

CORRELATION

# Restrict the correlation analysis to the six numeric traffic/job metrics.
variables <- c(
  "uv", "sessions", "page_views",
  "job_views", "job_apply_clicks", "job_apply_success"
)

# Subset holding only those columns
data2 <- data[, variables]
summary(data2)
##        uv              sessions          page_views       
##  Min.   :    1.00   Min.   :    1.00   Min.   :     1.00  
##  1st Qu.:    1.00   1st Qu.:    1.00   1st Qu.:     2.00  
##  Median :    3.00   Median :    3.00   Median :     6.00  
##  Mean   :   22.73   Mean   :   23.72   Mean   :    42.42  
##  3rd Qu.:    8.00   3rd Qu.:    9.00   3rd Qu.:    17.00  
##  Max.   :25103.00   Max.   :27557.00   Max.   :278491.00  
##    job_views       job_apply_clicks   job_apply_success
##  Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 0.0000  
##  Median : 0.0000   Median : 0.00000   Median : 0.0000  
##  Mean   : 0.2693   Mean   : 0.03869   Mean   : 0.0277  
##  3rd Qu.: 0.0000   3rd Qu.: 0.00000   3rd Qu.: 0.0000  
##  Max.   :69.0000   Max.   :25.00000   Max.   :47.0000
# Summary of the full cleaned data frame, for comparison with the subset
summary(data)
##       date                  uv              sessions       
##  Min.   :2019-06-01   Min.   :    1.00   Min.   :    1.00  
##  1st Qu.:2019-06-16   1st Qu.:    1.00   1st Qu.:    1.00  
##  Median :2019-07-01   Median :    3.00   Median :    3.00  
##  Mean   :2019-07-01   Mean   :   22.73   Mean   :   23.72  
##  3rd Qu.:2019-07-17   3rd Qu.:    8.00   3rd Qu.:    9.00  
##  Max.   :2019-07-31   Max.   :25103.00   Max.   :27557.00  
##    page_views            page             channel         
##  Min.   :     1.00   Length:473481      Length:473481     
##  1st Qu.:     2.00   Class :character   Class :character  
##  Median :     6.00   Mode  :character   Mode  :character  
##  Mean   :    42.42                                        
##  3rd Qu.:    17.00                                        
##  Max.   :278491.00                                        
##    job_views       job_apply_clicks   job_apply_success
##  Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 0.0000  
##  Median : 0.0000   Median : 0.00000   Median : 0.0000  
##  Mean   : 0.2693   Mean   : 0.03869   Mean   : 0.0277  
##  3rd Qu.: 0.0000   3rd Qu.: 0.00000   3rd Qu.: 0.0000  
##  Max.   :69.0000   Max.   :25.00000   Max.   :47.0000
# Pearson correlation matrix of the selected metrics, rounded to two decimals
round(cor(data2, use="pairwise.complete.obs", method = "pearson"), 2)
##                      uv sessions page_views job_views job_apply_clicks
## uv                 1.00     1.00       0.70     -0.02            -0.01
## sessions           1.00     1.00       0.70     -0.02            -0.01
## page_views         0.70     0.70       1.00     -0.01             0.00
## job_views         -0.02    -0.02      -0.01      1.00             0.30
## job_apply_clicks  -0.01    -0.01       0.00      0.30             1.00
## job_apply_success -0.01    -0.01       0.00      0.04             0.11
##                   job_apply_success
## uv                            -0.01
## sessions                      -0.01
## page_views                     0.00
## job_views                      0.04
## job_apply_clicks               0.11
## job_apply_success              1.00
# Test whether the Pearson correlation of job views with unique visitors is non-zero
cor.test(data2$job_views,data2$uv, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data2$job_views and data2$uv
## t = -16.16, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02632471 -0.02063110
## sample estimates:
##         cor 
## -0.02347809
# Test whether the Pearson correlation of job views with sessions is non-zero
cor.test(data2$job_views,data2$sessions, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data2$job_views and data2$sessions
## t = -15.831, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02584677 -0.02015304
## sample estimates:
##         cor 
## -0.02300009
# Test whether the Pearson correlation of job views with page views is non-zero
cor.test(data2$job_views,data2$page_views, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data2$job_views and data2$page_views
## t = -5.0372, df = 473479, p-value = 0.0000004726
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.010168415 -0.004471972
## sample estimates:
##          cor 
## -0.007320253
# Test whether the Pearson correlation of job views with job-apply clicks is non-zero
cor.test(data2$job_views,data2$job_apply_clicks, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data2$job_views and data2$job_apply_clicks
## t = 216.71, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2978043 0.3029870
## sample estimates:
##       cor 
## 0.3003979
# Test whether the Pearson correlation of job views with successful applications is non-zero
cor.test(data2$job_views,data2$job_apply_success, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data2$job_views and data2$job_apply_success
## t = 29.396, df = 473479, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03983768 0.04552405
## sample estimates:
##        cor 
## 0.04268121
#Create correlation plots for the dataset
library(corrplot)
## corrplot 0.84 loaded
# cordata is a copy of the numeric subset, used only for the correlation plot
cordata <- data2
# colnames(cordata) = c("job_views", "uv", "sessions", "page_views", "job_apply_clicks", "job_apply_success")
# Draw the correlation matrix with one circle per pair of variables
corrplot(
  cor(cordata),
  method = "circle"
)

LINEAR MODEL

# Baseline: simple linear regression of job views on unique visitors
Model1 <- lm(job_views ~ uv, data = data)
summary(Model1)
## 
## Call:
## lm(formula = job_views ~ uv, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.271 -0.271 -0.271 -0.268 68.729 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)  0.270856335  0.001219002  222.19 <0.0000000000000002 ***
## uv          -0.000069067  0.000004274  -16.16 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8361 on 473479 degrees of freedom
## Multiple R-squared:  0.0005512,  Adjusted R-squared:  0.0005491 
## F-statistic: 261.1 on 1 and 473479 DF,  p-value: < 0.00000000000000022
# Adjusted R-squared of the single-predictor model
summary(Model1)[["adj.r.squared"]]
## [1] 0.00054911
# Add page views as a second predictor
Model3 <- lm(job_views ~ uv + page_views, data = data)
summary(Model3)
## 
## Call:
## lm(formula = job_views ~ uv + page_views, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.569 -0.271 -0.271 -0.268 68.728 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)  0.271065513  0.001219136  222.34 <0.0000000000000002 ***
## uv          -0.000105903  0.000005986  -17.69 <0.0000000000000002 ***
## page_views   0.000014806  0.000001684    8.79 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8361 on 473478 degrees of freedom
## Multiple R-squared:  0.0007143,  Adjusted R-squared:  0.0007101 
## F-statistic: 169.2 on 2 and 473478 DF,  p-value: < 0.00000000000000022
# Adjusted R-squared of the two-predictor model
summary(Model3)[["adj.r.squared"]]
## [1] 0.0007100619
# Add job-apply clicks as a third predictor
Model4 <- lm(job_views ~ uv + page_views + job_apply_clicks, data = data)
summary(Model4)
## 
## Call:
## lm(formula = job_views ~ uv + page_views + job_apply_clicks, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.607  -0.235  -0.234  -0.232  68.764 
## 
## Coefficients:
##                      Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)       0.234659330  0.001174985 199.713 <0.0000000000000002 ***
## uv               -0.000092970  0.000005710 -16.283 <0.0000000000000002 ***
## page_views        0.000013418  0.000001607   8.351 <0.0000000000000002 ***
## job_apply_clicks  0.934894531  0.004316780 216.572 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7975 on 473477 degrees of freedom
## Multiple R-squared:  0.09078,    Adjusted R-squared:  0.09078 
## F-statistic: 1.576e+04 on 3 and 473477 DF,  p-value: < 0.00000000000000022
# Adjusted R-squared of the three-predictor model
summary(Model4)[["adj.r.squared"]]
## [1] 0.09077727
# Add successful applications as a fourth predictor
Model5 <- lm(
  job_views ~ uv + page_views + job_apply_clicks + job_apply_success,
  data = data
)
summary(Model5)
## 
## Call:
## lm(formula = job_views ~ uv + page_views + job_apply_clicks + 
##     job_apply_success, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.528  -0.234  -0.234  -0.231  68.765 
## 
## Coefficients:
##                       Estimate   Std. Error t value             Pr(>|t|)
## (Intercept)        0.233959164  0.001179894 198.288 < 0.0000000000000002
## uv                -0.000092751  0.000005710 -16.245 < 0.0000000000000002
## page_views         0.000013398  0.000001607   8.339 < 0.0000000000000002
## job_apply_clicks   0.931748250  0.004343826 214.499 < 0.0000000000000002
## job_apply_success  0.029520116  0.004556603   6.479      0.0000000000927
##                      
## (Intercept)       ***
## uv                ***
## page_views        ***
## job_apply_clicks  ***
## job_apply_success ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7975 on 473476 degrees of freedom
## Multiple R-squared:  0.09086,    Adjusted R-squared:  0.09086 
## F-statistic: 1.183e+04 on 4 and 473476 DF,  p-value: < 0.00000000000000022
# Adjusted R-squared of the four-predictor model
summary(Model5)$adj.r.squared
## [1] 0.09085594
# To test for multicollinearity: correlation = TRUE additionally prints the
# "Correlation of Coefficients" table for the fitted model. TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
summary(Model5, correlation = TRUE)
## 
## Call:
## lm(formula = job_views ~ uv + page_views + job_apply_clicks + 
##     job_apply_success, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.528  -0.234  -0.234  -0.231  68.765 
## 
## Coefficients:
##                       Estimate   Std. Error t value             Pr(>|t|)
## (Intercept)        0.233959164  0.001179894 198.288 < 0.0000000000000002
## uv                -0.000092751  0.000005710 -16.245 < 0.0000000000000002
## page_views         0.000013398  0.000001607   8.339 < 0.0000000000000002
## job_apply_clicks   0.931748250  0.004343826 214.499 < 0.0000000000000002
## job_apply_success  0.029520116  0.004556603   6.479      0.0000000000927
##                      
## (Intercept)       ***
## uv                ***
## page_views        ***
## job_apply_clicks  ***
## job_apply_success ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7975 on 473476 degrees of freedom
## Multiple R-squared:  0.09086,    Adjusted R-squared:  0.09086 
## F-statistic: 1.183e+04 on 4 and 473476 DF,  p-value: < 0.00000000000000022
## 
## Correlation of Coefficients:
##                   (Intercept) uv    page_views job_apply_clicks
## uv                -0.07                                        
## page_views         0.02       -0.70                            
## job_apply_clicks  -0.13        0.01  0.00                      
## job_apply_success -0.09        0.01  0.00      -0.11
# Pairwise correlation is one indicator of multicollinearity. The correlation-of-coefficients matrix above shows that no pair of variables is highly correlated, as no value is close to 1 in absolute terms (the largest is -0.70, between uv and page_views).
# Multicollinearity is also suspected when R-squared is very high (i.e. more than 0.90) while the coefficients are not significant according to their p-values. In our model R-squared is 0.09, the F-statistic (1.183e+04) is significant with a p-value below 0.05, and the individual p-value of each variable is significant. Hence there is no evidence of multicollinearity.

# Comparing the models (anova() tests each model against the previous one in
# the order listed)
anova(Model1, Model3, Model4, Model5) 
# Looking at the adjusted R-squared, Model4 provides an increase in the adjusted R-squared value and is significant. Model4 is the best model.
# NOTE(review): Model5's adjusted R-squared (0.09085594) is marginally higher than Model4's (0.09077727) — confirm that preferring Model4 is intentional.

CLUSTERING

library(readr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ tibble  2.1.3     ✓ purrr   0.3.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ tibble  2.1.3     ✓ forcats 0.4.0
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x Hmisc::src()       masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Read the raw project file with readr for the clustering section; the column
# specification printed below shows `date` being parsed as character.
proj_data <- read_csv("Project DataSet.csv")
## Parsed with column specification:
## cols(
##   date = col_character(),
##   uv = col_double(),
##   sessions = col_double(),
##   page_views = col_double(),
##   page = col_character(),
##   article_name = col_character(),
##   channel = col_character(),
##   company_name = col_character(),
##   job_views = col_double(),
##   job_apply_clicks = col_double(),
##   job_apply_success = col_double()
## )
#View(proj_data)

### /* ********************** Start of pre-analysis ******************************* */

# Frequency tables, most frequent value first.
# Page values with every '%' character and digit stripped out
count(proj_data, gsub("[%]*[0-9]*", "", page), sort = TRUE)
count(proj_data, channel, sort = TRUE)
count(proj_data, company_name, sort = TRUE)
count(proj_data, job_views, sort = TRUE)
count(proj_data, job_apply_clicks, sort = TRUE)
count(proj_data, job_apply_success, sort = TRUE)

# Joint frequency of channel and job_apply_success
channl_job_suc <- count(proj_data, channel, job_apply_success, sort = TRUE)
# SEO job_apply_success
# Combinations that occur fewer than 12 times
channl_job_suc[channl_job_suc$n < 12, ]

# Pearson correlations (and their p-values) across the numeric columns
rcorr(
  as.matrix(proj_data[vapply(proj_data, is.numeric, logical(1))]),
  type = "pearson"
)
##                      uv sessions page_views job_views job_apply_clicks
## uv                 1.00     1.00       0.67     -0.01            -0.01
## sessions           1.00     1.00       0.67     -0.01            -0.01
## page_views         0.67     0.67       1.00      0.00             0.00
## job_views         -0.01    -0.01       0.00      1.00             0.10
## job_apply_clicks  -0.01    -0.01       0.00      0.10             1.00
## job_apply_success -0.01    -0.01       0.00      0.01             0.12
##                   job_apply_success
## uv                            -0.01
## sessions                      -0.01
## page_views                     0.00
## job_views                      0.01
## job_apply_clicks               0.12
## job_apply_success              1.00
## 
## n= 500000 
## 
## 
## P
##                   uv     sessions page_views job_views job_apply_clicks
## uv                       0.0000   0.0000     0.0000    0.0000          
## sessions          0.0000          0.0000     0.0000    0.0000          
## page_views        0.0000 0.0000              0.6467    0.0006          
## job_views         0.0000 0.0000   0.6467               0.0000          
## job_apply_clicks  0.0000 0.0000   0.0006     0.0000                    
## job_apply_success 0.0000 0.0000   0.0055     0.0000    0.0000          
##                   job_apply_success
## uv                0.0000           
## sessions          0.0000           
## page_views        0.0055           
## job_views         0.0000           
## job_apply_clicks  0.0000           
## job_apply_success
#boxplot(proj_data)
#cor(proj_data[sapply(proj_data,is.numeric)])
#is.numeric(proj_data[])

# Number of missing values in each column. vapply() guarantees exactly one
# integer per column, whereas sapply() chooses its return type from the input.
vapply(proj_data, function(column) sum(is.na(column)), integer(1))
##              date                uv          sessions        page_views 
##                 0                 0                 0                 0 
##              page      article_name           channel      company_name 
##             17810             98111                 0            387475 
##         job_views  job_apply_clicks job_apply_success 
##                 0                 0                 0
#head(proj_data[is.na(proj_data$page),])
#proj_data$page[is.na(proj_data$page),]

#######################  Cleaned dataset    ####################

# Drop columns 6 and 8 (article_name and company_name in the parsed column
# specification) and keep the remaining nine columns in their original order.
proj_dat_cleaned <- proj_data[-c(6, 8)]
# Page types retained for the analysis.
# NOTE(review): read_csv() reports 17810 missing `page` values; a real NA does
# not match the literal "NA" below, so those rows are dropped — confirm that
# this is intended.
page_type <- c(
  "advice", "profiles", "jobs", "NA", "companies",
  "tags", "coaching", "coaches", "user"
)
proj_dat_cleaned <- proj_dat_cleaned[is.element(proj_dat_cleaned$page, page_type), ]
#proj_dat_cleaned$weekday_page <- paste(weekdays(proj_dat_cleaned$date, abbreviate = TRUE),"-",proj_dat_cleaned$page)
#proj_dat_cleaned$weekday_channel <- paste(weekdays(proj_dat_cleaned$date, abbreviate = TRUE),"-",proj_dat_cleaned$channel)
#proj_dat_cln_grp -> proj_dat_cleaned %>% group_by(weekday_page)
#boxplot(proj_dat_cleaned[2:4])

###     /* ********************** End of pre-analysis ******************************* */


# /* *********************** Start of k-means clustering (By Page and Channel on unique visitors and job views) ******************* */

# Treat outliers in unique visitors (uv) before scaling.
# Values below Q1 - 1.5 * IQR are replaced by the 5th percentile and values
# above Q3 + 1.5 * IQR are replaced by the 95th percentile.
qnt <- quantile(proj_dat_cleaned$uv, probs = c(0.25, 0.75), na.rm = TRUE)
caps <- quantile(proj_dat_cleaned$uv, probs = c(0.05, 0.95), na.rm = TRUE)
H <- 1.5 * IQR(proj_dat_cleaned$uv, na.rm = TRUE)
# [[ ]] drops the quantile names ("25%", "5%", ...) so that plain numbers are
# compared and assigned.
proj_dat_cleaned$uv[proj_dat_cleaned$uv < (qnt[[1]] - H)] <- caps[[1]]
proj_dat_cleaned$uv[proj_dat_cleaned$uv > (qnt[[2]] + H)] <- caps[[2]]

# Group the data set by page and take the mean of unique visitors and job views.
# A lambda is passed instead of funs(), which dplyr soft-deprecated in 0.8.0
# (the warning recorded below was produced by the earlier funs() call).
proj_dat_cln_grp_pg <- proj_dat_cleaned %>%
  group_by(page) %>%
  summarise_at(vars(uv, job_views), ~ mean(., na.rm = TRUE)) %>%
  as.data.frame()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
# Group the data set by channel and take the mean of unique visitors and job
# views. A lambda is passed instead of funs(), which dplyr soft-deprecated in
# 0.8.0.
proj_dat_cln_grp_chnl <- proj_dat_cleaned %>%
  group_by(channel) %>%
  summarise_at(vars(uv, job_views), ~ mean(., na.rm = TRUE)) %>%
  as.data.frame()

# Move the grouping label into the row names, then keep only the two measures
row.names(proj_dat_cln_grp_pg) <- proj_dat_cln_grp_pg[["page"]]
proj_dat_cln_grp_pg <- proj_dat_cln_grp_pg[c("uv", "job_views")]
row.names(proj_dat_cln_grp_chnl) <- proj_dat_cln_grp_chnl[["channel"]]
proj_dat_cln_grp_chnl <- proj_dat_cln_grp_chnl[c("uv", "job_views")]

# Standardise unique visitors and job views (scale() centres and scales each
# column).
# NOTE(review): the kmeans() calls further down are given the unscaled data
# frames rather than these scaled matrices — confirm which input is intended.
k_clus_proj_dat <- scale(proj_dat_cln_grp_pg)
k_clus_proj_dat_chnl <- scale(proj_dat_cln_grp_chnl)

#distance <- get_dist(k_clus_proj_dat)
#fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
#distance


#   Clusters for dataset grouped by page

set.seed(080620)

# Total within-cluster sum of squares of a k-means fit.
#   k      number of clusters
#   data   numeric data to cluster; defaults to the page-level summary table,
#          so wss(k) behaves exactly as before
#   nstart number of random starts handed to kmeans()
# Returns the `tot.withinss` component of the fit.
wss <- function(k, data = proj_dat_cln_grp_pg, nstart = 25) {
  kmeans(data, centers = k, nstart = nstart)$tot.withinss
}
# Elbow curve: total within-cluster sum of squares for k = 1, ..., 7
k.values <- seq_len(7)
wss_values <- vapply(k.values, wss, numeric(1))
plot(
  x = k.values, y = wss_values,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)

# Page-level fit with three clusters; printing it lists sizes, cluster means
# and the cluster assigned to each page
k3 <- kmeans(x = proj_dat_cln_grp_pg, centers = 3, nstart = 25)
print(k3)
## K-means clustering with 3 clusters of sizes 3, 3, 2
## 
## Cluster means:
##          uv job_views
## 1 11.791744 0.4967346
## 2  4.365135 1.3937331
## 3 15.605682 0.5852273
## 
## Clustering vector:
##    advice   coaches  coaching companies      jobs  profiles      tags 
##         1         3         3         2         2         2         1 
##      user 
##         1 
## 
## Within cluster sum of squares by cluster:
## [1] 2.227108 1.918923 3.617195
##  (between_SS / total_SS =  95.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Plot the page clusters with factoextra, passing the same data frame that was
# given to kmeans() for k3
fviz_cluster(k3, data = proj_dat_cln_grp_pg)

# Clusters for dataset grouped by channel 

set.seed(070620)

# Total within-cluster sum of squares of a k-means fit.
#   k      number of clusters
#   data   numeric data to cluster; defaults to the channel-level summary
#          table, so wss_chnl(k) behaves exactly as before
#   nstart number of random starts handed to kmeans()
# Returns the `tot.withinss` component of the fit.
wss_chnl <- function(k, data = proj_dat_cln_grp_chnl, nstart = 25) {
  kmeans(data, centers = k, nstart = nstart)$tot.withinss
}

# Elbow curve: total within-cluster sum of squares for k = 1, ..., 15
k.values_chnl <- seq_len(15)

wss_values_chnl <- vapply(k.values_chnl, wss_chnl, numeric(1))

plot(
  x = k.values_chnl, y = wss_values_chnl,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)

# Channel-level fit with three clusters; printing it lists sizes, cluster means
# and the cluster assigned to each channel
k3_chnl <- kmeans(x = proj_dat_cln_grp_chnl, centers = 3, nstart = 25)
print(k3_chnl)
## K-means clustering with 3 clusters of sizes 2, 6, 8
## 
## Cluster means:
##          uv job_views
## 1 17.886018 0.5051136
## 2  6.987717 0.5033511
## 3  3.482218 0.2799655
## 
## Clustering vector:
##        Brandblock            Direct             Email  Facebook Organic 
##                 2                 2                 2                 2 
##     Facebook Paid       Google Jobs Instagram Organic  Linkedin Elevate 
##                 1                 3                 3                 3 
##  Linkedin Organic     Outbrain Paid Pinterest Organic          Referral 
##                 2                 3                 2                 3 
##               SEO     Social Others   Twitter Organic   Youtube Organic 
##                 1                 3                 3                 3 
## 
## Within cluster sum of squares by cluster:
## [1]  5.165711 10.114131 11.985114
##  (between_SS / total_SS =  92.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Plot the channel clusters with factoextra, passing the same data frame that
# was given to kmeans() for k3_chnl
fviz_cluster(k3_chnl, data = proj_dat_cln_grp_chnl)

# /********************* End of k-means clustering  *************************** */