Capstone Project Initial Analysis

Read

toptrend.df=read.csv(paste("USvideos.csv",sep=""))
attach(toptrend.df)

View(toptrend.df)
str(toptrend.df)

## 'data.frame':    7800 obs. of  16 variables:
##  $ video_id              : Factor w/ 1909 levels "-1yT-K3c6YI",..: 141 121 226 1335 569 767 166 1176 954 1577 ...
##  $ trending_date         : Factor w/ 39 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ title                 : Factor w/ 1927 levels "'I have taken poison' claims war criminal",..: 1809 1677 1342 1231 788 42 1391 78 1615 1882 ...
##  $ channel_title         : Factor w/ 992 levels "12 News","1theK (ì›ë”ì¼€ì´)",..: 143 491 738 339 643 399 748 207 3 956 ...
##  $ category_id           : int  22 24 23 24 24 28 24 28 1 25 ...
##  $ publish_time          : Factor w/ 1875 levels "2008-06-17T00:07:56.000Z",..: 260 229 213 233 211 265 198 216 239 237 ...
##  $ tags                  : Factor w/ 1823 levels "[none]","08282016NtflxUSCAN|Black Mirror|Netflix|Netflix Original Series|San Junipero|Charlie Brooker|Arkangel|USS Calli"| __truncated__,..: 1388 905 1271 1320 1341 708 1430 20 1673 1743 ...
##  $ views                 : int  748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
##  $ likes                 : int  57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
##  $ dislikes              : int  2966 6146 5339 666 1989 511 2445 778 119 1363 ...
##  $ comment_count         : int  15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
##  $ thumbnail_link        : Factor w/ 1909 levels "https://i.ytimg.com/vi/-1yT-K3c6YI/default.jpg",..: 141 121 226 1335 569 767 166 1176 954 1577 ...
##  $ comments_disabled     : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ratings_disabled      : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ video_error_or_removed: Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ description           : Factor w/ 1970 levels "","'A curious cat helps his owner with home improvements.'\\nWe're releasing a NEW BLACK & WHITE episode every wee"| __truncated__,..: 1380 1209 1827 1754 717 1794 432 747 817 499 ...

summary(toptrend.df[,c(2,5,8,9,10,11,13,14,15)])

##   trending_date   category_id        views               likes        
##  17.01.12: 200   Min.   : 1.00   Min.   :      687   Min.   :      0  
##  17.02.12: 200   1st Qu.:17.00   1st Qu.:    84184   1st Qu.:   2018  
##  17.03.12: 200   Median :24.00   Median :   299548   Median :   8901  
##  17.04.12: 200   Mean   :20.06   Mean   :  1322532   Mean   :  48448  
##  17.05.12: 200   3rd Qu.:25.00   3rd Qu.:   951049   3rd Qu.:  28695  
##  17.06.12: 200   Max.   :29.00   Max.   :149376127   Max.   :3093544  
##  (Other) :6600                                                        
##     dislikes         comment_count    comments_disabled ratings_disabled
##  Min.   :      0.0   Min.   :     0   False:7638        False:7762      
##  1st Qu.:     78.0   1st Qu.:   270   True : 162        True :  38      
##  Median :    305.5   Median :  1010                                     
##  Mean   :   3168.3   Mean   :  6114                                     
##  3rd Qu.:   1038.0   3rd Qu.:  3281                                     
##  Max.   :1643059.0   Max.   :827755                                     
##                                                                         
##  video_error_or_removed
##  False:7799            
##  True :   1            
##                        
##                        
##                        
##                        
##

Assigning rank to the top trending videos of a day

toptrend.df$rank=c(1:200)
View(toptrend.df)

library(psych)

## Warning: package 'psych' was built under R version 3.4.3

describe(toptrend.df[,c(8,9,10,11)])

##               vars    n       mean         sd   median   trimmed       mad
## views            1 7800 1322531.64 5583792.65 299548.5 518626.71 387107.60
## likes            2 7800   48447.69  182830.40   8901.0  16775.61  12386.38
## dislikes         3 7800    3168.35   43058.34    305.5    577.79    408.46
## comment_count    4 7800    6113.48   33087.42   1010.0   1834.47   1341.75
##               min       max     range  skew kurtosis       se
## views         687 149376127 149375440 14.13   259.15 63224.00
## likes           0   3093544   3093544 10.05   123.61  2070.15
## dislikes        0   1643059   1643059 32.23  1102.97   487.54
## comment_count   0    827755    827755 16.50   333.03   374.64

Count Number of Tags and store it in a column called numtags

toptrend.df$tags=as.character(toptrend.df$tags)

library(stringr)

## Warning: package 'stringr' was built under R version 3.4.3

toptrend.df$numtags<-str_count(toptrend.df$tags,'\\w+')

View(toptrend.df)

Count number of words in description and store it in a column called numworddes

library(stringr)
toptrend.df$numworddes<-str_count(toptrend.df$description,'\\w+')
View(toptrend.df)

Convert the factor trending_date to date

library(lubridate)

## Warning: package 'lubridate' was built under R version 3.4.3

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

str(toptrend.df$trending_date)

##  Factor w/ 39 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...

toptrend.df$trending_date=as.Date(trending_date, format = "%Y.%d.%m")
View(toptrend.df)
str(toptrend.df$trending_date)

##  Date[1:7800], format: "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" ...

Publish date

library(lubridate)
toptrend.df$publish_time=as.Date(publish_time, format = "%Y-%m-%d")
View(toptrend.df)
str(toptrend.df$publish_time)

##  Date[1:7800], format: "2017-11-13" "2017-11-13" "2017-11-12" "2017-11-13" "2017-11-12" ...

Date Difference

str(toptrend.df$trending_date)

##  Date[1:7800], format: "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" ...

str(toptrend.df$publish_time)

##  Date[1:7800], format: "2017-11-13" "2017-11-13" "2017-11-12" "2017-11-13" "2017-11-12" ...

toptrend.df$datediff=toptrend.df$trending_date-toptrend.df$publish_time+730486
## I had to add this constant because the year of trending_date in the dataset is 0017, which should have been 2017.

toptrend.df$datediff=as.numeric(toptrend.df$datediff,units="days")
str(toptrend.df$datediff)

##  num [1:7800] 2 2 3 2 3 2 3 3 2 2 ...

View(toptrend.df)

Splitting the dataset day-wise

n = 200
nr = nrow(toptrend.df)
day.df=split(toptrend.df, rep(1:ceiling(nr/n), each=n, length.out=nr))

Visualisations

par(mfrow=c(1,2))
boxplot(toptrend.df$datediff,main="Boxplot of Number of days from publish date",ylab="days")
boxplot(toptrend.df$datediff,ylim=c(0,20),ylab="days")

From the second boxplot, we can observe that most of the top trending videos are around 5 days old

par(mfrow=c(1,2))
boxplot(toptrend.df$views,main="Boxplot of Number of Views",ylab="Views")
boxplot(toptrend.df$views,ylim=c(0,1000000),ylab="Views")

From the second boxplot, we can observe that most of the top trending videos are around 5 days old

par(mfrow=c(1,2))
boxplot(toptrend.df$likes,main="Boxplot of Number of likes",ylab="likes")
boxplot(toptrend.df$likes,ylim=c(0,50000),ylab="likes")

From the second boxplot, we can observe that most of the top trending videos have around 10k likes.

par(mfrow=c(1,2))
boxplot(toptrend.df$dislikes,main="Boxplot of Number of dislikes",ylab="dislikes")
boxplot(toptrend.df$dislikes,ylim=c(0,3000),ylab="dislikes")

From the second boxplot, we can observe that most of the top trending videos have around 500 dislikes

par(mfrow=c(1,2))
boxplot(toptrend.df$comment_count,main="Boxplot of Number of Comments",ylab="Comment_count")
boxplot(toptrend.df$comment_count,ylim=c(0,5000),ylab="comment_count")

From the second boxplot, we can observe that most of the top trending videos have around 1000 comments on them.

#par(mfrow=c(2,1))
boxplot(toptrend.df$category_id,main="Boxplot of Category_id",ylab="Category_id")

hist(toptrend.df$category_id,ylim=c(0,3000),breaks=30,main="Histogram to see concentration of all category I'd videos on top trending list",cex=.5)

From the above visualisations, we can observe that categories 21-28 contribute most of the top trending videos.

library(car)

## Warning: package 'car' was built under R version 3.4.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

scatterplot(toptrend.df$rank~toptrend.df$numtags,main="Rank vs Number of Tags",xlab="Number of tags",ylab="Rank")

scatterplot(toptrend.df$rank~toptrend.df$numworddes,main="Rank vs Length of description",xlab="Number of words in description",ylab="Rank")

This plot shows that there is no observable relation between rank and length of description

scatterplot(toptrend.df$rank~toptrend.df$datediff,xlim=c(0,20),main="Rank vs Number of days from publishing",xlab="Days from publishing",ylab="Rank")

The above plot shows there is a strong correlation between rank and days from publishing. The fewer the days, the higher(better) the rank

scatterplot(toptrend.df$rank~toptrend.df$comment_count,main="Rank vs Number of Comments",xlab="Number of comments",ylab="Rank")

scatterplot(toptrend.df$rank~toptrend.df$comment_count,xlim=c(0,50000),main="Rank vs Number of comments",xlab="Number of comments",ylab="Rank")

scatterplot(toptrend.df$rank~toptrend.df$views,main="Rank vs Number of Views",xlab="Number of views",ylab="Rank")

scatterplot(toptrend.df$rank~toptrend.df$views,xlim=c(0,10000000),main="Rank vs Number of views",xlab="Number of views",ylab="Rank")

scatterplot(toptrend.df$rank~toptrend.df$likes,main="Rank vs Number of likes",xlab="Number of likes",ylab="Rank")

scatterplot(toptrend.df$rank~toptrend.df$likes,xlim=c(0,200000),main="Rank vs Number of likes",xlab="Number of likes",ylab="Rank")

As the number of likes increases, the rank gets higher. For videos with a very high number of likes, which surely means that they were published some time ago, the rank gets lower with incresing likes. The reason is surely that the video becomes older and newer videos mostly dominate the top trending ranking system on YouTube.

scatterplot(toptrend.df$rank~toptrend.df$dislikes,main="Rank vs Number of dislikes",xlab="Number of dislikes",ylab="Rank")

scatterplot(toptrend.df$rank~toptrend.df$dislikes,xlim=c(0,50000),main="Rank vs Number of dislikes",xlab="Number of dislikes",ylab="Rank")

Surprisingly, more number of dislikes on a video may lead to higher rank!