toptrend.df=read.csv(paste("USvideos.csv",sep=""))
attach(toptrend.df)
View(toptrend.df)
str(toptrend.df)
## 'data.frame': 7800 obs. of 16 variables:
## $ video_id : Factor w/ 1909 levels "-1yT-K3c6YI",..: 141 121 226 1335 569 767 166 1176 954 1577 ...
## $ trending_date : Factor w/ 39 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ title : Factor w/ 1927 levels "'I have taken poison' claims war criminal",..: 1809 1677 1342 1231 788 42 1391 78 1615 1882 ...
## $ channel_title : Factor w/ 992 levels "12 News","1theK (ì›ë”ì¼€ì´)",..: 143 491 738 339 643 399 748 207 3 956 ...
## $ category_id : int 22 24 23 24 24 28 24 28 1 25 ...
## $ publish_time : Factor w/ 1875 levels "2008-06-17T00:07:56.000Z",..: 260 229 213 233 211 265 198 216 239 237 ...
## $ tags : Factor w/ 1823 levels "[none]","08282016NtflxUSCAN|Black Mirror|Netflix|Netflix Original Series|San Junipero|Charlie Brooker|Arkangel|USS Calli"| __truncated__,..: 1388 905 1271 1320 1341 708 1430 20 1673 1743 ...
## $ views : int 748374 2418783 3191434 343168 2095731 119180 2103417 817732 826059 256426 ...
## $ likes : int 57527 97185 146033 10172 132235 9763 15993 23663 3543 12654 ...
## $ dislikes : int 2966 6146 5339 666 1989 511 2445 778 119 1363 ...
## $ comment_count : int 15954 12703 8181 2146 17518 1434 1970 3432 340 2368 ...
## $ thumbnail_link : Factor w/ 1909 levels "https://i.ytimg.com/vi/-1yT-K3c6YI/default.jpg",..: 141 121 226 1335 569 767 166 1176 954 1577 ...
## $ comments_disabled : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ ratings_disabled : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ video_error_or_removed: Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ description : Factor w/ 1970 levels "","'A curious cat helps his owner with home improvements.'\\nWe're releasing a NEW BLACK & WHITE episode every wee"| __truncated__,..: 1380 1209 1827 1754 717 1794 432 747 817 499 ...
summary(toptrend.df[,c(2,5,8,9,10,11,13,14,15)])
## trending_date category_id views likes
## 17.01.12: 200 Min. : 1.00 Min. : 687 Min. : 0
## 17.02.12: 200 1st Qu.:17.00 1st Qu.: 84184 1st Qu.: 2018
## 17.03.12: 200 Median :24.00 Median : 299548 Median : 8901
## 17.04.12: 200 Mean :20.06 Mean : 1322532 Mean : 48448
## 17.05.12: 200 3rd Qu.:25.00 3rd Qu.: 951049 3rd Qu.: 28695
## 17.06.12: 200 Max. :29.00 Max. :149376127 Max. :3093544
## (Other) :6600
## dislikes comment_count comments_disabled ratings_disabled
## Min. : 0.0 Min. : 0 False:7638 False:7762
## 1st Qu.: 78.0 1st Qu.: 270 True : 162 True : 38
## Median : 305.5 Median : 1010
## Mean : 3168.3 Mean : 6114
## 3rd Qu.: 1038.0 3rd Qu.: 3281
## Max. :1643059.0 Max. :827755
##
## video_error_or_removed
## False:7799
## True : 1
##
##
##
##
##
toptrend.df$rank=c(1:200)
View(toptrend.df)
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
describe(toptrend.df[,c(8,9,10,11)])
## vars n mean sd median trimmed mad
## views 1 7800 1322531.64 5583792.65 299548.5 518626.71 387107.60
## likes 2 7800 48447.69 182830.40 8901.0 16775.61 12386.38
## dislikes 3 7800 3168.35 43058.34 305.5 577.79 408.46
## comment_count 4 7800 6113.48 33087.42 1010.0 1834.47 1341.75
## min max range skew kurtosis se
## views 687 149376127 149375440 14.13 259.15 63224.00
## likes 0 3093544 3093544 10.05 123.61 2070.15
## dislikes 0 1643059 1643059 32.23 1102.97 487.54
## comment_count 0 827755 827755 16.50 333.03 374.64
library(stringr)
toptrend.df$numworddes<-str_count(toptrend.df$description,'\\w+')
View(toptrend.df)
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.3
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
str(toptrend.df$trending_date)
## Factor w/ 39 levels "17.01.12","17.02.12",..: 14 14 14 14 14 14 14 14 14 14 ...
toptrend.df$trending_date=as.Date(trending_date, format = "%Y.%d.%m")
View(toptrend.df)
str(toptrend.df$trending_date)
## Date[1:7800], format: "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" ...
library(lubridate)
toptrend.df$publish_time=as.Date(publish_time, format = "%Y-%m-%d")
View(toptrend.df)
str(toptrend.df$publish_time)
## Date[1:7800], format: "2017-11-13" "2017-11-13" "2017-11-12" "2017-11-13" "2017-11-12" ...
str(toptrend.df$trending_date)
## Date[1:7800], format: "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" "0017-11-14" ...
str(toptrend.df$publish_time)
## Date[1:7800], format: "2017-11-13" "2017-11-13" "2017-11-12" "2017-11-13" "2017-11-12" ...
toptrend.df$datediff=toptrend.df$trending_date-toptrend.df$publish_time+730486
## I had to add this constant because the year of trending_date in the dataset is 0017, which should have been 2017.
toptrend.df$datediff=as.numeric(toptrend.df$datediff,units="days")
str(toptrend.df$datediff)
## num [1:7800] 2 2 3 2 3 2 3 3 2 2 ...
View(toptrend.df)
n = 200
nr = nrow(toptrend.df)
day.df=split(toptrend.df, rep(1:ceiling(nr/n), each=n, length.out=nr))
par(mfrow=c(1,2))
boxplot(toptrend.df$datediff,main="Boxplot of Number of days from publish date",ylab="days")
boxplot(toptrend.df$datediff,ylim=c(0,20),ylab="days")
From the second boxplot, we can observe that most of the top trending videos are around 5 days old
par(mfrow=c(1,2))
boxplot(toptrend.df$views,main="Boxplot of Number of Views",ylab="Views")
boxplot(toptrend.df$views,ylim=c(0,1000000),ylab="Views")
From the second boxplot, we can observe that most of the top trending videos are around 5 days old
par(mfrow=c(1,2))
boxplot(toptrend.df$likes,main="Boxplot of Number of likes",ylab="likes")
boxplot(toptrend.df$likes,ylim=c(0,50000),ylab="likes")
From the second boxplot, we can observe that most of the top trending videos have around 10k likes.
par(mfrow=c(1,2))
boxplot(toptrend.df$dislikes,main="Boxplot of Number of dislikes",ylab="dislikes")
boxplot(toptrend.df$dislikes,ylim=c(0,3000),ylab="dislikes")
From the second boxplot, we can observe that most of the top trending videos have around 500 dislikes
par(mfrow=c(1,2))
boxplot(toptrend.df$comment_count,main="Boxplot of Number of Comments",ylab="Comment_count")
boxplot(toptrend.df$comment_count,ylim=c(0,5000),ylab="comment_count")
From the second boxplot, we can observe that most of the top trending videos have around 1000 comments on them.
#par(mfrow=c(2,1))
boxplot(toptrend.df$category_id,main="Boxplot of Category_id",ylab="Category_id")
hist(toptrend.df$category_id,ylim=c(0,3000),breaks=30,main="Histogram to see concentration of all category I'd videos on top trending list",cex=.5)
From the above visualisations, we can observe that categories 21-28 contribute most of the top trending videos.
library(car)
## Warning: package 'car' was built under R version 3.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(toptrend.df$rank~toptrend.df$numtags,main="Rank vs Number of Tags",xlab="Number of tags",ylab="Rank")
scatterplot(toptrend.df$rank~toptrend.df$numworddes,main="Rank vs Length of description",xlab="Number of words in description",ylab="Rank")
This plot shows that there is no observable relation between rank and length of description
scatterplot(toptrend.df$rank~toptrend.df$datediff,xlim=c(0,20),main="Rank vs Number of days from publishing",xlab="Days from publishing",ylab="Rank")
The above plot shows there is a strong correlation between rank and days from publishing. The fewer the days, the higher(better) the rank
scatterplot(toptrend.df$rank~toptrend.df$comment_count,main="Rank vs Number of Comments",xlab="Number of comments",ylab="Rank")
scatterplot(toptrend.df$rank~toptrend.df$comment_count,xlim=c(0,50000),main="Rank vs Number of comments",xlab="Number of comments",ylab="Rank")
scatterplot(toptrend.df$rank~toptrend.df$views,main="Rank vs Number of Views",xlab="Number of views",ylab="Rank")
scatterplot(toptrend.df$rank~toptrend.df$views,xlim=c(0,10000000),main="Rank vs Number of views",xlab="Number of views",ylab="Rank")
scatterplot(toptrend.df$rank~toptrend.df$likes,main="Rank vs Number of likes",xlab="Number of likes",ylab="Rank")
scatterplot(toptrend.df$rank~toptrend.df$likes,xlim=c(0,200000),main="Rank vs Number of likes",xlab="Number of likes",ylab="Rank")
As the number of likes increases, the rank gets higher. For videos with a very high number of likes, which surely means that they were published some time ago, the rank gets lower with incresing likes. The reason is surely that the video becomes older and newer videos mostly dominate the top trending ranking system on YouTube.
scatterplot(toptrend.df$rank~toptrend.df$dislikes,main="Rank vs Number of dislikes",xlab="Number of dislikes",ylab="Rank")
scatterplot(toptrend.df$rank~toptrend.df$dislikes,xlim=c(0,50000),main="Rank vs Number of dislikes",xlab="Number of dislikes",ylab="Rank")
Surprisingly, more number of dislikes on a video may lead to higher rank!