This dataset is a collection of SPAM and good ("ham") comments from 5 YouTube videos; the data was collected between 2013 and 2015.
We start by loading the packages we need:
suppressWarnings(library(tidyverse))
suppressWarnings(library(stringr))
suppressWarnings(library(data.table))
library(plyr)
library(XML)
library(crayon)
library(lubridate)
library(rjson)
library(readxl)
Here we load all the files from the project directory into the local variables fdataMaster, fdataEminem, fdataKatyPerry, fdataLMFAO, fdataPsy, and fdataShakira.
# Read the Data folder called "project" in the current working directory, which holds FIVE csv files and ONE Excel master file.
workDir <- getwd()
filePath <- paste0(workDir,"/project")
fileName <- list.files(path=filePath)
SpamData <- data.frame(NA)
# Start of code to read data from folder
for (n in fileName){
# Get the file name and key
tempName <- str_split(n,"-",n = 2,simplify = TRUE)[1,2]
key <- str_split(tempName,"(.xlsx)|(.csv)",n=2,simplify = TRUE)[1,1]
if(str_detect(n,".xlsx$")){
# Using read_excel from the readxl package
# We know it's the Master file
assign(paste0("fdata",key),read_excel(paste0(filePath,"/",n)))
} else{
# We know it's the comment data file for one of the videos in the Master file
assign(paste0("fdata",key),read_csv(paste0(filePath,"/",n)))
}
print(paste0("fdata",key))
}
## Multiple files in zip: reading 'Youtube01-Psy.csv'
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdata"
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double(),
## key = col_character(),
## YID = col_character(),
## YEAR = col_double(),
## DAY = col_double(),
## WDAY = col_character(),
## HOUR = col_double(),
## TIME = col_time(format = ""),
## ConDupcount = col_double(),
## AuthDupcount = col_double()
## )
## [1] "fdata"
## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.4.2.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.
## [1] "fdataMaster"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataPsy"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataKatyPerry"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataLMFAO"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataEminem"
## Parsed with column specification:
## cols(
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double()
## )
## [1] "fdataShakira"
# End of loading data
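As an aside, the five comment files could also be read into one tibble in a single pass with purrr instead of assign(); a minimal sketch under the same Youtube*-Name.csv naming convention (csvFiles and allComments are hypothetical names):

library(tidyverse)
csvFiles <- list.files(filePath, pattern = "\\.csv$", full.names = TRUE)
allComments <- map_dfr(csvFiles, function(f) {
# Derive the key ("Psy", "KatyPerry", ...) from the file name
tempName <- str_split(basename(f), "-", n = 2, simplify = TRUE)[1, 2]
read_csv(f) %>% mutate(key = str_remove(tempName, "\\.csv$"))
})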
# Function to show a glimpse of the data
showGlimpse <- function(x = NULL){
if(!is.null(x)){
# sprintf (not paste0) substitutes x into the %s placeholder
print(sprintf("--------------Start of %s-----------------------", x))
# Look the object up by name, then show its head and structure
print(head(eval(parse(text = x))))
glimpse(eval(parse(text = x)))
print(sprintf("--------------End of %s-----------------------", x))
cat("\n")
return(invisible(NULL))}
print("Data--------------Start of fdataMaster-----------------------")
print(head(fdataMaster))
print("Data--------------End of fdataMaster-----------------------")
cat("\n")
print("Data--------------Start of fdataEminem-----------------------")
glimpse(fdataEminem)
print("Data--------------End of fdataEminem-----------------------")
cat("\n")
print("Data--------------Start of fdataShakira-----------------------")
glimpse(fdataShakira)
print("Data--------------End of fdataShakira-----------------------")
cat("\n")
print("Data--------------Start of fdataKatyPerry-----------------------")
glimpse(fdataKatyPerry)
print("Data--------------End of fdataKatyPerry-----------------------")
cat("\n")
print("Data--------------Start of fdataLMFAO-----------------------")
glimpse(fdataLMFAO)
print("Data--------------End of fdataLMFAO-----------------------")
cat("\n")
print("Data--------------Start of fdataPsy-----------------------")
glimpse(fdataPsy)
print("Data--------------End of fdataPsy-----------------------")
cat("\n")
print("Data--------------Start of SpamData-----------------------")
glimpse(SpamData)
print("Data--------------End of SpamData-----------------------")
cat("\n")
}
showGlimpse()
## [1] "Data--------------Start of fdataMaster-----------------------"
## # A tibble: 5 x 5
## Dataset `YouTube ID` Spam Ham Total
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Psy 9bZkp7q19f0 175 175 350
## 2 KatyPerry CevxZvSJLk8 175 175 350
## 3 LMFAO KQ6zr6kCPj8 236 202 438
## 4 Eminem uelHwf8o7_U 245 203 448
## 5 Shakira pRpeEdMmmQ0 174 196 370
## [1] "Data--------------End of fdataMaster-----------------------"
##
## [1] "Data--------------Start of fdataEminem-----------------------"
## Observations: 448
## Variables: 5
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## [1] "Data--------------End of fdataEminem-----------------------"
##
## [1] "Data--------------Start of fdataShakira-----------------------"
## Observations: 370
## Variables: 5
## $ COMMENT_ID <chr> "z13lgffb5w3ddx1ul22qy1wxspy5cpkz504", "z123dbgb0mq...
## $ AUTHOR <chr> "dharma pal", "Tiza Arellano", "Prìñçess Âlis Løvê ...
## $ DATE <dttm> 2015-05-29 02:30:18, 2015-05-29 00:14:48, 2015-05-...
## $ CONTENT <chr> "Nice song<U+FEFF>", "I love song <U+FEFF>", "I love song <U+FEFF>", "86...
## $ CLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## [1] "Data--------------End of fdataShakira-----------------------"
##
## [1] "Data--------------Start of fdataKatyPerry-----------------------"
## Observations: 350
## Variables: 5
## $ COMMENT_ID <chr> "z12pgdhovmrktzm3i23es5d5junftft3f", "z13yx345uxepe...
## $ AUTHOR <chr> "lekanaVEVO1", "Pyunghee", "Erica Ross", "Aviel Hai...
## $ DATE <dttm> 2014-07-22 15:27:50, 2014-07-27 01:57:16, 2014-07-...
## $ CONTENT <chr> "i love this so much. AND also I Generate Free Lead...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## [1] "Data--------------End of fdataKatyPerry-----------------------"
##
## [1] "Data--------------Start of fdataLMFAO-----------------------"
## Observations: 438
## Variables: 5
## $ COMMENT_ID <chr> "z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k", "z124jvcza...
## $ AUTHOR <chr> "Corey Wilson", "Epic Gaming", "LaS Music", "Cheryl...
## $ DATE <dttm> 2015-05-28 21:39:52, 2015-05-28 20:07:20, 2015-05-...
## $ CONTENT <chr> "<a href=\"http://www.youtube.com/watch?v=KQ6zr6kCP...
## $ CLASS <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## [1] "Data--------------End of fdataLMFAO-----------------------"
##
## [1] "Data--------------Start of fdataPsy-----------------------"
## Observations: 350
## Variables: 5
## $ COMMENT_ID <chr> "LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU", "LZQ...
## $ AUTHOR <chr> "Julius NM", "adam riyati", "Evgeny Murashkin", "El...
## $ DATE <dttm> 2013-11-07 06:20:48, 2013-11-07 12:37:15, 2013-11-...
## $ CONTENT <chr> "Huh, anyway check out this you[tube] channel: koby...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## [1] "Data--------------End of fdataPsy-----------------------"
##
## [1] "Data--------------Start of SpamData-----------------------"
## Observations: 1
## Variables: 1
## $ NA. <lgl> NA
## [1] "Data--------------End of SpamData-----------------------"
# Add key and name as columns to each dataset, so that we can combine all the data in one file.
# Here I load the data and check whether it is listed in the Master file.
# If yes, add two columns at the beginning of the dataset: "key" for the
# dataset name and "YID" for the YouTube id.
fdataKatyPerry <- mutate(fdataKatyPerry,key= "KatyPerry", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "KatyPerry"),][1,2]))
fdataEminem <- mutate(fdataEminem,key= "Eminem", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Eminem"),][1,2]))
fdataLMFAO <- mutate(fdataLMFAO,key= "LMFAO", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "LMFAO"),][1,2]))
fdataPsy <- mutate(fdataPsy,key= "Psy", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Psy"),][1,2]))
fdataShakira <- mutate(fdataShakira,key= "Shakira", YID=as.character(fdataMaster[which(fdataMaster$Dataset == "Shakira"),][1,2]))
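A less repetitive alternative is to look the YouTube id up from the master table inside a small helper; a minimal sketch assuming the fdataMaster columns shown above (addKeyYID is a hypothetical helper name):

# Tag one comment tibble with its key and YouTube id from the master table
addKeyYID <- function(df, keyName) {
yid <- fdataMaster$`YouTube ID`[fdataMaster$Dataset == keyName]
mutate(df, key = keyName, YID = as.character(yid))
}
# e.g. fdataPsy <- addKeyYID(fdataPsy, "Psy"), and likewise for the other four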
showGlimpse() # Check tibbles to see if they have all the new columns
## [1] "Data--------------Start of fdataMaster-----------------------"
## # A tibble: 5 x 5
## Dataset `YouTube ID` Spam Ham Total
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Psy 9bZkp7q19f0 175 175 350
## 2 KatyPerry CevxZvSJLk8 175 175 350
## 3 LMFAO KQ6zr6kCPj8 236 202 438
## 4 Eminem uelHwf8o7_U 245 203 448
## 5 Shakira pRpeEdMmmQ0 174 196 370
## [1] "Data--------------End of fdataMaster-----------------------"
##
## [1] "Data--------------Start of fdataEminem-----------------------"
## Observations: 448
## Variables: 7
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## $ key <chr> "Eminem", "Eminem", "Eminem", "Eminem", "Eminem", "...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
## [1] "Data--------------End of fdataEminem-----------------------"
##
## [1] "Data--------------Start of fdataShakira-----------------------"
## Observations: 370
## Variables: 7
## $ COMMENT_ID <chr> "z13lgffb5w3ddx1ul22qy1wxspy5cpkz504", "z123dbgb0mq...
## $ AUTHOR <chr> "dharma pal", "Tiza Arellano", "Prìñçess Âlis Løvê ...
## $ DATE <dttm> 2015-05-29 02:30:18, 2015-05-29 00:14:48, 2015-05-...
## $ CONTENT <chr> "Nice song<U+FEFF>", "I love song <U+FEFF>", "I love song <U+FEFF>", "86...
## $ CLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ key <chr> "Shakira", "Shakira", "Shakira", "Shakira", "Shakir...
## $ YID <chr> "pRpeEdMmmQ0", "pRpeEdMmmQ0", "pRpeEdMmmQ0", "pRpeE...
## [1] "Data--------------End of fdataShakira-----------------------"
##
## [1] "Data--------------Start of fdataKatyPerry-----------------------"
## Observations: 350
## Variables: 7
## $ COMMENT_ID <chr> "z12pgdhovmrktzm3i23es5d5junftft3f", "z13yx345uxepe...
## $ AUTHOR <chr> "lekanaVEVO1", "Pyunghee", "Erica Ross", "Aviel Hai...
## $ DATE <dttm> 2014-07-22 15:27:50, 2014-07-27 01:57:16, 2014-07-...
## $ CONTENT <chr> "i love this so much. AND also I Generate Free Lead...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ key <chr> "KatyPerry", "KatyPerry", "KatyPerry", "KatyPerry",...
## $ YID <chr> "CevxZvSJLk8", "CevxZvSJLk8", "CevxZvSJLk8", "CevxZ...
## [1] "Data--------------End of fdataKatyPerry-----------------------"
##
## [1] "Data--------------Start of fdataLMFAO-----------------------"
## Observations: 438
## Variables: 7
## $ COMMENT_ID <chr> "z13uwn2heqndtr5g304ccv5j5kqqzxjadmc0k", "z124jvcza...
## $ AUTHOR <chr> "Corey Wilson", "Epic Gaming", "LaS Music", "Cheryl...
## $ DATE <dttm> 2015-05-28 21:39:52, 2015-05-28 20:07:20, 2015-05-...
## $ CONTENT <chr> "<a href=\"http://www.youtube.com/watch?v=KQ6zr6kCP...
## $ CLASS <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ key <chr> "LMFAO", "LMFAO", "LMFAO", "LMFAO", "LMFAO", "LMFAO...
## $ YID <chr> "KQ6zr6kCPj8", "KQ6zr6kCPj8", "KQ6zr6kCPj8", "KQ6zr...
## [1] "Data--------------End of fdataLMFAO-----------------------"
##
## [1] "Data--------------Start of fdataPsy-----------------------"
## Observations: 350
## Variables: 7
## $ COMMENT_ID <chr> "LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU", "LZQ...
## $ AUTHOR <chr> "Julius NM", "adam riyati", "Evgeny Murashkin", "El...
## $ DATE <dttm> 2013-11-07 06:20:48, 2013-11-07 12:37:15, 2013-11-...
## $ CONTENT <chr> "Huh, anyway check out this you[tube] channel: koby...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ key <chr> "Psy", "Psy", "Psy", "Psy", "Psy", "Psy", "Psy", "P...
## $ YID <chr> "9bZkp7q19f0", "9bZkp7q19f0", "9bZkp7q19f0", "9bZkp...
## [1] "Data--------------End of fdataPsy-----------------------"
##
## [1] "Data--------------Start of SpamData-----------------------"
## Observations: 1
## Variables: 1
## $ NA. <lgl> NA
## [1] "Data--------------End of SpamData-----------------------"
# Now we use rbind to combine the five files into one data frame, "SpamData"
SpamData <- rbind(fdataEminem,fdataKatyPerry,fdataLMFAO,fdataPsy,fdataShakira)
head(SpamData)
showGlimpse("SpamData")
## [1] "--------------Start of SpamData-----------------------"
## # A tibble: 2,044 x 7
## COMMENT_ID AUTHOR DATE CONTENT CLASS key YID
## <chr> <chr> <dttm> <chr> <dbl> <chr> <chr>
## 1 z12rwfnyyrbs~ Lisa W~ NA +4479354541~ 1 Emin~ uelH~
## 2 z130wpnwwnyu~ jason ~ 2015-05-29 02:26:10 I always en~ 0 Emin~ uelH~
## 3 z13vsfqirtav~ Ajkal ~ NA "my sister ~ 1 Emin~ uelH~
## 4 z12wjzc4eprn~ Dakota~ 2015-05-29 02:13:07 Cool<U+FEFF> 0 Emin~ uelH~
## 5 z13xjfr42z3u~ Jihad ~ NA Hello I'~ 1 Emin~ uelH~
## 6 z133yfmjdur4~ Darrio~ 2015-05-29 01:27:30 Wow this vi~ 0 Emin~ uelH~
## 7 z12zgrw5furd~ kyeman~ NA Go check ou~ 1 Emin~ uelH~
## 8 z12vxdzzds2k~ Damax 2015-05-29 00:41:22 Almost 1 bi~ 0 Emin~ uelH~
## 9 z12gxdortqzw~ Muhamm~ NA Aslamu Lyku~ 1 Emin~ uelH~
## 10 z132wd4ywmic~ JuanPa~ 2015-05-28 23:23:41 Eminem is i~ 0 Emin~ uelH~
## # ... with 2,034 more rows
## chr "SpamData"
## [1] "--------------End of %s-----------------------SpamData"
## NULL
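Note that dplyr's bind_rows() is an equivalent and somewhat safer way to stack the five tibbles, since it matches columns by name rather than by position; a minimal sketch:

# Equivalent to the rbind() above
SpamData <- bind_rows(fdataEminem, fdataKatyPerry, fdataLMFAO, fdataPsy, fdataShakira)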
# Let's check the structure of the columns
glimpse(SpamData)
## Observations: 2,044
## Variables: 7
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## $ key <chr> "Eminem", "Eminem", "Eminem", "Eminem", "Eminem", "...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
# Converting key to a factor
SpamData$key = as.factor(SpamData$key)
glimpse(SpamData)
## Observations: 2,044
## Variables: 7
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwnyue...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dakot...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I alw...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, ...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, Emi...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uelHw...
# Check unique values of key; this confirms we have data for all five videos in one dataset
unique(SpamData$key)
## [1] Eminem KatyPerry LMFAO Psy Shakira
## Levels: Eminem KatyPerry LMFAO Psy Shakira
# Check the dataset's time span
unique(year(SpamData$DATE))
## [1] NA 2015 2014 2013
# Add YEAR, month (stored in the DAY column), and weekday columns derived from DATE
SpamData$YEAR <- as.factor(year(SpamData$DATE))
SpamData$DAY <- as.character(month(SpamData$DATE))
SpamData$WDAY <- as.factor(weekdays(SpamData$DATE))
# See the hours of the day at which comments were posted
unique(hour(SpamData$DATE))
## [1] NA 2 1 0 23 22 21 20 19 18 17 16 15 11 7 6 3 9 4 12 10 8 5
## [24] 14 13
# Add HOUR and TIME columns derived from the DATE column
SpamData$HOUR <- hour(SpamData$DATE)
SpamData$TIME <- str_sub(SpamData$DATE,12,end = -1L) # the time component starts at the 12th character of the date string
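The fixed-position substring works because POSIXct values print in a fixed "YYYY-MM-DD HH:MM:SS" layout; format() states the same intent more directly. An equivalent sketch:

# Equivalent extraction of the time-of-day component
SpamData$TIME <- format(SpamData$DATE, "%H:%M:%S")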
# Now let's work with the Author data. We don't expect any duplicates, but we check anyway
dups_Author <- which(duplicated(SpamData$AUTHOR))
# It looks like some identical comments were posted on multiple videos (not checking yet whether by the same user)
dup_comments <- which(duplicated(SpamData$CONTENT))
#################################### Adding a count of identical comments
SpamData_ccount <- count(SpamData$CONTENT)
SpamData$ConDupcount <- mapply(function(x)(SpamData_ccount[which(SpamData_ccount$x==x),][1,2]),SpamData$CONTENT)
#################################### Adding a count of repeated authors
SpamData_tcount <- count(SpamData$AUTHOR)
SpamData$AuthDupcount <- mapply(function(x)(SpamData_tcount[which(SpamData_tcount$x==x),][1,2]),SpamData$AUTHOR)
####################################
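The same duplicate counts can be attached with dplyr's add_count(), avoiding the row-by-row mapply() lookups; a minimal sketch (assuming a dplyr version that supports the name argument):

SpamData <- SpamData %>%
add_count(CONTENT, name = "ConDupcount") %>%  # how often each comment text appears
add_count(AUTHOR, name = "AuthDupcount")      # how often each author appears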
# So we have a few rows with NA dates; they may not add much value to our analysis. Let's park this data in another dataset.
# Create TWO new datasets: validSPAM with only rows that have a valid date, and NASPAM with rows where the date is NA.
suppressWarnings(rm(validSPAM)) # remove old variable
suppressWarnings(rm(NASPAM))    # remove old variable
# Creating new datasets with all the columns from the MAIN data
validSPAM <- SpamData[which(SpamData$DAY != 'NA'),]
NASPAM <- SpamData[-which(SpamData$DAY != 'NA'),]
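The same split can also be expressed with is.na() on DATE directly, which is more robust than comparing the derived DAY column against the string 'NA'; an equivalent sketch:

validSPAM <- SpamData[!is.na(SpamData$DATE), ]
NASPAM <- SpamData[is.na(SpamData$DATE), ]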
glimpse(SpamData)
## Observations: 2,044
## Variables: 14
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwny...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dak...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I a...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> NA, 2015, NA, 2015, NA, 2015, NA, 2015, NA, 2015,...
## $ DAY <chr> NA, "5", NA, "5", NA, "5", NA, "5", NA, "5", NA, ...
## $ WDAY <fct> NA, Friday, NA, Friday, NA, Friday, NA, Friday, N...
## $ HOUR <int> NA, 2, NA, 2, NA, 1, NA, 0, NA, 23, NA, 23, NA, 2...
## $ TIME <chr> NA, "02:26:10", NA, "02:13:07", NA, "01:27:30", N...
## $ ConDupcount <int> 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
glimpse(validSPAM)
## Observations: 1,799
## Variables: 14
## $ COMMENT_ID <chr> "z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04", "z12wjzc4e...
## $ AUTHOR <chr> "jason graham", "Dakota Taylor", "Darrion Johnson...
## $ DATE <dttm> 2015-05-29 02:26:10, 2015-05-29 02:13:07, 2015-0...
## $ CONTENT <chr> "I always end up coming back to this song<br /><U+FEFF>"...
## $ CLASS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2...
## $ DAY <chr> "5", "5", "5", "5", "5", "5", "5", "5", "5", "5",...
## $ WDAY <fct> Friday, Friday, Friday, Friday, Thursday, Thursda...
## $ HOUR <int> 2, 2, 1, 0, 23, 23, 22, 22, 21, 21, 20, 20, 20, 1...
## $ TIME <chr> "02:26:10", "02:13:07", "01:27:30", "00:41:22", "...
## $ ConDupcount <int> 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
glimpse(NASPAM)
## Observations: 245
## Variables: 14
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z13vsfqirta...
## $ AUTHOR <chr> "Lisa Wellas", "Ajkal Khan", "Jihad Naser", "kyem...
## $ DATE <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "my ...
## $ CLASS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ DAY <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ WDAY <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ HOUR <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ TIME <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ ConDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 36, 1, 1, 1, 36, 1,...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Convert 0 and 1 to the text labels "NO-SPAM" and "SPAM" and create a new dataset
SpamData_text <- mutate(SpamData,CLASS= ifelse(CLASS == 0, "NO-SPAM", "SPAM"))
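A type-safe alternative is to recode CLASS as a labelled factor, which keeps the 0/1 ordering explicit; a small sketch (classAsFactor is a hypothetical name):

classAsFactor <- factor(SpamData$CLASS, levels = c(0, 1), labels = c("NO-SPAM", "SPAM"))
table(classAsFactor)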
head(SpamData_text[,c("CLASS","CONTENT")])

# SPAM data by year and type
SpamData_Content <- SpamData_text %>% select("CLASS","CONTENT","YEAR","DAY")

Read a JSON file from the web to perform str_detect against the collection of bad words.
# Get the bad-words JSON file from GitHub for the string comparison
#install.packages("rjson")
badwordURL <- "https://raw.githubusercontent.com/web-mech/badwords/master/lib/lang.json"
#library(rjson)
suppressWarnings(rm(dontSay))
raw <- read_file(badwordURL)
dontSay <- fromJSON(raw)
class(dontSay)
## [1] "list"
dontSay <- as.data.frame(dontSay$words)
colnames(dontSay) <- c("words")
dontSay <- as.character(dontSay$words)
# Escape all regex metacharacters in the bad words ( . \ | ( ) [ ] { } ^ $ * + ? )
# in a single pass, so backslashes added by earlier substitutions are not escaped twice
dontSay <- gsub("([][{}()+*^$|\\\\?.])", "\\\\\\1", dontSay)
# Since str_detect needs a single pattern to match against, collapse the vector with paste
checkword <- paste(dontSay, collapse = '|')
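For intuition, the collapsed pattern is one big alternation, so a single str_detect() call checks a comment against every bad word at once; a toy example with made-up words:

toyPattern <- paste(c("foo", "bar"), collapse = "|")  # "foo|bar"
str_detect("some BAR comment", regex(toyPattern, ignore_case = TRUE))  # TRUE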
dontSay_data <- SpamData_Content[which(str_detect(SpamData_Content$CONTENT,pattern =regex(checkword,ignore_case = TRUE)) ),]
dontSay_data$CLASS <- as.factor(dontSay_data$CLASS)
summary(dontSay_data)
## CLASS CONTENT YEAR DAY
## NO-SPAM: 72 Length:190 2013:25 Length:190
## SPAM :118 Class :character 2014:51 Class :character
## Mode :character 2015:78 Mode :character
## NA's:36
## Messages marked as SPAM that contain bad words
glimpse(dontSay_data[which(dontSay_data$CLASS=="SPAM"),])
## Observations: 118
## Variables: 4
## $ CLASS <fct> SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, SPAM, ...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "my siste...
## $ YEAR <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ DAY <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Add a "bad" column to the new dataset indicating whether the comment contains bad words
SpamData_text <- mutate(SpamData_text, bad = str_detect(SpamData_text$CONTENT, pattern = checkword))
# summary(SpamData_text)
# plyr::count(SpamData_text ,vars=c("CLASS","bad")) %>%
#   plyr::rename( c("CLASS"="TYPE","bad"="Bad words used","freq"="Number of Comments"))

# Write "SpamData" to the local directory
write.csv(SpamData, file = "project/SpamData.csv")
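As an aside, readr's write_csv() would skip the row-name column that write.csv() adds; that extra column is what later comes back as the unnamed X1 column when the file is re-read. A sketch:

# Equivalent write without the row-name column
readr::write_csv(SpamData, "project/SpamData.csv")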
Data Exploration: This should include summary statistics, means, medians, quartiles, or any other relevant information about the data set. Please include some conclusions in the R Markdown text.

Here we perform string operations, create new datasets, and calculate the mean, median, and other summary statistics.
glimpse(SpamData)
## Observations: 2,044
## Variables: 14
## $ COMMENT_ID <chr> "z12rwfnyyrbsefonb232i5ehdxzkjzjs2", "z130wpnwwny...
## $ AUTHOR <chr> "Lisa Wellas", "jason graham", "Ajkal Khan", "Dak...
## $ DATE <dttm> NA, 2015-05-29 02:26:10, NA, 2015-05-29 02:13:07...
## $ CONTENT <chr> "+447935454150 lovely girl talk to me xxx<U+FEFF>", "I a...
## $ CLASS <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1...
## $ key <fct> Eminem, Eminem, Eminem, Eminem, Eminem, Eminem, E...
## $ YID <chr> "uelHwf8o7_U", "uelHwf8o7_U", "uelHwf8o7_U", "uel...
## $ YEAR <fct> NA, 2015, NA, 2015, NA, 2015, NA, 2015, NA, 2015,...
## $ DAY <chr> NA, "5", NA, "5", NA, "5", NA, "5", NA, "5", NA, ...
## $ WDAY <fct> NA, Friday, NA, Friday, NA, Friday, NA, Friday, N...
## $ HOUR <int> NA, 2, NA, 2, NA, 1, NA, 0, NA, 23, NA, 23, NA, 2...
## $ TIME <chr> NA, "02:26:10", NA, "02:13:07", NA, "01:27:30", N...
## $ ConDupcount <int> 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3...
## $ AuthDupcount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
head(fdataMaster)
summary(SpamData)
## COMMENT_ID AUTHOR DATE
## Length:2044 Length:2044 Min. :2013-07-12 22:33:27
## Class :character Class :character 1st Qu.:2014-09-25 21:39:20
## Mode :character Mode :character Median :2015-03-29 07:12:48
## Mean :2014-12-25 18:08:03
## 3rd Qu.:2015-05-22 16:27:17
## Max. :2015-06-05 20:01:23
## NA's :245
## CONTENT CLASS key YID
## Length:2044 Min. :0.0000 Eminem :448 Length:2044
## Class :character 1st Qu.:0.0000 KatyPerry:350 Class :character
## Mode :character Median :1.0000 LMFAO :438 Mode :character
## Mean :0.5215 Psy :438
## 3rd Qu.:1.0000 Shakira :370
## Max. :1.0000
##
## YEAR DAY WDAY HOUR
## 2013: 200 Length:2044 Tuesday :309 Min. : 0.00
## 2014: 517 Class :character Saturday :290 1st Qu.: 6.00
## 2015:1082 Mode :character Thursday :272 Median :14.00
## NA's: 245 Wednesday:269 Mean :12.33
## Sunday :242 3rd Qu.:18.00
## (Other) :417 Max. :23.00
## NA's :245 NA's :245
## TIME ConDupcount AuthDupcount
## Length:2044 Min. : 1.00 Min. :1.00
## Class :character 1st Qu.: 1.00 1st Qu.:1.00
## Mode :character Median : 1.00 Median :2.00
## Mean : 16.33 Mean :1.77
## 3rd Qu.: 2.00 3rd Qu.:2.00
## Max. :171.00 Max. :8.00
##
ddply(SpamData, "key",summarise, duration = max(as.numeric(YEAR)) - min(as.numeric(YEAR)))# count the total number of SAPM by key column
# The output of ddply is just the give column as we have user Summarise fucnation, if we use funcation tranform
# SpamDataT <- ddply(SpamData, "key", transform, Spam = sum( CLASS ) ) it would result full data set for use to user
# Data Of Key Video and SPAM Count
# Sine CALSS Is logical and 1 is for SPAM and 0 for NO SPAM , Sum of CLASS would give sum of SAPM content
# Rename the column by doing pipe %>% of the data to rename of plys pacakge
ddply(SpamData, "key", summarise, Spam = sum( CLASS ) ) %>%
rename( c("key"="Video Name","Spam"= " SPAM Count")) # Spam over days of week
ddply(SpamData, "WDAY", summarise, Spam = sum( CLASS ) ) %>%
rename( c("WDAY"="Day of week","Spam"= " SPAM Count")) # Count of each value of "Key" in the first spam data
SpamData_keyCount <- count(SpamData, vars = "key")
# Calculate % of SPAM by video
SpamData_percent <- ddply(SpamData, "key", summarise, Spam = sum( CLASS ) ) %>%
mutate( SPAM_PERCENT = as.numeric(Spam)* 100/SpamData_keyCount$freq[which(SpamData_keyCount$key== key )] ,
TOTAL_COMMENTS = SpamData_keyCount$freq[which(SpamData_keyCount$key== key )])
SpamData_percent[order(SpamData_percent$TOTAL_COMMENTS,decreasing = TRUE),]

# Mean of repeat-author counts and of the hour of day
sapply( list( "Repeat Author" = SpamData$AuthDupcount,"Hour of Day" = SpamData$HOUR ), FUN =mean, na.rm=TRUE)
## Repeat Author Hour of Day
## 1.770059 12.327404
# Mean of SPAM % and mean of total comments across videos
sapply(list("Spam % Mean" = SpamData_percent$SPAM_PERCENT, "| Total Comment Mean"=SpamData_percent$TOTAL_COMMENTS),FUN = mean)
## Spam % Mean | Total Comment Mean
## 51.89542 408.80000
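For reference, the same per-video spam percentages can be computed in a single dplyr pipeline, avoiding the lookup into SpamData_keyCount; a minimal sketch (dplyr::summarise is written explicitly because plyr is also loaded):

SpamData %>%
group_by(key) %>%
dplyr::summarise(Spam = sum(CLASS),
                 TOTAL_COMMENTS = length(CLASS),
                 SPAM_PERCENT = 100 * Spam / TOTAL_COMMENTS) %>%
arrange(desc(TOTAL_COMMENTS))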
# Mean of hour of day and day of the month
sapply( list( "Hour_Mean" = SpamData$HOUR,"Day_Mean" = day(SpamData$DATE) ), mean, na.rm=TRUE)
## Hour_Mean Day_Mean
## 12.32740 17.98221
# Aggregate data by video key and weekday to see how many SPAM comments were reported on each day
aggregate(list("SPAM_COUNT" = SpamData$CLASS ),by = list("ID" = SpamData$key, "Day_Of_Week" = SpamData$WDAY ) , FUN= sum,na.action=FALSE )

library(dplyr)
# Total comments by day of the week
SpamData_total_spam_weekday <- plyr::count(SpamData, vars = c("key","WDAY"))
SpamData_total_spam_weekday

# Count of comments by year
SpamData_total_spam_year <- plyr::count(SpamData, vars = c("key","YEAR"))
summary(SpamData_total_spam_year)
## key YEAR freq
## Eminem :2 2013:1 Min. : 15.0
## KatyPerry:2 2014:3 1st Qu.:110.8
## LMFAO :2 2015:5 Median :201.5
## Psy :2 NA's:1 Mean :204.4
## Shakira :2 3rd Qu.:312.5
## Max. :347.0
# It looks like we have 0 for Eminem on every day of the week. Let's check it against some sample SPAM rows in the main dataset
SpamData[which(SpamData$CLASS == 1 & SpamData$key == "Eminem"),c("key", "WDAY","CLASS","DATE")]

# We see NA for WDAY, which is correct since DATE is NA for those rows. To validate further, let's check the non-SPAM data for Eminem
SpamData[which(SpamData$CLASS == 0 & SpamData$key == "Eminem"),c("key", "WDAY","CLASS","DATE")]

# Median of hour of day and day of the month
sapply( list( "Hour_Median" = SpamData$HOUR,"Day_Median" = day(SpamData$DATE) ), median, na.rm=TRUE)## Hour_Median Day_Median
## 14 20
qunat_SPAM_byDay <- quantile(aggregate(list("SPAM_COUNT" = SpamData$CLASS), by= list("Day" = SpamData$WDAY),sum)$SPAM_COUNT)
sprintf("The first, second and third quartiles of the 'SPAM Count' by weekday are %s, %s and %s spams respectively.",qunat_SPAM_byDay[2],qunat_SPAM_byDay[3],qunat_SPAM_byDay[4])
## [1] "The first, second and third quartiles of the 'SPAM Count' by weekday are 103, 116 and 124 spams respectively."
# Using colored formatting from the crayon package
cat("The first, second and third " %+% blue$underline$bold ('quartiles') %+% " of the " %+% red("SPAM Count") %+% " by weekday are ", blue(qunat_SPAM_byDay[2]),",",blue(qunat_SPAM_byDay[3]), " and " , blue(qunat_SPAM_byDay[4]) , " spams respectively.")
## The first, second and third quartiles of the SPAM Count by weekday are 103 , 116 and 124 spams respectively.
| Spam % Mean | Total Comment Mean |
|---|---|
| 51.89542 | 408.80000 |
5 SPAM Data by Weekday: the first, second and third quartiles of the SPAM count by weekday are 103, 116 and 124 spams respectively. The full quantile vector qunat_SPAM_byDay:
| 0% | 25% | 50% | 75% | 100% |
|---|---|---|---|---|
| 95 | 103 | 116 | 124 | 156 |
Graphics: Please make sure to display at least one scatter plot, box plot and histogram. Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.
Creating different types of datasets and graphs that can support our final data presentation later.

## Graphs {.tabset .tabset-fade .tabset}
library(plyr)
head(SpamData_total_spam_year)

# Comments over the years on the videos
ggplot(SpamData_total_spam_year,mapping=aes(x=key,y=YEAR,fill= YEAR)) +
geom_col()+
ggtitle("Comments Over Year on the Videos")############################ Data by only Key year and Class using SPREAD to move ROW TO COLUMN
SpamData_kcy <- SpamData_text%>%select(key,CLASS,YEAR)
SpamData_kcy_f<- plyr::count(SpamData_kcy ,vars =c("key","YEAR","CLASS")) %>%
spread(CLASS, freq)
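In current tidyr, pivot_wider() supersedes spread(); an equivalent sketch, assuming tidyr >= 1.0.0:

SpamData_kcy_f <- plyr::count(SpamData_kcy, vars = c("key","YEAR","CLASS")) %>%
tidyr::pivot_wider(names_from = CLASS, values_from = freq)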
## Graph of SPAM only, per video per year. It is very clear that 2014 and 2015 recorded an increase in comments, and hence in SPAM
ggplot(SpamData_kcy_f,mapping=aes(x=key, y=SPAM, fill = YEAR)) +
geom_col(na.rm = FALSE) +facet_wrap(~YEAR) +
labs(title="SPAM over Year on Videos",
x ="Video ", y = "SPAM Count")+
ggtitle(" Impact of SPAM on Number of Videos by year") +
theme(plot.title = element_text(color="gray", size=14, face="bold" ))## Warning: Removed 1 rows containing missing values (position_stack).
# Impact of SPAM on Number of Videos by year
ggplot(SpamData_total_spam_year,mapping=aes(x=YEAR,fill= freq)) +
geom_bar() +
theme(plot.background = element_rect(color = "orange")) +
theme(panel.background = element_blank()) +
theme(panel.grid.major = element_line(color="blue")) +
theme(panel.grid.minor = element_blank())+
theme(panel.grid.major.x = element_blank())+
ylab( "No Of Videos Impacted") +
xlab( "Year" ) +
ggtitle(" Impact of SPAM on Number of Videos by year") +
theme(plot.title = element_text(color="red", size=14, face="bold.italic" ))## using box plt to show that data was very high in 2014 and 2015
ggplot(SpamData_kcy_f,mapping=aes(x=YEAR, y= SPAM)) +
geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# We can clearly see that spamming is not a regular process; it happens in chunks, with very few exceptions.
ggplot(SpamData_text,mapping=aes(x=key, y=DATE,color = CLASS)) +
geom_boxplot(na.rm = TRUE)+
ylab( "Date in Year") +
xlab( "Video Key" ) +
theme(panel.background = element_blank(),
legend.key = element_blank()) + #Gray color behind the actual legend
theme(panel.grid.major = element_line(color="blue")) +
theme(panel.grid.minor = element_blank())+
theme(panel.grid.major.y = element_blank())+
scale_color_discrete(name="Comment Type")+
ggtitle(" Type of comments on Videos by year") +
theme(plot.title = element_text(color="red", size=12, face="bold" ))#SPAM and NO SPAM BY YEAR
SpamData_byYear <- ddply(SpamData, "YEAR", summarise, Spam = sum(CLASS), TOTAL_COMMENTS = length(CLASS)) %>%
mutate(SPAM_PERCENT = 100 * Spam / TOTAL_COMMENTS)
# Get data for one video over the period of the dataset
SpamData_text[order(SpamData_text$DAY),] %>%
subset(key== "LMFAO") %>%
ggplot(mapping=aes(x=DAY, y=HOUR,color= CLASS)) +
geom_point()+
xlab("Month")# Run the same check on FUll Data Set now
# Its very clear here that during 1st half 2015 spam were higher than the regular comments in the months.
SpamData_text[order(SpamData_text$DAY),] %>%
ggplot(mapping=aes(x=DATE, y=HOUR,color= CLASS,na.rm = FALSE),na.rm = TRUE) +
geom_point(na.rm = TRUE,alpha= 1/3) +
geom_smooth(na.rm = TRUE,span = 0.1) +
labs(title =" Type of comments on Videos by Date and \n Hour of the day ", x= "DATE" , y ="Hour") +
theme(plot.title = element_text(color="skyblue", size=12, face="bold" ))## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# From Above char its clear how Spaming has been progressing and figting with good content . Its Also strange to NOTE that more than 50% of data is SPAM.# store the mean over year
myear<- mean(summary(as.factor(replace_na(as.character(SpamData_text$YEAR),replace = "9999"))))
mYear2 <- mean(summary(as.factor(replace_na(as.character(SpamData_text$YEAR),replace = "9999")))[c("2014","2013","9999")])
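The summary(as.factor(...)) idiom above just counts comments per year; table() expresses that more transparently. An equivalent sketch:

yearCounts <- table(replace_na(as.character(SpamData_text$YEAR), "9999"))
myear <- mean(yearCounts)                             # mean comments per year, NA bucket ("9999") included
mYear2 <- mean(yearCounts[c("2014", "2013", "9999")]) # mean excluding the large 2015 bucket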
################################## Histogram showing the same trend of increasing SPAM, plotted against the mean of all comments per month
ggplot(SpamData_text,mapping = aes(x=DATE, fill = CLASS)) +
# boundary = 0 starts the bins at the 0 coordinate
# bins = 10 creates 10 groups of data (by default ggplot picks 30 bins)
geom_histogram(na.rm= TRUE,boundary = 0,
bins = 10
) +
geom_hline(linetype = 5,
color="blue",
yintercept = myear) +
# Adding Annotation
annotate("text",label = sprintf("mean of comments over year %s",myear), x= as.POSIXct("2014-01-25 18:08:03"), y= 535 ) +
geom_hline(linetype = 2,
color="red",
yintercept = mYear2) +
annotate("text",label = sprintf("mean of comments till 2014, %.2f",mYear2), x= as.POSIXct("2014-01-25 18:08:03"), y= mYear2+20 )+
labs(title =" Type of comments on Videos by \n Year ", x= "DATE" , y ="Comments Count") +
theme(plot.title = element_text(color="Red", size=12, face="bold" ))library(plyr)
summary(SpamData_text[SpamData_text$YEAR == "2015",]$YEAR)## 2013 2014 2015 NA's
## 0 0 1082 245
#................................ The data points indicate that people comment more during NIGHT hours, but a little less during early working hours.
# The mean time of comment is 12:33 during the day.
xm <- as.numeric(summary(SpamData_text$HOUR)[4])
plyr::count(SpamData_text,c("HOUR","CLASS")) %>%
ggplot(mapping = aes(x=HOUR,y= freq, fill=CLASS)) +
geom_col(na.rm = TRUE) +
## # 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
geom_vline(na.rm = TRUE,linetype = 5,
color="blue",
xintercept =as.numeric(summary(SpamData_text$HOUR)[4])) +
theme(panel.background = element_blank(),
legend.key = element_blank()) +
# Adding Annotation
annotate("text",label = sprintf("mean of Time %.2f",as.numeric(summary(SpamData_text$HOUR)[4])), x= as.numeric(summary(SpamData_text$HOUR)[4]), y= 100 ) +
labs(title =" Type of Comments made during by Hours of day ", x= "Hour of Day" , y ="Comments Count") +
theme(plot.title = element_text(color="Red", size=12, face="bold" ))## Warning: Removed 1 rows containing missing values (position_stack).
xm <- as.numeric(summary(SpamData_text$HOUR)[4])
plyr::count(SpamData_text,c("HOUR","CLASS")) %>%
ggplot(mapping = aes(x=HOUR,y= freq, color=CLASS)) +
geom_point(na.rm = TRUE) +
geom_smooth(na.rm = TRUE,span = 0.8)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Indicating that more comments are created from evening to midnight.

Our analysis based on the data is as follows:

Both good comments and bad comments are increasing over the years (point 1)
People spend more time commenting from evening to midnight, and less in the morning (point 2)
Spammers target different videos at different times of the month; the same videos are not spammed every day or month (point 2)
Bad words are present in both good and spam comments, but not in very large numbers (point 3)

Spam has been present since the data was first collected and has increased over time. We can clearly see that SPAM volume is very close to that of good content for some videos in 2014 and for almost all videos in 2015.
Find SPAM / comment patterns over the years and months and during the hours of the day.

We can see that as time moves on, the SPAM box is also growing. It is worth noting that even good comments increased during 2015.
We can clearly see that spamming is not a regular process for a video; it happens in certain months, with very few exceptions.
In contrast, across multiple videos the data points for 2015 indicate that most videos were spammed that year; almost every month some videos were spammed.
The histogram shows the same increasing SPAM trend relative to the mean of all comments per month. Since the 2015 data is large, we also plotted the mean against all data except 2015. It is clear that 2015 added more good comments and also more SPAM.
The data points indicate that people commented more during night hours, and a little less during early working hours. Fewer comments in the morning? We can say yes.
It is very clear that during the first halves of 2014 and 2015, spam exceeded regular comments in several months.
The bulk of 2015 is visible again.
The table below makes it clear that bad words occur more often in SPAM, but strangely even non-spam comments contain bad words.
plyr::count(SpamData_text ,vars=c("CLASS","bad")) %>%
plyr::rename( c("CLASS"="TYPE","bad"="Bad words used","freq"="Number of Comments"))

BONUS - place the original .csv in a GitHub file and have R read from the link.
The SpamData master file created by this project is uploaded to GitHub and read back below as a raw CSV.
gitRawFile = "https://raw.githubusercontent.com/Rajwantmishra/msds/master/SpamData.csv"
#require(XML)
read.csv.url <- read.csv( url(gitRawFile))
head(read.csv.url)#require(read.table)
read.csv.Data <- read.csv(gitRawFile,header=T)
head(read.csv.Data )#library(data.table)
dataTableCSV <- fread(gitRawFile)
head(dataTableCSV)library(tidyverse)
tidyDataCSV <- read_csv(gitRawFile)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## COMMENT_ID = col_character(),
## AUTHOR = col_character(),
## DATE = col_datetime(format = ""),
## CONTENT = col_character(),
## CLASS = col_double(),
## key = col_character(),
## YID = col_character()
## )
str(tidyDataCSV)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 2044 obs. of 8 variables:
## $ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ COMMENT_ID: chr "z12rwfnyyrbsefonb232i5ehdxzkjzjs2" "z130wpnwwnyuetxcn23xf5k5ynmkdpjrj04" "z13vsfqirtavjvu0t22ezrgzyorwxhpf3" "z12wjzc4eprnvja4304cgbbizuved35wxcs" ...
## $ AUTHOR : chr "Lisa Wellas" "jason graham" "Ajkal Khan" "Dakota Taylor" ...
## $ DATE : POSIXct, format: NA "2015-05-29 02:26:10" ...
## $ CONTENT : chr "+447935454150 lovely girl talk to me xxx<U+FEFF>" "I always end up coming back to this song<br /><U+FEFF>" "my sister just received over 6,500 new <a rel=\"nofollow\" class=\"ot-hashtag\" href=\"https://plus.google.com/"| __truncated__ "Cool<U+FEFF>" ...
## $ CLASS : num 1 0 1 0 1 0 1 0 1 0 ...
## $ key : chr "Eminem" "Eminem" "Eminem" "Eminem" ...
## $ YID : chr "uelHwf8o7_U" "uelHwf8o7_U" "uelHwf8o7_U" "uelHwf8o7_U" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. COMMENT_ID = col_character(),
## .. AUTHOR = col_character(),
## .. DATE = col_datetime(format = ""),
## .. CONTENT = col_character(),
## .. CLASS = col_double(),
## .. key = col_character(),
## .. YID = col_character()
## .. )
head(tidyDataCSV)