# Path to the ratings CSV. Defined once so the preview below and every later
# read refer to the same file.
Movies_Ratings <- "C:\\Users\\xholi\\OneDrive\\Desktop\\ratings.csv"

# Peek at the first 7 raw lines (header line + 6 records) to confirm the
# column layout and the "," separator before parsing anything.
readLines(Movies_Ratings, n = 7)
## [1] "userId,movieId,rating,timestamp" "1,110,1.0,1425941529"
## [3] "1,147,4.5,1425942435" "1,858,5.0,1425941523"
## [5] "1,1221,5.0,1425941546" "1,1246,5.0,1425941556"
## [7] "1,1968,4.0,1425942148"

# Load the packages used by this analysis through pacman.
pacman::p_load(
  pacman, dplyr, GGally, ggplot2, ggthemes,
  ggvis, rio, rmarkdown, shiny, stringr, tidyr, plotly,
  lubridate
)
# ---- Viewing the data in chunks ----
# Number of rows to load into memory for the exploratory subset.
chunkSize <- 100000

# Read the header plus the first `chunkSize` records. Handing the path to
# read.table() lets it open and close the file itself, so the connection is
# released even when parsing fails part-way through.
data <- read.table(
  Movies_Ratings,
  nrows = chunkSize,
  header = TRUE,
  fill = TRUE,
  sep = ","
)

# Show the first rows of the subset.
head(data)
## userId movieId rating timestamp
## 1 1 110 1.0 1425941529
## 2 1 147 4.5 1425942435
## 3 1 858 5.0 1425941523
## 4 1 1221 5.0 1425941546
## 5 1 1246 5.0 1425941556
## 6 1 1968 4.0 1425942148
# Histogram of the rating column of the loaded subset.
hist(data$rating)

# Min, quartiles, median, mean and max of the rating column.
summary(data$rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.500 3.000 3.500 3.527 4.000 5.000
# Histogram of the movieId column of the loaded subset.
hist(data$movieId)

# Attach psych before calling describe(). As the startup message recorded
# below shows, attaching it masks `%+%` and `alpha` from ggplot2.
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Per-column descriptive statistics (n, mean, sd, median, skew, ...) for
# every column of the subset.
describe(data)
## vars n mean sd median trimmed
## userId 1 1e+05 5.255800e+02 285.01 528.0 5.306800e+02
## movieId 2 1e+05 1.523713e+04 30440.56 2571.0 7.091160e+03
## rating 3 1e+05 3.530000e+00 1.02 3.5 3.600000e+00
## timestamp 4 1e+05 1.163922e+09 202488432.95 1146225970.0 1.162446e+09
## mad min max range skew kurtosis
## userId 351.38 1.0 1014 1013.0 -0.11 -1.14
## movieId 2928.13 1.0 176271 176270.0 2.46 5.53
## rating 0.74 0.5 5 4.5 -0.60 0.13
## timestamp 249979173.37 827947236.0 1501822366 673875130.0 0.18 -1.16
## se
## userId 0.90
## movieId 96.26
## rating 0.00
## timestamp 640324.65
# ---- Walk through the entire file chunk by chunk and average the ratings ----
index <- 0 # number of chunks processed so far
chunkSize <- 100 # rows requested per read
# Keep one connection open for the whole pass so that each read.table() call
# resumes where the previous one stopped.
con <- file(description = Movies_Ratings, open = "r")

# The first read consumes the header line and supplies the column names that
# are reused for every later (header-less) chunk.
dataChunk <- read.table(con, nrows = chunkSize, header = TRUE, sep = ",")
actualColumnNames <- names(dataChunk)

counter <- 0 # total number of rows seen
rating <- 0 # running sum of the `rating` column (one of the 4 columns)

# Accumulate the sum and the row count one chunk at a time, until a chunk
# comes back with fewer than `chunkSize` rows (i.e. the input is exhausted).
repeat {
  index <- index + 1
  print(paste("Processing rows:", index * chunkSize))
  rating <- rating + sum(dataChunk$rating)
  counter <- counter + nrow(dataChunk)

  if (nrow(dataChunk) != chunkSize) {
    print("Processed all files!")
    break
  }

  # header = FALSE: the header line was already consumed by the first read.
  # With header = TRUE the first record of every later chunk would be
  # swallowed as a header and silently dropped from the mean.
  dataChunk <- read.table(
    con,
    nrows = chunkSize,
    skip = 0,
    header = FALSE,
    fill = TRUE,
    sep = ",",
    col.names = actualColumnNames
  )
}
close(con)

# Overall mean rating = sum of all ratings / number of rows read.
print(paste0("Ratings mean:", rating / counter, "$"))
## NOTE: the output previously recorded here ("Processing rows: 100" and
## "Ratings mean:3.665$") came from a run that stopped after the first
## 100-row chunk. Re-run the script to record the mean over the whole file.