# loading-large-data-sets-and-breaking-them-into-chunks.R

# Location of the ratings file. Defined once, up front, so that the preview
# below and every later read use exactly the same path.
Movies_Ratings <- "C:\\Users\\xholi\\OneDrive\\Desktop\\ratings.csv"

# Peek at the raw text (header line plus the first six records) before parsing
# anything, to confirm the separator and the column layout.
readLines(Movies_Ratings, n = 7)

## [1] "userId,movieId,rating,timestamp" "1,110,1.0,1425941529"           
## [3] "1,147,4.5,1425942435"            "1,858,5.0,1425941523"           
## [5] "1,1221,5.0,1425941546"           "1,1246,5.0,1425941556"          
## [7] "1,1968,4.0,1425942148"

pacman::p_load(pacman,dplyr,GGally,ggplot2,ggthemes,
               ggvis,rio,rmarkdown,shiny,stringr,tidyr,plotly,
               lubridate)


###VIEWING THE DATA IN CHUNKS###

# Number of data rows to load for the first look at the file.
chunkSize <- 100000

# Read only the first `chunkSize` records (the header line is consumed
# separately) through an explicitly opened connection, then release it.
con <- file(Movies_Ratings, open = "r")
data <- read.table(
  file = con,
  header = TRUE,
  sep = ",",
  nrows = chunkSize,
  fill = TRUE
)
close(con)

# Show the first few parsed rows.
head(data)

##   userId movieId rating  timestamp
## 1      1     110    1.0 1425941529
## 2      1     147    4.5 1425942435
## 3      1     858    5.0 1425941523
## 4      1    1221    5.0 1425941546
## 5      1    1246    5.0 1425941556
## 6      1    1968    4.0 1425942148

# Histogram of the `rating` column of the chunk loaded above.
hist(data$rating)

# Minimum, quartiles, mean and maximum of the same column.
summary(data$rating)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.500   3.000   3.500   3.527   4.000   5.000

# Histogram of the `movieId` column (shows how the ids in this chunk are spread).
hist(data$movieId)

# Attach psych for describe(). As the recorded messages below show, this masks
# `%+%` and `alpha` from ggplot2.
library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

# Per-column descriptive statistics for the chunk; the recorded output follows.
describe(data)

##           vars     n         mean           sd       median      trimmed
## userId       1 1e+05 5.255800e+02       285.01        528.0 5.306800e+02
## movieId      2 1e+05 1.523713e+04     30440.56       2571.0 7.091160e+03
## rating       3 1e+05 3.530000e+00         1.02          3.5 3.600000e+00
## timestamp    4 1e+05 1.163922e+09 202488432.95 1146225970.0 1.162446e+09
##                    mad         min        max       range  skew kurtosis
## userId          351.38         1.0       1014      1013.0 -0.11    -1.14
## movieId        2928.13         1.0     176271    176270.0  2.46     5.53
## rating            0.74         0.5          5         4.5 -0.60     0.13
## timestamp 249979173.37 827947236.0 1501822366 673875130.0  0.18    -1.16
##                  se
## userId         0.90
## movieId       96.26
## rating         0.00
## timestamp 640324.65

###creating an index and looking through the entire data chunk by chunk####

# Stream the whole ratings file in fixed-size chunks and compute the overall
# mean rating from a running sum and a running row count, so that only one
# chunk is held in memory at a time.

index <- 0       # number of chunks processed so far
chunkSize <- 100 # rows requested per read
counter <- 0     # total number of rows accumulated so far
rating <- 0      # running sum of the `rating` column

con <- file(description = Movies_Ratings, open = "r")

# `finally` guarantees the connection is closed even if a read fails part-way
# through the file.
tryCatch(
  {
    # Only the very first read sees the header line, so it is the only one
    # that uses header = TRUE.
    dataChunk <- read.table(con, nrows = chunkSize, header = TRUE, sep = ",")

    # Remember the real column names: later chunks have no header line and
    # would otherwise be labelled V1, V2, ...
    actualColumnNames <- names(dataChunk)

    # Keep reading until a chunk comes back shorter than requested.
    repeat {
      index <- index + 1
      print(paste("Processing rows:", index * chunkSize))

      rating <- rating + sum(dataChunk$rating)
      counter <- counter + nrow(dataChunk)

      # A short (or empty) chunk means the end of the file has been reached.
      if (nrow(dataChunk) != chunkSize) {
        print("Processed all files!")
        break
      }

      # header = FALSE: the connection is already positioned past the header,
      # so treating the next line as a header would silently drop one data row
      # from every chunk. Because col.names is supplied, read.table() yields a
      # zero-row data frame (rather than an error) when no input is left, which
      # ends the loop on the next pass.
      dataChunk <- read.table(
        con,
        nrows = chunkSize,
        header = FALSE,
        fill = TRUE,
        sep = ",",
        col.names = actualColumnNames
      )
    }
  },
  finally = close(con)
)

print(paste0("Ratings mean:", rating / counter, "$"))

## NOTE: the output previously recorded here ("Processing rows: 100" and
## "Ratings mean:3.665$") came from a run that stopped after the first chunk
## of 100 rows. Re-run the script to regenerate the output for the full file.

# loading-large-data-sets-and-breaking-them-into-chunks.R
#
# xholi
#
# 2020-12-07