POMEGRANATE 2017 DATA

  1. Let us save the folder as:
# Absolute path of the folder holding the monthly Pomegranate CSV files.
# NOTE(review): machine-specific Windows path; adjust before running elsewhere.
folder <- "C:/Users/MAHESH/Desktop/First_Project/DATA/Pomegranate"
  1. Now, let us store the names of all the .csv files in the Pomegranate folder as ‘pomo’
# `pattern` is a regular expression, not a shell glob, so match a literal
# ".csv" suffix instead of the unanchored "*.csv".
pomo <- list.files(path = folder, pattern = "\\.csv$")
pomo
##  [1] "201701_Pomo.csv" "201702_Pomo.csv" "201703_Pomo.csv"
##  [4] "201704_Pomo.csv" "201705_Pomo.csv" "201706_Pomo.csv"
##  [7] "201707_Pomo.csv" "201708_Pomo.csv" "201709_Pomo.csv"
## [10] "201710_Pomo.csv" "201711_Pomo.csv" "201712_Pomo.csv"
  1. Create a folder named Result.
# Create the output folder only when it is missing, so that re-running the
# script does not raise an "already exists" warning.
if (!dir.exists("Result")) {
  dir.create("Result")
}
  1. Check if the folder has been created.
# List the contents of the working directory (dir() is an alias of this call).
list.files()
## [1] "DATA"                "First_Project.Rproj" "My_Markdown.pdf"    
## [4] "My_Markdown.Rmd"     "Result"              "Sorting_Script.R"   
## [7] "Workspace"
  1. Now read the first file in the Pomegranate folder and assign it to ‘a’
# Read the first file listed in `pomo` from `folder`, then print it.
a <- read.csv(file.path(folder, pomo[1]))
a
By the end of this session we will have cleaned this data into a table with one row per day of the month and five numeric columns.
  1. The 2nd row contains the month and the year; to extract them we use the ‘stringr’ package.
library(stringr)
# Cell [2, 1] is split on " : "; the second piece is split again on " - "
# to give the month name and the year.
dm <- str_split_fixed(a[2, 1], pattern = " : ", n = 2)
dm1 <- str_split_fixed(dm[1, 2], pattern = " - ", n = 2)
month <- dm1[1, 1]
month
## [1] "January"
year <- as.numeric(dm1[1, 2])
year
## [1] 2017
  1. Now we extract the crop and its category and assign them to ‘crop’ and ‘cato’
# Cell [4, 1] carries the crop and cell [3, 1] the category; in both cases
# the value is the text after " : ".
cr1 <- str_split_fixed(a[4, 1], pattern = " : ", n = 2)
cr2 <- str_split_fixed(a[3, 1], pattern = " : ", n = 2)
crop <- cr1[1, 2]
crop
## [1] "POMEGRANATE"
cato <- cr2[1, 2]
cato
## [1] "FRUITS"
  1. As we can observe from the above data, the numerical values occur in rows 10 to 14, from the 5th column (i.e., X.3) to the last column. Let us extract this part of the data and assign it to ‘l’. Transposing ‘l’ with t() converts it from a data frame into a matrix, which we call ‘m’
# Rows 10-14, from the 5th column to the last one, hold the values.
l <- a[10:14, seq(5, ncol(a))]
# t() turns the data frame into a matrix with one row per original column.
m <- t(l)
head(m)
##     10     11     12     13      14   
## X.3 ""     ""     ""     ""      ""   
## X.4 "5000" "6500" "6000" "9500"  "274"
## X.5 "5000" "6500" "6000" "9500"  "139"
## X.6 "4500" "6500" "5500" "9500"  "149"
## X.7 "5500" "6500" "6000" "9500"  "179"
## X.8 "4500" "6500" "5500" "10000" "179"
  1. The numbers are in the ‘character’ form. Converting it into numeric form.
# Convert the character matrix to a plain numeric vector; the matrix
# dimensions are dropped and empty strings become NA.
n <- as.double(m)
n
##   [1]    NA  5000  5000  4500  5500  4500  4500    NA  5500  4000  4500
##  [12]  4000  5500    NA    NA  4000  5500  5500  5500  5500  5500    NA
##  [23]  5500  5500  5000    NA  5500  5000    NA  5000  5500    NA  6500
##  [34]  6500  6500  6500  6500  6500    NA  6500  7000  6500  6000  7500
##  [45]    NA    NA  5000  6500  6500  6500  6500  6500    NA  6500  6500
##  [56]  7000    NA  6500  6500    NA  7000  7500    NA  6000  6000  5500
##  [67]  6000  5500  5500    NA  6000  6000  5500  5000  6500    NA    NA
##  [78]  4500  6000  6000  6000  6000  6000    NA  6000  6000  6000    NA
##  [89]  6000  5400    NA  6000  6500    NA  9500  9500  9500  9500 10000
## [100]  9500    NA  9500 10000  9500  9000 10000    NA    NA  9000  9500
## [111]  9500 10000 10000  9500    NA  9500  9500 10000    NA  9500 10000
## [122]    NA 10000 10000    NA   274   139   149   179   179   130    NA
## [133]   183   107   126    82    67    NA    NA   131    93   150   109
## [144]   157   147    NA   186   218   117    NA   120    58    NA   160
## [155]   115
  1. As soon as we convert the data to numeric, the matrix structure is lost, but that is no problem: we can rebuild the matrix using the matrix() function and call it ‘o’
# Rebuild a 5-column matrix with as many rows as `m` (filled column-wise).
o <- matrix(n, nrow = nrow(m), ncol = 5)
head(o, 4)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]   NA   NA   NA   NA   NA
## [2,] 5000 6500 6000 9500  274
## [3,] 5000 6500 6000 9500  139
## [4,] 4500 6500 5500 9500  149
  1. Converting o into data frame,
# Wrap the numeric matrix in a data frame and print it.
p <- data.frame(o)
p
  1. Assigning column names by using the column 4 from a,
# Column labels come from rows 10-14 of the 4th column of `a`.
colnames(p) <- a[10:14, 4]
p
  1. Assigning row values in the form ‘date-month-year’
# Label each row "<day>-<month>-<year>", numbering days from 1.
rownames(p) <- paste(1:nrow(m), month, year, sep = "-")
p
  1. I have used an if / else if statement to keep only as many rows as there are days in that particular month and year (taking leap years into account)
# Keep only as many rows of the cleaned table `p` as the month has calendar
# days and store the result in `a1` (the object written to disk below).
month_index <- match(month, month.name)
if (is.na(month_index) || is.na(year)) {
  stop("Unrecognised month/year: ", month, " ", year, call. = FALSE)
}
# General leap-year rule instead of a hard-coded list of years; it is applied
# to February only, so other months of a leap year are not truncated.
is_leap_year <- (year %% 4 == 0 && year %% 100 != 0) || year %% 400 == 0
days_per_month <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
n_days <- days_per_month[month_index]
if (month_index == 2 && is_leap_year) {
  n_days <- 29
}
# head() never pads: if `p` has fewer rows than n_days, all rows are kept.
a1 <- head(p, n_days)
  1. Let's create the file name and save the data as a .csv file
# Output path: Result/<year>_<month>_<category>_<crop>.csv
myfile <- file.path(
  "Result",
  paste0(paste(year, month, cato, crop, sep = "_"), ".csv")
)
write.csv(a1, file = myfile)
  1. I have made a for loop which cleans all the files in the folder and saves the result in the Result folder. The code given below avoids intermediate variables such as l, m, n, o and p, so they do not pile up in RAM.
# Batch version: clean every monthly CSV in `folder`, stack the daily rows of
# all months into one data frame and write it to the Result folder.
folder <- "C:/Users/MAHESH/Desktop/First_Project/DATA/Pomegranate"
# `pattern` is a regular expression, not a glob: match names ending in ".csv".
pomo <- list.files(path = folder, pattern = "\\.csv$")
if (length(pomo) == 0) {
  stop("No .csv files found in ", folder, call. = FALSE)
}

# Create the output folder only when it is missing (no warning on re-runs).
if (!dir.exists("Result")) {
  dir.create("Result")
}
dir()
## [1] "DATA"                "First_Project.Rproj" "My_Markdown.pdf"    
## [4] "My_Markdown.Rmd"     "Result"              "Sorting_Script.R"   
## [7] "Workspace"
fil <- dir()

library(stringr)

# Number of calendar days in a month, given its English name and the year.
# Stops with an error when the month name or the year cannot be interpreted.
days_in_month <- function(month, year) {
  month_index <- match(month, month.name)
  if (is.na(month_index) || is.na(year)) {
    stop("Unrecognised month/year: ", month, " ", year, call. = FALSE)
  }
  # The leap-year test only ever affects February.
  is_leap_year <- (year %% 4 == 0 && year %% 100 != 0) || year %% 400 == 0
  if (month_index == 2 && is_leap_year) {
    return(29)
  }
  c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)[month_index]
}

# One cleaned data frame per input file; they are combined once after the
# loop instead of growing a data frame with rbind() on every iteration.
monthly <- vector("list", length(pomo))

for (i in seq_along(pomo)) {
  a <- read.csv(paste(folder, pomo[i], sep = "/"))

  # Header cells: [2, 1] holds month and year, [4, 1] the crop, [3, 1] the
  # category; the part of [10, 2] before " / " is used as the city.
  dm <- str_split_fixed(a[2, 1], " : ", 2)
  dm1 <- str_split_fixed(dm[2], " - ", 2)
  cr1 <- str_split_fixed(a[4, 1], " : ", 2)
  cr2 <- str_split_fixed(a[3, 1], " : ", 2)
  lo <- str_split_fixed(a[10, 2], " / ", 2)

  month <- dm1[1]
  year <- as.numeric(dm1[2])
  crop <- cr1[2]
  cato <- cr2[2]
  city <- lo[1]

  # Rows 10-14 from column 5 onwards hold the values; transpose once so that
  # each original column becomes a row, then convert to numeric.
  daily <- t(a[10:14, 5:ncol(a)])
  a1 <- data.frame(matrix(as.numeric(daily), nrow(daily), 5))
  row.names(a1) <- paste0(seq_len(nrow(daily)), "-", month, "-", year)

  # Keep only the rows that correspond to real calendar days of the month.
  monthly[[i]] <- head(a1, days_in_month(month, year))
}

a2 <- do.call(rbind, monthly)

# Column labels are taken from the last file that was read.
names(a2) <- a[10:14, 4]

# The file name uses the year, city, category and crop of the last file read.
myfile <- paste0("Result", "/", year, "_", city, "_", cato, "_", crop, ".csv")
write.csv(a2, myfile)