# Used different file inputs to practice on (xlsx, csv, xml)
library(rio)
library(stringr)
library(tidyr)
library(dplyr)
library(XML)
# ---- Candy survey: xlsx -> csv -> data frame ----
# Earlier practice run on a different workbook, kept for reference:
# convert("datasets/edu-25plus.xls","datasets/edu-25plus.csv")

# Re-export the Excel workbook as CSV, then load the CSV with empty cells
# read as NA.
convert("datasets/candy.xlsx", "datasets/candy.csv")
## New names:
## * `` -> ...114
importCandyData <- read.csv(
  file = "datasets/candy.csv",
  na.strings = ""
)

# Work only with imported columns 3 through 109.
tmpCandyData <- importCandyData[, 3:109]

# Clean the headers: drop the first four characters, turn every "." into a
# space, and trim whatever whitespace is left at the start.
candyColNames <- names(tmpCandyData)
candyColNames <- candyColNames %>%
  substr(5, nchar(candyColNames)) %>%
  str_replace_all("\\.", " ") %>%
  str_trim(side = "left")
names(tmpCandyData) <- candyColNames

# Sample-size subset of the candy bar data (columns picked by position).
tmpCandyData2 <- tmpCandyData[, c(1:2, 14:16, 18, 22, 26)]

# Keep complete rows only.
tmpCandyData3 <- tmpCandyData2 %>% drop_na()

# Coerce AGE to integer; values that are not whole numbers become NA.
tmpCandyData4 <- transform(
  tmpCandyData3,
  AGE = as.integer(AGE)
)

# Convert every column to character. data.frame() rebuilds the frame, so
# column names pass through its default name checking again.
tmpCandyData5 <- data.frame(
  lapply(tmpCandyData4, as.character),
  stringsAsFactors = FALSE
)
# ---- Airbnb CSV ----
# Load the file (first row is the header, empty fields become NA), then keep
# only the rows without any missing value.
importAirbnb <- read.csv(
  file = "datasets/AB_NYC_2019.csv",
  header = TRUE,
  na.strings = ""
)
tmpAirbnb <- importAirbnb %>% drop_na()
# Note: Only using Section Reference to parse data
# ---- Ed-Fi student enrollment: XML -> data frame ----
# Flatten the XML document into a data frame.
importEdFi <- xmlToDataFrame("datasets/StudentEnrollment.xml")

# Keep columns 1 and 6 (the code below reads StudentReference and
# SectionReference from them) and drop rows with a missing value in either.
tmpEdFi <- importEdFi[, c(1, 6)]
tmpEdFi <- drop_na(tmpEdFi)

# Every field below is pulled out of the SectionReference text with
# str_extract(), which returns exactly one value per row (NA when nothing
# matches), so all derived vectors stay aligned with tmpEdFi.

# SectionNumber: first run of nine digits, as a number.
# str_extract() is used instead of str_extract_all(): the latter returns a
# list, and as.numeric() fails on it as soon as any row has zero or several
# matches.
sectionNum <- as.numeric(
  str_extract(tmpEdFi$SectionReference, "[[:digit:]]{9}")
)

# SubjectType (detailed form): 2-5 letters, an optional hyphen, then either
# one digit or two letters. Not used in studentEnrollment below; kept because
# later code may rely on it.
tmpsubjType <- str_extract(
  tmpEdFi$SectionReference,
  "[[:alpha:]]{2,5}\\-?([[:digit:]]{1}|[[:alpha:]]{2})"
)

# SubjectType: first run of two or more letters.
subjType <- str_extract(tmpEdFi$SectionReference, "[[:alpha:]]{2,}")

# SchoolYear: four digits, a hyphen, four digits.
schoolYear <- str_extract(
  tmpEdFi$SectionReference,
  "([[:digit:]]{4}\\-[[:digit:]]{4})"
)

# Semester: 4-6 letters, one or more blanks, then eight letters.
tmpSemester <- str_extract(
  tmpEdFi$SectionReference,
  "[[:alpha:]]{4,6}[[:blank:]]+[[:alpha:]]{8}"
)

# Assemble the result; column names are derived from the argument
# expressions, exactly as before.
studentEnrollment <- data.frame(tmpEdFi$StudentReference,sectionNum,subjType,schoolYear,tmpSemester)