0103 : Components of a tidy data
raw data
tidy data (this is the target)
code book
instruction list
0104 : downloading files
setwd("C:/Users/krith/OneDrive/Documents/coursera/getting_cleaning_data/week1")
if(!file.exists("data")){
dir.create("data")
}
getting data from internet
download.file(url,destfile,method)
url -> “https://data.baltimorecity.gov/Transportation/Baltimore-Fixed-Speed-Cameras/dz54-2aru”
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile = "./data/cameras.csv",method = "curl")
list.files("./data/")
## [1] "cameras.csv" "cameras.xlsx" "simple.xml"
dateDownloaded <- date()
dateDownloaded
## [1] "Tue Apr 04 04:55:42 2017"
0105 : reading local flat files
csv download
read Baltimore camera data downloaded
cameraData <- read.table("./data/cameras.csv",sep=",",header = TRUE)
head(cameraData)
## address direction street crossStreet
## 1 S CATON AVE & BENSON AVE N/B Caton Ave Benson Ave
## 2 S CATON AVE & BENSON AVE S/B Caton Ave Benson Ave
## 3 WILKENS AVE & PINE HEIGHTS AVE E/B Wilkens Ave Pine Heights
## 4 THE ALAMEDA & E 33RD ST S/B The Alameda 33rd St
## 5 E 33RD ST & THE ALAMEDA E/B E 33rd The Alameda
## 6 ERDMAN AVE & N MACON ST E/B Erdman Macon St
## intersection Location.1
## 1 Caton Ave & Benson Ave (39.2693779962, -76.6688185297)
## 2 Caton Ave & Benson Ave (39.2693157898, -76.6689698176)
## 3 Wilkens Ave & Pine Heights (39.2720252302, -76.676960806)
## 4 The Alameda & 33rd St (39.3285013141, -76.5953545714)
## 5 E 33rd & The Alameda (39.3283410623, -76.5953594625)
## 6 Erdman & Macon St (39.3068045671, -76.5593167803)
important parameters you can add
0106 : read excel files
if(!file.exists("data")){
dir.create("data")
}
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD&bom=true"
download.file(fileUrl,destfile = "./data/cameras.xlsx",method = "curl")
dateDownloaded<-date()
dateDownloaded
## [1] "Tue Apr 04 04:55:42 2017"
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 3.3.3
# cameraData <- read.xlsx("./data/cameras.xlsx",sheetIndex=1,header=TRUE)
sheet index is to mention which sheet the data is from
# head(cameraData)
# this xlsx file is corrupted
It is advised to store data in a DATABASE/CSV/.tab/.txt as it is easy to share and access by a lot of people
0107 : read XML file
library(XML)
## Warning: package 'XML' was built under R version 3.3.3
fileUrl <- "http://www.xmlfiles.com/examples/simple.xml"
doc <- xmlTreeParse(fileUrl,useInternal=TRUE)
rootnode <- xmlRoot(doc) # wrapper for the entire document
xmlName(rootnode) # name of root node
## [1] "breakfast-menu"
names(rootnode)
## food food food food food
## "food" "food" "food" "food" "food"
rootnode[[1]]
## <food>
## <name>Belgian Waffles</name>
## <price>$5.95</price>
## <description>two of our famous Belgian Waffles with plenty of real maple syrup</description>
## <calories>650</calories>
## </food>
# extract from scratch
rootnode[[1]][[1]]
## <name>Belgian Waffles</name>
# programmatically extract
xmlApply(rootnode,xmlValue)
## $food
## [1] "Belgian Waffles$5.95two of our famous Belgian Waffles with plenty of real maple syrup650"
##
## $food
## [1] "Strawberry Belgian Waffles$7.95light Belgian waffles covered with strawberrys and whipped cream900"
##
## $food
## [1] "Berry-Berry Belgian Waffles$8.95light Belgian waffles covered with an assortment of fresh berries and whipped cream900"
##
## $food
## [1] "French Toast$4.50thick slices made from our homemade sourdough bread600"
##
## $food
## [1] "Homestyle Breakfast$6.95two eggs, bacon or sausage, toast, and our ever-popular hash browns950"
calories <- xpathSApply(rootnode,"//calories",xmlValue)
calories
## [1] "650" "900" "900" "600" "950"
length(calories[calories==650])
## [1] 1
// represents any node
another example
http://www.espn.com/nfl/team/_/name/bal/baltimore-ravens
fileUrl <-"http://www.espn.com/nfl/team/_/name/bal/baltimore-ravens"
doc <- htmlTreeParse(fileUrl,useInternal=TRUE) # using internal = T as we want all nodes inside that
scores <- xpathSApply(doc,"//li[@class='score']",xmlValue)
scores
## list()
teams = xpathSApply(doc,"//li[@class='team-name']",xmlValue)
teams
## [1] "Baltimore RavensRavens" "Baltimore RavensRavens"
rankings = xpathSApply(doc,"//li[@class='ranking']",xmlValue)
rankings
## [1] "2nd in AFC North" "2nd in AFC North"
0108 : Reading JSON
# install.packages("jsonlite")
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.3.3
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
length(names(jsonData))
## [1] 69
names(jsonData) # names of objects in json
## [1] "id" "name" "full_name"
## [4] "owner" "private" "html_url"
## [7] "description" "fork" "url"
## [10] "forks_url" "keys_url" "collaborators_url"
## [13] "teams_url" "hooks_url" "issue_events_url"
## [16] "events_url" "assignees_url" "branches_url"
## [19] "tags_url" "blobs_url" "git_tags_url"
## [22] "git_refs_url" "trees_url" "statuses_url"
## [25] "languages_url" "stargazers_url" "contributors_url"
## [28] "subscribers_url" "subscription_url" "commits_url"
## [31] "git_commits_url" "comments_url" "issue_comment_url"
## [34] "contents_url" "compare_url" "merges_url"
## [37] "archive_url" "downloads_url" "issues_url"
## [40] "pulls_url" "milestones_url" "notifications_url"
## [43] "labels_url" "releases_url" "deployments_url"
## [46] "created_at" "updated_at" "pushed_at"
## [49] "git_url" "ssh_url" "clone_url"
## [52] "svn_url" "homepage" "size"
## [55] "stargazers_count" "watchers_count" "language"
## [58] "has_issues" "has_projects" "has_downloads"
## [61] "has_wiki" "has_pages" "forks_count"
## [64] "mirror_url" "open_issues_count" "forks"
## [67] "open_issues" "watchers" "default_branch"
names(jsonData$owner) # name sof nested objects in json
## [1] "login" "id" "avatar_url"
## [4] "gravatar_id" "url" "html_url"
## [7] "followers_url" "following_url" "gists_url"
## [10] "starred_url" "subscriptions_url" "organizations_url"
## [13] "repos_url" "events_url" "received_events_url"
## [16] "type" "site_admin"
jsonData$owner$login
## [1] "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek"
## [8] "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek"
## [15] "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek"
## [22] "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek" "jtleek"
## [29] "jtleek" "jtleek"
writing data frames to JSON
myjson <-toJSON(iris,pretty=T) # if we require json formatted data, pretty makes it indented
cat(myjson) # since its not text variable, we use cat to print the json variable
## [
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.5,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.9,
## "Sepal.Width": 3,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.7,
## "Sepal.Width": 3.2,
## "Petal.Length": 1.3,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.6,
## "Sepal.Width": 3.1,
## "Petal.Length": 1.5,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3.6,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.4,
## "Sepal.Width": 3.9,
## "Petal.Length": 1.7,
## "Petal.Width": 0.4,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.6,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.4,
## "Petal.Width": 0.3,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.5,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.4,
## "Sepal.Width": 2.9,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.9,
## "Sepal.Width": 3.1,
## "Petal.Length": 1.5,
## "Petal.Width": 0.1,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.4,
## "Sepal.Width": 3.7,
## "Petal.Length": 1.5,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.8,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.6,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.8,
## "Sepal.Width": 3,
## "Petal.Length": 1.4,
## "Petal.Width": 0.1,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.3,
## "Sepal.Width": 3,
## "Petal.Length": 1.1,
## "Petal.Width": 0.1,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.8,
## "Sepal.Width": 4,
## "Petal.Length": 1.2,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 4.4,
## "Petal.Length": 1.5,
## "Petal.Width": 0.4,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.4,
## "Sepal.Width": 3.9,
## "Petal.Length": 1.3,
## "Petal.Width": 0.4,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.5,
## "Petal.Length": 1.4,
## "Petal.Width": 0.3,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 3.8,
## "Petal.Length": 1.7,
## "Petal.Width": 0.3,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.8,
## "Petal.Length": 1.5,
## "Petal.Width": 0.3,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.4,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.7,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.7,
## "Petal.Length": 1.5,
## "Petal.Width": 0.4,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.6,
## "Sepal.Width": 3.6,
## "Petal.Length": 1,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.3,
## "Petal.Length": 1.7,
## "Petal.Width": 0.5,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.8,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.9,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3,
## "Petal.Length": 1.6,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.6,
## "Petal.Width": 0.4,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.2,
## "Sepal.Width": 3.5,
## "Petal.Length": 1.5,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.2,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.7,
## "Sepal.Width": 3.2,
## "Petal.Length": 1.6,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.8,
## "Sepal.Width": 3.1,
## "Petal.Length": 1.6,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.4,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.5,
## "Petal.Width": 0.4,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.2,
## "Sepal.Width": 4.1,
## "Petal.Length": 1.5,
## "Petal.Width": 0.1,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.5,
## "Sepal.Width": 4.2,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.9,
## "Sepal.Width": 3.1,
## "Petal.Length": 1.5,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3.2,
## "Petal.Length": 1.2,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.5,
## "Sepal.Width": 3.5,
## "Petal.Length": 1.3,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.9,
## "Sepal.Width": 3.6,
## "Petal.Length": 1.4,
## "Petal.Width": 0.1,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.4,
## "Sepal.Width": 3,
## "Petal.Length": 1.3,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.4,
## "Petal.Length": 1.5,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3.5,
## "Petal.Length": 1.3,
## "Petal.Width": 0.3,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.5,
## "Sepal.Width": 2.3,
## "Petal.Length": 1.3,
## "Petal.Width": 0.3,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.4,
## "Sepal.Width": 3.2,
## "Petal.Length": 1.3,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3.5,
## "Petal.Length": 1.6,
## "Petal.Width": 0.6,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.8,
## "Petal.Length": 1.9,
## "Petal.Width": 0.4,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.8,
## "Sepal.Width": 3,
## "Petal.Length": 1.4,
## "Petal.Width": 0.3,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 3.8,
## "Petal.Length": 1.6,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 4.6,
## "Sepal.Width": 3.2,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5.3,
## "Sepal.Width": 3.7,
## "Petal.Length": 1.5,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 3.3,
## "Petal.Length": 1.4,
## "Petal.Width": 0.2,
## "Species": "setosa"
## },
## {
## "Sepal.Length": 7,
## "Sepal.Width": 3.2,
## "Petal.Length": 4.7,
## "Petal.Width": 1.4,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.4,
## "Sepal.Width": 3.2,
## "Petal.Length": 4.5,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.9,
## "Sepal.Width": 3.1,
## "Petal.Length": 4.9,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.5,
## "Sepal.Width": 2.3,
## "Petal.Length": 4,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.5,
## "Sepal.Width": 2.8,
## "Petal.Length": 4.6,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 2.8,
## "Petal.Length": 4.5,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 3.3,
## "Petal.Length": 4.7,
## "Petal.Width": 1.6,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 4.9,
## "Sepal.Width": 2.4,
## "Petal.Length": 3.3,
## "Petal.Width": 1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.6,
## "Sepal.Width": 2.9,
## "Petal.Length": 4.6,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.2,
## "Sepal.Width": 2.7,
## "Petal.Length": 3.9,
## "Petal.Width": 1.4,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 2,
## "Petal.Length": 3.5,
## "Petal.Width": 1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.9,
## "Sepal.Width": 3,
## "Petal.Length": 4.2,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6,
## "Sepal.Width": 2.2,
## "Petal.Length": 4,
## "Petal.Width": 1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.1,
## "Sepal.Width": 2.9,
## "Petal.Length": 4.7,
## "Petal.Width": 1.4,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.6,
## "Sepal.Width": 2.9,
## "Petal.Length": 3.6,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 3.1,
## "Petal.Length": 4.4,
## "Petal.Width": 1.4,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.6,
## "Sepal.Width": 3,
## "Petal.Length": 4.5,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.8,
## "Sepal.Width": 2.7,
## "Petal.Length": 4.1,
## "Petal.Width": 1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.2,
## "Sepal.Width": 2.2,
## "Petal.Length": 4.5,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.6,
## "Sepal.Width": 2.5,
## "Petal.Length": 3.9,
## "Petal.Width": 1.1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.9,
## "Sepal.Width": 3.2,
## "Petal.Length": 4.8,
## "Petal.Width": 1.8,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.1,
## "Sepal.Width": 2.8,
## "Petal.Length": 4,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 2.5,
## "Petal.Length": 4.9,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.1,
## "Sepal.Width": 2.8,
## "Petal.Length": 4.7,
## "Petal.Width": 1.2,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.4,
## "Sepal.Width": 2.9,
## "Petal.Length": 4.3,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.6,
## "Sepal.Width": 3,
## "Petal.Length": 4.4,
## "Petal.Width": 1.4,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.8,
## "Sepal.Width": 2.8,
## "Petal.Length": 4.8,
## "Petal.Width": 1.4,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 3,
## "Petal.Length": 5,
## "Petal.Width": 1.7,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6,
## "Sepal.Width": 2.9,
## "Petal.Length": 4.5,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 2.6,
## "Petal.Length": 3.5,
## "Petal.Width": 1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.5,
## "Sepal.Width": 2.4,
## "Petal.Length": 3.8,
## "Petal.Width": 1.1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.5,
## "Sepal.Width": 2.4,
## "Petal.Length": 3.7,
## "Petal.Width": 1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.8,
## "Sepal.Width": 2.7,
## "Petal.Length": 3.9,
## "Petal.Width": 1.2,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6,
## "Sepal.Width": 2.7,
## "Petal.Length": 5.1,
## "Petal.Width": 1.6,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.4,
## "Sepal.Width": 3,
## "Petal.Length": 4.5,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6,
## "Sepal.Width": 3.4,
## "Petal.Length": 4.5,
## "Petal.Width": 1.6,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 3.1,
## "Petal.Length": 4.7,
## "Petal.Width": 1.5,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 2.3,
## "Petal.Length": 4.4,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.6,
## "Sepal.Width": 3,
## "Petal.Length": 4.1,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.5,
## "Sepal.Width": 2.5,
## "Petal.Length": 4,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.5,
## "Sepal.Width": 2.6,
## "Petal.Length": 4.4,
## "Petal.Width": 1.2,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.1,
## "Sepal.Width": 3,
## "Petal.Length": 4.6,
## "Petal.Width": 1.4,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.8,
## "Sepal.Width": 2.6,
## "Petal.Length": 4,
## "Petal.Width": 1.2,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5,
## "Sepal.Width": 2.3,
## "Petal.Length": 3.3,
## "Petal.Width": 1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.6,
## "Sepal.Width": 2.7,
## "Petal.Length": 4.2,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 3,
## "Petal.Length": 4.2,
## "Petal.Width": 1.2,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 2.9,
## "Petal.Length": 4.2,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.2,
## "Sepal.Width": 2.9,
## "Petal.Length": 4.3,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.1,
## "Sepal.Width": 2.5,
## "Petal.Length": 3,
## "Petal.Width": 1.1,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 2.8,
## "Petal.Length": 4.1,
## "Petal.Width": 1.3,
## "Species": "versicolor"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 3.3,
## "Petal.Length": 6,
## "Petal.Width": 2.5,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 5.8,
## "Sepal.Width": 2.7,
## "Petal.Length": 5.1,
## "Petal.Width": 1.9,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.1,
## "Sepal.Width": 3,
## "Petal.Length": 5.9,
## "Petal.Width": 2.1,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 2.9,
## "Petal.Length": 5.6,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.5,
## "Sepal.Width": 3,
## "Petal.Length": 5.8,
## "Petal.Width": 2.2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.6,
## "Sepal.Width": 3,
## "Petal.Length": 6.6,
## "Petal.Width": 2.1,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 4.9,
## "Sepal.Width": 2.5,
## "Petal.Length": 4.5,
## "Petal.Width": 1.7,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.3,
## "Sepal.Width": 2.9,
## "Petal.Length": 6.3,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 2.5,
## "Petal.Length": 5.8,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.2,
## "Sepal.Width": 3.6,
## "Petal.Length": 6.1,
## "Petal.Width": 2.5,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.5,
## "Sepal.Width": 3.2,
## "Petal.Length": 5.1,
## "Petal.Width": 2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.4,
## "Sepal.Width": 2.7,
## "Petal.Length": 5.3,
## "Petal.Width": 1.9,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.8,
## "Sepal.Width": 3,
## "Petal.Length": 5.5,
## "Petal.Width": 2.1,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 5.7,
## "Sepal.Width": 2.5,
## "Petal.Length": 5,
## "Petal.Width": 2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 5.8,
## "Sepal.Width": 2.8,
## "Petal.Length": 5.1,
## "Petal.Width": 2.4,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.4,
## "Sepal.Width": 3.2,
## "Petal.Length": 5.3,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.5,
## "Sepal.Width": 3,
## "Petal.Length": 5.5,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.7,
## "Sepal.Width": 3.8,
## "Petal.Length": 6.7,
## "Petal.Width": 2.2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.7,
## "Sepal.Width": 2.6,
## "Petal.Length": 6.9,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6,
## "Sepal.Width": 2.2,
## "Petal.Length": 5,
## "Petal.Width": 1.5,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.9,
## "Sepal.Width": 3.2,
## "Petal.Length": 5.7,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 5.6,
## "Sepal.Width": 2.8,
## "Petal.Length": 4.9,
## "Petal.Width": 2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.7,
## "Sepal.Width": 2.8,
## "Petal.Length": 6.7,
## "Petal.Width": 2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 2.7,
## "Petal.Length": 4.9,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 3.3,
## "Petal.Length": 5.7,
## "Petal.Width": 2.1,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.2,
## "Sepal.Width": 3.2,
## "Petal.Length": 6,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.2,
## "Sepal.Width": 2.8,
## "Petal.Length": 4.8,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.1,
## "Sepal.Width": 3,
## "Petal.Length": 4.9,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.4,
## "Sepal.Width": 2.8,
## "Petal.Length": 5.6,
## "Petal.Width": 2.1,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.2,
## "Sepal.Width": 3,
## "Petal.Length": 5.8,
## "Petal.Width": 1.6,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.4,
## "Sepal.Width": 2.8,
## "Petal.Length": 6.1,
## "Petal.Width": 1.9,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.9,
## "Sepal.Width": 3.8,
## "Petal.Length": 6.4,
## "Petal.Width": 2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.4,
## "Sepal.Width": 2.8,
## "Petal.Length": 5.6,
## "Petal.Width": 2.2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 2.8,
## "Petal.Length": 5.1,
## "Petal.Width": 1.5,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.1,
## "Sepal.Width": 2.6,
## "Petal.Length": 5.6,
## "Petal.Width": 1.4,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 7.7,
## "Sepal.Width": 3,
## "Petal.Length": 6.1,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 3.4,
## "Petal.Length": 5.6,
## "Petal.Width": 2.4,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.4,
## "Sepal.Width": 3.1,
## "Petal.Length": 5.5,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6,
## "Sepal.Width": 3,
## "Petal.Length": 4.8,
## "Petal.Width": 1.8,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.9,
## "Sepal.Width": 3.1,
## "Petal.Length": 5.4,
## "Petal.Width": 2.1,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 3.1,
## "Petal.Length": 5.6,
## "Petal.Width": 2.4,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.9,
## "Sepal.Width": 3.1,
## "Petal.Length": 5.1,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 5.8,
## "Sepal.Width": 2.7,
## "Petal.Length": 5.1,
## "Petal.Width": 1.9,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.8,
## "Sepal.Width": 3.2,
## "Petal.Length": 5.9,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 3.3,
## "Petal.Length": 5.7,
## "Petal.Width": 2.5,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.7,
## "Sepal.Width": 3,
## "Petal.Length": 5.2,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.3,
## "Sepal.Width": 2.5,
## "Petal.Length": 5,
## "Petal.Width": 1.9,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.5,
## "Sepal.Width": 3,
## "Petal.Length": 5.2,
## "Petal.Width": 2,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 6.2,
## "Sepal.Width": 3.4,
## "Petal.Length": 5.4,
## "Petal.Width": 2.3,
## "Species": "virginica"
## },
## {
## "Sepal.Length": 5.9,
## "Sepal.Width": 3,
## "Petal.Length": 5.1,
## "Petal.Width": 1.8,
## "Species": "virginica"
## }
## ]
converting back to JSON
iris2 <- fromJSON(myjson)
head(iris2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
0109 : Data.table
# install.packages("data.table")
library(data.table)
DF = data.frame(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
head(DF,3)
## x y z
## 1 -0.2982594 a 1.645733
## 2 1.1067457 a -1.126810
## 3 -0.4976846 a 0.666414
DT = data.table(x=rnorm(9),y=rep(c("a","b","c"),each=3),z=rnorm(9))
head(DT,3)
## x y z
## 1: -0.1212826 a 0.7768203
## 2: 1.8888526 a 2.2866758
## 3: 1.1187508 a 0.5435984
seeing all data tables in memory
tables()
## NAME NROW NCOL MB COLS KEY
## [1,] DT 9 3 1 x,y,z
## Total: 1MB
Subsetting rows
DT[,]
## x y z
## 1: -0.1212826 a 0.7768203
## 2: 1.8888526 a 2.2866758
## 3: 1.1187508 a 0.5435984
## 4: -0.7669255 b 0.4105388
## 5: -1.3945684 b 1.0768423
## 6: -0.2158023 b -1.6659843
## 7: 0.6181563 c 0.2287406
## 8: -1.4027316 c -0.2630617
## 9: -1.6008934 c 0.5742421
DT[2,]
## x y z
## 1: 1.888853 a 2.286676
DT[DT$y=="a",]
## x y z
## 1: -0.1212826 a 0.7768203
## 2: 1.8888526 a 2.2866758
## 3: 1.1187508 a 0.5435984
DT[c(2,3)] # row wise
## x y z
## 1: 1.888853 a 2.2866758
## 2: 1.118751 a 0.5435984
DT[,c(2,3)] # col wise subsetting doesnot happen
## y z
## 1: a 0.7768203
## 2: a 2.2866758
## 3: a 0.5435984
## 4: b 0.4105388
## 5: b 1.0768423
## 6: b -1.6659843
## 7: c 0.2287406
## 8: c -0.2630617
## 9: c 0.5742421
column subsetting in data.table
{
x=1
y=2
}
k={print(10);5}
## [1] 10
print(k)
## [1] 5
Using expressions to represent datasets
DT[,list(mean(x),sum(z))]
## V1 V2
## 1: -0.2084938 3.968412
DT[,table((y))]
##
## a b c
## 3 3 3
add new columns
DT[,w:=z^ 2]
DT2 <- DT
DT[,y:=2]
## Warning in `[.data.table`(DT, , `:=`(y, 2)): Coerced 'double' RHS to
## 'character' to match the column's type; may have truncated precision.
## Either change the target column to 'double' first (by creating a new
## 'double' vector length 9 (nrows of entire table) and assign that; i.e.
## 'replace' column), or coerce RHS to 'character' (e.g. 1L, NA_[real|
## integer]_, as.*, etc) to make your intent clear and for speed. Or, set the
## column type correctly up front when you create the table and stick to it,
## please.
head(DT, n=3)
## x y z w
## 1: -0.1212826 2 0.7768203 0.6034498
## 2: 1.8888526 2 2.2866758 5.2288863
## 3: 1.1187508 2 0.5435984 0.2954992
head(DT2,n=3)
## x y z w
## 1: -0.1212826 2 0.7768203 0.6034498
## 2: 1.8888526 2 2.2866758 5.2288863
## 3: 1.1187508 2 0.5435984 0.2954992
multi step operations
DT[,m:={tmp <- (x+z); log2(tmp+5)}]
DT
## x y z w m
## 1: -0.1212826 2 0.7768203 0.60344984 2.499664
## 2: 1.8888526 2 2.2866758 5.22888635 3.197791
## 3: 1.1187508 2 0.5435984 0.29549920 2.736031
## 4: -0.7669255 2 0.4105388 0.16854214 2.215248
## 5: -1.3945684 2 1.0768423 1.15958942 2.227209
## 6: -0.2158023 2 -1.6659843 2.77550373 1.640720
## 7: 0.6181563 2 0.2287406 0.05232227 2.547671
## 8: -1.4027316 2 -0.2630617 0.06920146 1.737344
## 9: -1.6008934 2 0.5742421 0.32975396 1.990355
plyr like operations
DT[,a:=x>0]
DT[,b:=mean(x+w),by=a]
special variables
set.seed(123);
DT <- data.table(x=sample(letters[1:3], 1E5,TRUE))
DT[, .N, by=x]
## x N
## 1: a 33387
## 2: c 33201
## 3: b 33412
keys
DT <- data.table(x=rep(c("a","b","c"),each=100),y=rnorm(300))
setkey(DT,x)
DT['a']
## x y
## 1: a 0.25958973
## 2: a 0.91751072
## 3: a -0.72231834
## 4: a -0.80828402
## 5: a -0.14135202
## 6: a 2.25701345
## 7: a -2.37955015
## 8: a -0.45425393
## 9: a -0.06007418
## 10: a 0.86090061
## 11: a -1.78466393
## 12: a -0.13074225
## 13: a -0.36983749
## 14: a -0.18065990
## 15: a -1.04973030
## 16: a 0.37831550
## 17: a -1.37079353
## 18: a -0.31611578
## 19: a 0.39435003
## 20: a -1.68987831
## 21: a -1.46233527
## 22: a 2.55837664
## 23: a 0.08788697
## 24: a 1.73141492
## 25: a 1.21512638
## 26: a 0.29954390
## 27: a -0.17245754
## 28: a 1.13249663
## 29: a 0.02319828
## 30: a 1.33587399
## 31: a -1.09879007
## 32: a -0.58176064
## 33: a 0.03892452
## 34: a 1.07315441
## 35: a 1.34969593
## 36: a 1.19527937
## 37: a -0.02217912
## 38: a 0.69849448
## 39: a 0.67240626
## 40: a -0.79164585
## 41: a -0.21790545
## 42: a 0.02307037
## 43: a 0.11539395
## 44: a -0.27708029
## 45: a 0.03688377
## 46: a 0.47520014
## 47: a 1.70748924
## 48: a 1.07600560
## 49: a -1.34571320
## 50: a -1.44024891
## 51: a -0.39392783
## 52: a 0.58106297
## 53: a -0.17078819
## 54: a -0.90585446
## 55: a 0.15621346
## 56: a -0.37322530
## 57: a -0.34587104
## 58: a -0.35828720
## 59: a -0.13306601
## 60: a -0.08959642
## 61: a 0.62793032
## 62: a -1.42882873
## 63: a 0.17255399
## 64: a -0.79115025
## 65: a 1.26204078
## 66: a -0.26940548
## 67: a 0.15698296
## 68: a -0.76059823
## 69: a 1.37060069
## 70: a 0.03758155
## 71: a 0.44949417
## 72: a 2.78868764
## 73: a -0.46848614
## 74: a 1.01260608
## 75: a -0.04374086
## 76: a 1.40669725
## 77: a 0.41992874
## 78: a 0.31008615
## 79: a 1.11904687
## 80: a -1.29814018
## 81: a -1.28248182
## 82: a 1.65942788
## 83: a 0.78374544
## 84: a 0.57771022
## 85: a -0.26724640
## 86: a -0.64569141
## 87: a -0.44952912
## 88: a -0.82619821
## 89: a 1.05503854
## 90: a -0.87926983
## 91: a -1.27712832
## 92: a -0.63412243
## 93: a 0.66470047
## 94: a -0.50958183
## 95: a 0.40736335
## 96: a 1.67774776
## 97: a -1.05205570
## 98: a -0.63690737
## 99: a 0.56539163
## 100: a 0.38015779
## x y
joins
DT1 <- data.table(x=c('a','a','b','dt1'),y=1:4)
DT2 <- data.table(x=c('a','b','dt2'),z=5:7)
setkey(DT1, x);
setkey(DT2, x)
merge(DT1,DT2)
## x y z
## 1: a 1 5
## 2: a 2 5
## 3: b 3 6
To read files quick
big_df <- data.frame(x=rnorm(1E6), y=rnorm(1E6))
file <- tempfile()
write.table(big_df, file=file, row.names = FALSE, col.names = TRUE, sep="\t",quote=FALSE)
system.time(fread(file))
## user system elapsed
## 1.50 0.01 1.59
system.time(read.table(file,header=TRUE,sep="\t"))
## user system elapsed
## 5.31 0.05 5.39