# Show which objects currently exist in the global environment.
ls()
## character(0)
# Remove every visible global object, then run a garbage collection, which
# also prints a memory-usage report.
rm(list = ls())
gc() #comments
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 291811 7.8 592000 15.9 460000 12.3
## Vcells 327365 2.5 786432 6.0 677388 5.2
# Record the R version, platform, locale and loaded packages for this run.
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: i386-w64-mingw32/i386 (32-bit)
## Running under: Windows 7 (build 7601) Service Pack 1
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] magrittr_1.5 tools_3.2.2 htmltools_0.2.6 stringi_0.5-5
## [5] rmarkdown_0.7 knitr_1.10.5 stringr_1.0.0 digest_0.6.8
## [9] evaluate_0.7
# Report the working directory, then list everything stored in it.
getwd()
## [1] "C:/Users/dell/Desktop"
list.files()
## [1] "1.png"
## [2] "2.png"
## [3] "5128OS_09_01.jpg"
## [4] "adult.data.txt"
## [5] "airline.sas7bdat"
## [6] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [7] "basicR.html"
## [8] "basicR.R"
## [9] "BigDiamonds.csv"
## [10] "BigDiamonds.csv.zip"
## [11] "Boston.csv"
## [12] "Cars.sav"
## [13] "casestudy"
## [14] "ccFraud.csv"
## [15] "ccFraud.zip"
## [16] "Certificate Doc.docx"
## [17] "CmapServer Download _ Cmap.html"
## [18] "CmapServer Download _ Cmap_files"
## [19] "Coxcombs.jpg"
## [20] "cricketparsing.R"
## [21] "data input.R"
## [22] "data_input.html"
## [23] "dataq.html"
## [24] "dataq.R"
## [25] "dataqualityinR.html"
## [26] "dataqualityinR.R"
## [27] "datatable"
## [28] "datatablerevised.html"
## [29] "datatablerevised.R"
## [30] "day8 session 4.fbr"
## [31] "day9 session 1.fbr"
## [32] "desktop.ini"
## [33] "Dropbox.lnk"
## [34] "exam.html"
## [35] "exam.R"
## [36] "ie_data.xls"
## [37] "lastsave.txt"
## [38] "lastsave2"
## [39] "library.docx"
## [40] "Minard.png"
## [41] "modules"
## [42] "modules.zip"
## [43] "my first code.R"
## [44] "mycode.docx"
## [45] "mycode.html"
## [46] "mycode.R"
## [47] "myfirstRcode.R"
## [48] "New folder"
## [49] "New Folder (2)"
## [50] "new1"
## [51] "Quiz 1 R.docx"
## [52] "revisedR.R"
## [53] "revisedR.spin.R"
## [54] "revisedR.spin.Rmd"
## [55] "rfmanalysis2.html"
## [56] "rfmanalysis2.R"
## [57] "rsconnect"
## [58] "SnowMap_Points.png"
## [59] "test.csv"
## [60] "Untitled (3).wma"
## [61] "Untitled (3).wma.wav"
## [62] "Untitled 88.wma"
# CSV files only: the pattern is anchored at the end of the name, so an entry
# such as "BigDiamonds.csv.zip" is not listed.
list.files(pattern = "\\.(csv|CSV)$")
## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"
## [3] "Boston.csv"
## [4] "ccFraud.csv"
## [5] "test.csv"
# The same idea with grep(). Here ".csv" is an unanchored regular expression
# whose "." matches any character, so names that merely contain it are kept.
grep(".csv", dir(), value = TRUE)
## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"
## [3] "BigDiamonds.csv.zip"
## [4] "Boston.csv"
## [5] "ccFraud.csv"
## [6] "test.csv"
# Build `ab`: one element per directory entry, holding the file name when it
# is a CSV file and the placeholder string "NA" otherwise.
#
# The earlier loop handed the whole grepl(".csv", dir()) vector to ifelse() on
# every iteration, so each ab[i] received only the first element of a
# full-length result. That is what produced the repeated "number of items to
# replace" warnings and the all-"NA" vector recorded below; that transcript
# predates this fix. Testing every name in one vectorised step avoids the
# problem and reads the directory only once. The pattern is the anchored one
# used with dir(pattern = ...) earlier, so "BigDiamonds.csv.zip" is not
# treated as a CSV file.
ab <- dir()
ab[!grepl("\\.(csv|CSV)$", ab)] <- "NA"
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length
# Print the vector built by the loop above.
ab
## [1] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [15] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [29] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [43] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [57] "NA" "NA" "NA" "NA" "NA" "NA"
#?grep
#??pattern
# Check the working directory, set it explicitly to the desktop folder and
# list its contents again.
getwd()
## [1] "C:/Users/dell/Desktop"
setwd(dir = "C:/Users/dell/Desktop")
dir()
## [1] "1.png"
## [2] "2.png"
## [3] "5128OS_09_01.jpg"
## [4] "adult.data.txt"
## [5] "airline.sas7bdat"
## [6] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [7] "basicR.html"
## [8] "basicR.R"
## [9] "BigDiamonds.csv"
## [10] "BigDiamonds.csv.zip"
## [11] "Boston.csv"
## [12] "Cars.sav"
## [13] "casestudy"
## [14] "ccFraud.csv"
## [15] "ccFraud.zip"
## [16] "Certificate Doc.docx"
## [17] "CmapServer Download _ Cmap.html"
## [18] "CmapServer Download _ Cmap_files"
## [19] "Coxcombs.jpg"
## [20] "cricketparsing.R"
## [21] "data input.R"
## [22] "data_input.html"
## [23] "dataq.html"
## [24] "dataq.R"
## [25] "dataqualityinR.html"
## [26] "dataqualityinR.R"
## [27] "datatable"
## [28] "datatablerevised.html"
## [29] "datatablerevised.R"
## [30] "day8 session 4.fbr"
## [31] "day9 session 1.fbr"
## [32] "desktop.ini"
## [33] "Dropbox.lnk"
## [34] "exam.html"
## [35] "exam.R"
## [36] "ie_data.xls"
## [37] "lastsave.txt"
## [38] "lastsave2"
## [39] "library.docx"
## [40] "Minard.png"
## [41] "modules"
## [42] "modules.zip"
## [43] "my first code.R"
## [44] "mycode.docx"
## [45] "mycode.html"
## [46] "mycode.R"
## [47] "myfirstRcode.R"
## [48] "New folder"
## [49] "New Folder (2)"
## [50] "new1"
## [51] "Quiz 1 R.docx"
## [52] "revisedR.R"
## [53] "revisedR.spin.R"
## [54] "revisedR.spin.Rmd"
## [55] "rfmanalysis2.html"
## [56] "rfmanalysis2.R"
## [57] "rsconnect"
## [58] "SnowMap_Points.png"
## [59] "test.csv"
## [60] "Untitled (3).wma"
## [61] "Untitled (3).wma.wav"
## [62] "Untitled 88.wma"
# List the CSV files once more, read Boston.csv into a data frame and preview
# its first rows.
list.files(pattern = "\\.(csv|CSV)$")
## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"
## [3] "Boston.csv"
## [4] "ccFraud.csv"
## [5] "test.csv"
boston <- read.csv(file = "Boston.csv")
head(boston)
## X crim zn indus chas nox rm age dis rad tax ptratio black
## 1 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24
## 2 9.14 NA
## 3 4.03 NA
## 4 2.94 NA
## 5 5.33 NA
## 6 5.21 NA
# Preview the last six rows (six is tail()'s default, spelled out here).
tail(boston, n = 6L)
## X crim zn indus chas nox rm age dis rad tax ptratio
## 501 501 0.22438 0 9.69 0 0.585 6.027 79.7 2.4982 6 391 19.2
## 502 502 0.06263 0 11.93 0 0.573 6.593 69.1 2.4786 1 273 21.0
## 503 503 0.04527 0 11.93 0 0.573 6.120 76.7 2.2875 1 273 21.0
## 504 504 0.06076 0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0
## 505 505 0.10959 0 11.93 0 0.573 6.794 89.3 2.3889 1 273 21.0
## 506 506 0.04741 0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0
## black lstat medv
## 501 396.90 14.33 16.8
## 502 391.99 9.67 22.4
## 503 396.90 9.08 20.6
## 504 396.90 5.64 23.9
## 505 393.45 6.48 22.0
## 506 396.90 7.88 11.9
# Count missing (TRUE) versus present (FALSE) cells across the whole table.
table(is.na(boston))
##
## FALSE TRUE
## 7551 39
# Keep only the rows that have no missing value in any column, then inspect
# the structure of the result.
boston2 <- na.omit(boston)
str(object = boston2)
## 'data.frame': 467 obs. of 15 variables:
## $ X : int 1 41 42 43 44 45 46 47 48 49 ...
## $ crim : num 0.00632 0.03359 0.12744 0.1415 0.15936 ...
## $ zn : num 18 75 0 0 0 0 0 0 0 0 ...
## $ indus : num 2.31 2.95 6.91 6.91 6.91 6.91 6.91 6.91 6.91 6.91 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.428 0.448 0.448 0.448 0.448 0.448 0.448 0.448 0.448 ...
## $ rm : num 6.58 7.02 6.77 6.17 6.21 ...
## $ age : num 65.2 15.8 2.9 6.6 6.5 40 33.8 33.3 85.5 95.3 ...
## $ dis : num 4.09 5.4 5.72 5.72 5.72 ...
## $ rad : int 1 3 3 3 3 3 3 3 3 3 ...
## $ tax : int 296 252 233 233 233 233 233 233 233 233 ...
## $ ptratio: num 15.3 18.3 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 ...
## $ black : num 397 396 385 383 394 ...
## $ lstat : num 4.98 1.98 4.84 5.81 7.44 ...
## $ medv : num 24 34.9 26.6 25.3 24.7 21.2 19.3 20 16.6 14.4 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:39] 2 3 4 5 6 7 8 9 10 11 ...
## .. ..- attr(*, "names")= chr [1:39] "2" "3" "4" "5" ...
# Summary statistics for every column of the NA-free table.
summary(object = boston2)
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:156.5 1st Qu.: 0.08082 1st Qu.: 0.00 1st Qu.: 5.13
## Median :273.0 Median : 0.25199 Median : 0.00 Median :10.01
## Mean :272.9 Mean : 3.86626 Mean : 11.96 Mean :11.46
## 3rd Qu.:389.5 3rd Qu.: 4.30505 3rd Qu.: 20.00 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4480 1st Qu.:5.888 1st Qu.: 43.10
## Median :0.00000 Median :0.5440 Median :6.229 Median : 76.70
## Mean :0.07495 Mean :0.5577 Mean :6.302 Mean : 68.10
## 3rd Qu.:0.00000 3rd Qu.:0.6470 3rd Qu.:6.635 3rd Qu.: 93.95
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
## dis rad tax ptratio
## Min. : 1.130 Min. : 1.00 Min. :187.0 Min. :12.60
## 1st Qu.: 2.031 1st Qu.: 4.00 1st Qu.:277.0 1st Qu.:17.00
## Median : 2.894 Median : 5.00 Median :358.0 Median :18.90
## Mean : 3.721 Mean :10.01 Mean :417.8 Mean :18.38
## 3rd Qu.: 5.118 3rd Qu.:24.00 3rd Qu.:666.0 3rd Qu.:20.20
## Max. :12.127 Max. :24.00 Max. :711.0 Max. :22.00
## black lstat medv
## Min. : 0.32 Min. : 1.730 Min. : 5.00
## 1st Qu.:374.71 1st Qu.: 6.865 1st Qu.:17.20
## Median :391.45 Median :11.100 Median :21.50
## Mean :355.21 Mean :12.563 Mean :22.75
## 3rd Qu.:396.26 3rd Qu.:16.820 3rd Qu.:25.15
## Max. :396.90 Max. :37.970 Max. :50.00
# medv taken as a plain vector from the NA-free copy ...
summary(boston2[["medv"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 17.20 21.50 22.75 25.15 50.00
# ... versus column 15 of the original table kept as a one-column data frame,
# whose summary also reports the number of NA's.
summary(boston[15L])
## medv
## Min. : 5.00
## 1st Qu.:17.20
## Median :21.50
## Mean :22.75
## 3rd Qu.:25.15
## Max. :50.00
## NA's :39
# Column 15 (medv) of the original table extracted as a vector; its summary
# includes the count of NA's.
summary(boston[, 15])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.00 17.20 21.50 22.75 25.15 50.00 39
# The same column addressed by name. This replaces attach(boston) followed by
# summary(medv): attach() puts a copy of every column on the search path,
# where it can be masked by later attaches and does not follow later changes
# to `boston`.
summary(boston$medv)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.00 17.20 21.50 22.75 25.15 50.00 39
# Group-by with SQL: average medv for each value of chas.
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
sqldf(x = "select chas, avg(medv) from boston group by chas")
## Loading required package: tcltk
## chas avg(medv)
## 1 0 22.28588
## 2 1 28.44000
# The NA-free copy returns the same averages in the recorded run.
sqldf(x = "select chas, avg(medv) from boston2 group by chas")
## chas avg(medv)
## 1 0 22.28588
## 2 1 28.44000
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Mean medv for each chas level. with() evaluates the bare column names inside
# boston2, so nothing has to be attach()ed to the search path (attaching a
# second table masks the columns of any table attached earlier). The Hmisc::
# prefix keeps this call on Hmisc's summarize() even when another attached
# package exports a function of the same name.
with(boston2, Hmisc::summarize(medv, chas, mean))
## chas medv
## 1 0 22.28588
## 2 1 28.44000
# Unanchored pattern: also matches "BigDiamonds.csv.zip".
dir(pattern = "\\.csv")
## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"
## [3] "BigDiamonds.csv.zip"
## [4] "Boston.csv"
## [5] "ccFraud.csv"
## [6] "test.csv"
library(data.table)
# Read the large CSV with data.table's fread(). The progress messages from the
# recorded run are kept below as ## comments; left uncommented they are not
# valid R and stop the file from parsing.
BigDiamonds <- fread("BigDiamonds.csv")
##
## Read 21.7% of 598024 rows
## Read 45.1% of 598024 rows
## Read 66.9% of 598024 rows
## Read 73.6% of 598024 rows
## Read 85.3% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:08
# List the data.tables held in memory; no key is set yet.
tables()
## NAME NROW NCOL MB
## [1,] BigDiamonds 598,024 13 75
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## KEY
## [1,]
## Total: 75MB
# Key the table on color (column named as a string); tables() then reports
# the key, and the printed table below is ordered by color.
setkeyv(BigDiamonds, cols = "color")
tables()
## NAME NROW NCOL MB
## [1,] BigDiamonds 598,024 13 75
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## KEY
## [1,] color
## Total: 75MB
BigDiamonds
## V1 carat cut color clarity table depth cert
## 1: 4 0.21 V.Good D I1 60 60.6 GIA
## 2: 8 0.22 V.Good D I1 61 59.2 GIA
## 3: 13 0.22 V.Good D SI2 57 59.7 GIA
## 4: 22 0.22 Good D VS2 61 63.7 GIA
## 5: 24 0.21 Good D SI2 62 64.4 IGI
## ---
## 598020: 596914 8.00 V.Good L VVS2 59 61.6 IGI
## 598021: 597020 6.03 V.Good L IF 63 58.4 GIA
## 598022: 597649 7.05 Ideal L SI1 56 62.5 GIA
## 598023: 597674 7.17 V.Good L VS2 63 56.1 GIA
## 598024: 597720 7.04 Ideal L SI1 59 59.8 GIA
## measurements price x y z
## 1: 3.80 x 3.82 x 2.31 NA 3.80 3.82 2.31
## 2: 3.95 x 3.97 x 2.34 NA 3.95 3.97 2.34
## 3: 3.94 x 3.93 x 2.35 NA 3.94 3.93 2.35
## 4: 3.77 x 3.73 x 2.39 NA 3.77 3.73 2.39
## 5: 3.73 x 3.78 x 2.42 NA 3.73 3.78 2.42
## ---
## 598020: 12.72 x 12.63 x 7.84 86650 12.72 12.63 7.84
## 598021: 11.97 x 11.94 x 6.98 87950 11.97 11.94 6.98
## 598022: 7.69 x 12.25 x 12.35 95000 7.69 12.25 12.35
## 598023: 12.90 x 12.97 x 7.25 95253 12.90 12.97 7.25
## 598024: 7.51 x 12.49 x 12.62 95814 7.51 12.49 12.62
# Plain data.frame indexing: the value in row 4, column 14, followed by the
# class of the object.
boston[4L, 14L]
## [1] 2.94
class(boston)
## [1] "data.frame"
# Mean price and row count per cert for stones above 3 carats.
# The argument that drops missing values is `na.rm`. The earlier `rm = T` is
# not an argument of mean(), so it was silently ignored and NAs were kept.
BigDiamonds[carat > 3, .(mean(price, na.rm = TRUE), .N), by = cert]
## cert V1 N
## 1: EGL 37473.03 3583
## 2: OTHER 38444.25 363
## 3: EGL USA 40325.90 1886
## 4: EGL ISRAEL 35920.33 1100
## 5: EGL Intl. 35876.87 1060
## 6: HRD 50919.86 1797
## 7: IGI 45074.78 1660
## 8: GIA 53947.51 14296
## 9: AGS 47334.37 297
# Mean price and row count per cert over all rows.
# The argument that drops missing values is `na.rm`; the earlier `rm = T` was
# silently ignored by mean(). The output recorded below predates this fix: it
# shows NA for every cert group that contains a missing price.
BigDiamonds[, .(mean(price, na.rm = TRUE), .N), by = cert]
## cert V1 N
## 1: GIA NA 463555
## 2: IGI NA 43667
## 3: EGL NA 33814
## 4: EGL Intl. 8964.944 11447
## 5: EGL USA NA 16079
## 6: OTHER NA 5267
## 7: EGL ISRAEL 9781.358 11301
## 8: AGS 14041.455 2958
## 9: HRD 16951.688 9936
# BigDiamonds is a data.table (which is also a data.frame).
class(BigDiamonds)
## [1] "data.table" "data.frame"
# Convert boston to a data.table as well and list the tables in memory again.
boston <- data.table(boston)
tables()
## NAME NROW NCOL MB
## [1,] BigDiamonds 598,024 13 75
## [2,] boston 506 15 1
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## [2,] X,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
## KEY
## [1,] color
## [2,]
## Total: 76MB
# Exercise: produce the grouped summary for cut and for color,
# once for carat > 3 and once for all rows,
# reporting mean price and mean carat.
# By cut, stones above 3 carats (unnamed results print as V1, V2).
BigDiamonds[
  carat > 3,
  list(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = cut
]
## cut V1 V2 N
## 1: Good 43480.94 3.675146 1642
## 2: V.Good 45957.94 3.616123 6946
## 3: Ideal 49433.78 3.548771 17454
# By cut, all rows: mean price (V1), mean carat (V2) and row count.
BigDiamonds[
  ,
  list(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = cut
]
## cut V1 V2 N
## 1: V.Good 7430.527 1.0247597 168896
## 2: Good 5254.792 0.9003031 59680
## 3: Ideal 9919.277 1.1201936 369448
# By color, stones above 3 carats: mean price (V1), mean carat (V2), count.
BigDiamonds[
  carat > 3,
  list(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = color
]
## color V1 V2 N
## 1: D 53688.65 3.523589 705
## 2: E 56865.67 3.416525 1249
## 3: F 54854.90 3.405824 2354
## 4: G 53586.24 3.446023 3586
## 5: H 51792.50 3.535999 4971
## 6: I 47932.87 3.616689 4521
## 7: J 43105.64 3.668633 4601
## 8: K 38597.37 3.713372 3034
## 9: L 32981.18 3.812008 1021
# By color, all rows: mean price (V1), mean carat (V2) and row count.
BigDiamonds[
  ,
  list(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = color
]
## color V1 V2 N
## 1: D 8266.346 0.8266182 73630
## 2: E 7282.990 0.8318824 93483
## 3: F 8234.730 0.9410532 93573
## 4: G 8984.200 1.0638408 96204
## 5: H 9941.795 1.2099407 86619
## 6: I 9541.319 1.2712823 70282
## 7: J 9423.581 1.3475399 48709
## 8: K 9694.257 1.4950646 25868
## 9: L 7109.228 1.3632705 9656
# By every cut/color combination, stones above 3 carats.
BigDiamonds[
  carat > 3,
  list(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = list(cut, color)
]
## cut color V1 V2 N
## 1: Good D 46690.23 3.541250 48
## 2: V.Good D 51962.22 3.517608 255
## 3: Ideal D 55619.40 3.525274 402
## 4: V.Good E 54031.27 3.409032 372
## 5: Good E 46289.66 3.462113 71
## 6: Ideal E 59105.49 3.415968 806
## 7: Ideal F 57837.19 3.401535 1518
## 8: Good F 48276.57 3.513913 161
## 9: V.Good F 49717.12 3.389689 675
## 10: Ideal G 55205.09 3.449066 2366
## 11: Good G 48977.63 3.442813 192
## 12: V.Good G 50721.12 3.439621 1028
## 13: Good H 46664.41 3.581705 264
## 14: Ideal H 53656.42 3.524833 3569
## 15: V.Good H 47136.52 3.560413 1138
## 16: Good I 40890.21 3.729776 313
## 17: V.Good I 45864.52 3.632090 1062
## 18: Ideal I 49331.78 3.600238 3146
## 19: V.Good J 42397.76 3.759270 1233
## 20: Good J 40872.16 3.794743 272
## 21: Ideal J 43583.78 3.621457 3096
## 22: V.Good K 39664.82 3.845306 882
## 23: Ideal K 37794.88 3.640901 1953
## 24: Good K 41741.92 3.839849 199
## 25: Ideal L 31706.87 3.706706 598
## 26: V.Good L 35093.76 3.962458 301
## 27: Good L 34015.24 3.956967 122
## cut color V1 V2 N
# By every cut/color combination, all rows.
BigDiamonds[
  ,
  list(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = list(cut, color)
]
## cut color V1 V2 N
## 1: V.Good D 6443.311 0.7976032 21591
## 2: Good D 4656.791 0.7382723 6604
## 3: Ideal D 9654.869 0.8532475 45435
## 4: Good E 4185.086 0.7284208 9733
## 5: Ideal E 8341.317 0.8541993 55547
## 6: V.Good E 6259.823 0.8236333 28203
## 7: Good F 5070.773 0.8157051 9141
## 8: V.Good F 7003.693 0.9147793 26284
## 9: Ideal F 9285.867 0.9726345 58148
## 10: Good G 5661.293 0.9046262 8923
## 11: V.Good G 8075.610 1.0461379 25214
## 12: Ideal G 9826.261 1.0939217 62067
## 13: V.Good H 8114.545 1.1350359 22993
## 14: Good H 6152.773 1.0027921 7600
## 15: Ideal H 11203.491 1.2687815 56026
## 16: Good I 5540.319 1.0024743 7380
## 17: V.Good I 8031.530 1.1803588 19902
## 18: Ideal I 10923.938 1.3595000 43000
## 19: Good J 5609.094 1.0590106 5357
## 20: V.Good J 8394.493 1.2758884 13912
## 21: Ideal J 10599.596 1.4339008 29440
## 22: V.Good K 8865.398 1.4151655 7672
## 23: Good K 5752.414 1.1153879 3467
## 24: Ideal K 11050.691 1.6260527 14729
## 25: Good L 5156.013 1.1368339 1475
## 26: V.Good L 6542.255 1.2831232 3125
## 27: Ideal L 8027.597 1.4788667 5056
## cut color V1 V2 N
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the five columns used in the dplyr examples, then list the
# data.tables in memory.
diamonds2 <- BigDiamonds %>%
  select(carat, price, cut, color, cert)
tables()
## NAME NROW NCOL MB
## [1,] BigDiamonds 598,024 13 75
## [2,] boston 506 15 1
## [3,] diamonds2 598,024 5 14
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## [2,] X,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
## [3,] carat,price,cut,color,cert
## KEY
## [1,] color
## [2,]
## [3,] color
## Total: 90MB
# Stones above 3 carats, grouped by color: row count, mean price, mean carat.
diamonds3 <- diamonds2 %>% filter(carat > 3)
by_color <- diamonds3 %>% group_by(color)
by_color %>%
  summarise(
    count = n(),
    avg_price = mean(price, na.rm = TRUE),
    avg_size = mean(carat, na.rm = TRUE)
  )
## Source: local data table [9 x 4]
##
## color count avg_price avg_size
## 1 D 705 53688.65 3.523589
## 2 E 1249 56865.67 3.416525
## 3 F 2354 54854.90 3.405824
## 4 G 3586 53586.24 3.446023
## 5 H 4971 51792.50 3.535999
## 6 I 4521 47932.87 3.616689
## 7 J 4601 43105.64 3.668633
## 8 K 3034 38597.37 3.713372
## 9 L 1021 32981.18 3.812008
# Add a price-per-carat column to the filtered stones and print the result.
diamonds4 <- diamonds3 %>%
  mutate(price_by_carat = price / carat)
diamonds4
## carat price cut color cert price_by_carat
## 1: 4.72 2850 Good D EGL 603.8136
## 2: 3.01 10588 Good D EGL 3517.6080
## 3: 3.02 10883 Good D EGL 3603.6424
## 4: 3.01 11350 V.Good D OTHER 3770.7641
## 5: 3.01 11811 V.Good D EGL 3923.9203
## ---
## 26038: 8.00 86650 V.Good L IGI 10831.2500
## 26039: 6.03 87950 V.Good L GIA 14585.4063
## 26040: 7.05 95000 Ideal L GIA 13475.1773
## 26041: 7.17 95253 V.Good L GIA 13284.9372
## 26042: 7.04 95814 Ideal L GIA 13609.9432
# Regroup by color (now including price_by_carat) and report the row count
# and mean price per carat for each color.
by_color <- diamonds4 %>% group_by(color)
by_color %>%
  summarise(
    count = n(),
    avg_price_carat = mean(price_by_carat, na.rm = TRUE)
  )
## Source: local data table [9 x 3]
##
## color count avg_price_carat
## 1 D 705 15810.288
## 2 E 1249 17328.455
## 3 F 2354 16474.422
## 4 G 3586 15873.049
## 5 H 4971 14733.579
## 6 I 4521 13205.101
## 7 J 4601 11555.528
## 8 K 3034 10155.675
## 9 L 1021 8378.305
# Store chas as a factor so it can drive the colour of the density curves.
boston2[["factor_chas"]] <- as.factor(boston2[["chas"]])
library(ggplot2)
# Density of age, one curve per chas level.
ggplot() +
  geom_density(
    mapping = aes(x = age, y = ..density.., colour = factor_chas),
    data = boston2
  )

# Density of medv, one curve per chas level.
ggplot() +
  geom_density(
    mapping = aes(x = medv, y = ..density.., colour = factor_chas),
    data = boston2
  )

#qplot(medv, data=boston2, geom="density",fill=as.factor(chas),alpha=I(0.5))
#try it for the BigDiamonds data set: price and carat across cut, color and cert, separately
#qplot(price, data=diamonds4, geom="density", fill=as.factor(cut), alpha=I(0.2))
# Print the table (with its price_by_carat column) before plotting it.
diamonds4
## carat price cut color cert price_by_carat
## 1: 4.72 2850 Good D EGL 603.8136
## 2: 3.01 10588 Good D EGL 3517.6080
## 3: 3.02 10883 Good D EGL 3603.6424
## 4: 3.01 11350 V.Good D OTHER 3770.7641
## 5: 3.01 11811 V.Good D EGL 3923.9203
## ---
## 26038: 8.00 86650 V.Good L IGI 10831.2500
## 26039: 6.03 87950 V.Good L GIA 14585.4063
## 26040: 7.05 95000 Ideal L GIA 13475.1773
## 26041: 7.17 95253 V.Good L GIA 13284.9372
## 26042: 7.04 95814 Ideal L GIA 13609.9432
# Price density coloured by cut, one panel per color laid out side by side.
ggplot(data = diamonds4, mapping = aes(x = price, colour = cut)) +
  geom_density() +
  facet_grid(. ~ color)

# Same densities with the color panels stacked vertically instead.
ggplot(data = diamonds4, mapping = aes(x = price, colour = cut)) +
  geom_density() +
  facet_grid(color ~ .)

# Price density coloured by clarity, one row of panels per cut.
# NOTE(review): this plots an object named `diamonds`, which this script never
# creates (presumably the dataset shipped with ggplot2), not the diamonds4 or
# BigDiamonds tables used around it. Confirm this is intended.
ggplot(data = diamonds, mapping = aes(x = price, colour = clarity)) +
  geom_density() +
  facet_grid(cut ~ .)

# List every data.table currently in memory with its size, columns and key.
tables()
## NAME NROW NCOL MB
## [1,] BigDiamonds 598,024 13 75
## [2,] boston 506 15 1
## [3,] by_color 26,042 6 1
## [4,] diamonds2 598,024 5 14
## [5,] diamonds3 26,042 5 1
## [6,] diamonds4 26,042 6 1
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## [2,] X,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
## [3,] carat,price,cut,color,cert,price_by_carat
## [4,] carat,price,cut,color,cert
## [5,] carat,price,cut,color,cert
## [6,] carat,price,cut,color,cert,price_by_carat
## KEY
## [1,] color
## [2,]
## [3,] color
## [4,] color
## [5,] color
## [6,] color
## Total: 93MB
# Scatter plot of carat against price, coloured by clarity, with one row of
# panels per cut (the facets act as the group-by).
ggplot(
  data = BigDiamonds,
  mapping = aes(x = price, y = carat, colour = clarity)
) +
  geom_point() +
  facet_grid(cut ~ .) #group by
## Warning: Removed 265 rows containing missing values (geom_point).
## Warning: Removed 102 rows containing missing values (geom_point).
## Warning: Removed 346 rows containing missing values (geom_point).

# 15-minute quiz on the BigDiamonds data: answer these questions.
# Which color is most expensive?
# Which cut is least expensive?
# Which clarity gives the best price / carat size?
# Which cert gives the least price / carat size?
# BUT ANSWER THEM ONLY USING GGPLOT AND NOT USING TABLES