# revisedR.R

# Show the names of the objects currently defined in the workspace.
ls()

## character(0)

# Remove every object that ls() reports, then run a garbage collection;
# gc() also prints a memory-usage summary.
rm(list = ls())
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 291811  7.8     592000 15.9   460000 12.3
## Vcells 327365  2.5     786432  6.0   677388  5.2

# Record the R version, platform, locale and loaded packages of this session.
sessionInfo()

## R version 3.2.2 (2015-08-14)
## Platform: i386-w64-mingw32/i386 (32-bit)
## Running under: Windows 7 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
## [1] magrittr_1.5    tools_3.2.2     htmltools_0.2.6 stringi_0.5-5  
## [5] rmarkdown_0.7   knitr_1.10.5    stringr_1.0.0   digest_0.6.8   
## [9] evaluate_0.7

# Report the current working directory.
getwd()

## [1] "C:/Users/dell/Desktop"

# List every entry in the working directory (dir() is an alias of list.files()).
list.files()

##  [1] "1.png"                                                              
##  [2] "2.png"                                                              
##  [3] "5128OS_09_01.jpg"                                                   
##  [4] "adult.data.txt"                                                     
##  [5] "airline.sas7bdat"                                                   
##  [6] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
##  [7] "basicR.html"                                                        
##  [8] "basicR.R"                                                           
##  [9] "BigDiamonds.csv"                                                    
## [10] "BigDiamonds.csv.zip"                                                
## [11] "Boston.csv"                                                         
## [12] "Cars.sav"                                                           
## [13] "casestudy"                                                          
## [14] "ccFraud.csv"                                                        
## [15] "ccFraud.zip"                                                        
## [16] "Certificate Doc.docx"                                               
## [17] "CmapServer Download _ Cmap.html"                                    
## [18] "CmapServer Download _ Cmap_files"                                   
## [19] "Coxcombs.jpg"                                                       
## [20] "cricketparsing.R"                                                   
## [21] "data input.R"                                                       
## [22] "data_input.html"                                                    
## [23] "dataq.html"                                                         
## [24] "dataq.R"                                                            
## [25] "dataqualityinR.html"                                                
## [26] "dataqualityinR.R"                                                   
## [27] "datatable"                                                          
## [28] "datatablerevised.html"                                              
## [29] "datatablerevised.R"                                                 
## [30] "day8 session 4.fbr"                                                 
## [31] "day9 session 1.fbr"                                                 
## [32] "desktop.ini"                                                        
## [33] "Dropbox.lnk"                                                        
## [34] "exam.html"                                                          
## [35] "exam.R"                                                             
## [36] "ie_data.xls"                                                        
## [37] "lastsave.txt"                                                       
## [38] "lastsave2"                                                          
## [39] "library.docx"                                                       
## [40] "Minard.png"                                                         
## [41] "modules"                                                            
## [42] "modules.zip"                                                        
## [43] "my first code.R"                                                    
## [44] "mycode.docx"                                                        
## [45] "mycode.html"                                                        
## [46] "mycode.R"                                                           
## [47] "myfirstRcode.R"                                                     
## [48] "New folder"                                                         
## [49] "New Folder (2)"                                                     
## [50] "new1"                                                               
## [51] "Quiz 1 R.docx"                                                      
## [52] "revisedR.R"                                                         
## [53] "revisedR.spin.R"                                                    
## [54] "revisedR.spin.Rmd"                                                  
## [55] "rfmanalysis2.html"                                                  
## [56] "rfmanalysis2.R"                                                     
## [57] "rsconnect"                                                          
## [58] "SnowMap_Points.png"                                                 
## [59] "test.csv"                                                           
## [60] "Untitled (3).wma"                                                   
## [61] "Untitled (3).wma.wav"                                               
## [62] "Untitled 88.wma"

# Keep only names that end in ".csv" or ".CSV": the dot is escaped and the
# pattern is anchored at the end of the name.
list.files(pattern = "\\.(csv|CSV)$")

## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"                                                    
## [3] "Boston.csv"                                                         
## [4] "ccFraud.csv"                                                        
## [5] "test.csv"

# Same listing via grep(). The earlier pattern ".csv" treated the dot as
# "any character" and was not anchored, so it also returned
# "BigDiamonds.csv.zip". Escaping the dot and anchoring with $ keeps only real
# CSV files; value = TRUE returns the names directly, so dir() is called once.
grep("\\.(csv|CSV)$", dir(), value = TRUE)

## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"                                                    
## [3] "Boston.csv"                                                         
## [4] "ccFraud.csv"                                                        
## [5] "test.csv"

# Build ab: one element per directory entry, holding the file name when it
# ends in ".csv"/".CSV" and the placeholder string "NA" otherwise.
#
# The earlier loop passed the whole listing to grepl() on every iteration, so
# ifelse() returned one value per file. Only the first of those values fitted
# into ab[i], which raised a warning per iteration and left every element "NA".
# The vectorised form below tests each name once and needs no loop.
#
# NOTE: the warnings and the all-"NA" vector recorded below were produced by
# that earlier loop; re-run the script to refresh the transcript.
files <- dir()
is_csv <- grepl("\\.(csv|CSV)$", files)
ab <- rep("NA", length(files))
ab[is_csv] <- files[is_csv]

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

## Warning in ab[i] = ifelse(grepl(".csv", dir()), dir()[i], "NA"): number of
## items to replace is not a multiple of replacement length

# Print the contents of ab.
ab

##  [1] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [15] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [29] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [43] "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA"
## [57] "NA" "NA" "NA" "NA" "NA" "NA"

# Help lookups kept for reference (uncomment to run):
#   ?grep      opens the help page for grep()
#   ??pattern  searches the installed help for "pattern"
getwd()

## [1] "C:/Users/dell/Desktop"

# Set the working directory explicitly, then list its contents.
# NOTE(review): this absolute path is the one getwd() reported on the recording
# machine; the call fails wherever that directory does not exist.
setwd("C:/Users/dell/Desktop")
dir()

##  [1] "1.png"                                                              
##  [2] "2.png"                                                              
##  [3] "5128OS_09_01.jpg"                                                   
##  [4] "adult.data.txt"                                                     
##  [5] "airline.sas7bdat"                                                   
##  [6] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
##  [7] "basicR.html"                                                        
##  [8] "basicR.R"                                                           
##  [9] "BigDiamonds.csv"                                                    
## [10] "BigDiamonds.csv.zip"                                                
## [11] "Boston.csv"                                                         
## [12] "Cars.sav"                                                           
## [13] "casestudy"                                                          
## [14] "ccFraud.csv"                                                        
## [15] "ccFraud.zip"                                                        
## [16] "Certificate Doc.docx"                                               
## [17] "CmapServer Download _ Cmap.html"                                    
## [18] "CmapServer Download _ Cmap_files"                                   
## [19] "Coxcombs.jpg"                                                       
## [20] "cricketparsing.R"                                                   
## [21] "data input.R"                                                       
## [22] "data_input.html"                                                    
## [23] "dataq.html"                                                         
## [24] "dataq.R"                                                            
## [25] "dataqualityinR.html"                                                
## [26] "dataqualityinR.R"                                                   
## [27] "datatable"                                                          
## [28] "datatablerevised.html"                                              
## [29] "datatablerevised.R"                                                 
## [30] "day8 session 4.fbr"                                                 
## [31] "day9 session 1.fbr"                                                 
## [32] "desktop.ini"                                                        
## [33] "Dropbox.lnk"                                                        
## [34] "exam.html"                                                          
## [35] "exam.R"                                                             
## [36] "ie_data.xls"                                                        
## [37] "lastsave.txt"                                                       
## [38] "lastsave2"                                                          
## [39] "library.docx"                                                       
## [40] "Minard.png"                                                         
## [41] "modules"                                                            
## [42] "modules.zip"                                                        
## [43] "my first code.R"                                                    
## [44] "mycode.docx"                                                        
## [45] "mycode.html"                                                        
## [46] "mycode.R"                                                           
## [47] "myfirstRcode.R"                                                     
## [48] "New folder"                                                         
## [49] "New Folder (2)"                                                     
## [50] "new1"                                                               
## [51] "Quiz 1 R.docx"                                                      
## [52] "revisedR.R"                                                         
## [53] "revisedR.spin.R"                                                    
## [54] "revisedR.spin.Rmd"                                                  
## [55] "rfmanalysis2.html"                                                  
## [56] "rfmanalysis2.R"                                                     
## [57] "rsconnect"                                                          
## [58] "SnowMap_Points.png"                                                 
## [59] "test.csv"                                                           
## [60] "Untitled (3).wma"                                                   
## [61] "Untitled (3).wma.wav"                                               
## [62] "Untitled 88.wma"

# List the CSV files available for import (names ending in .csv or .CSV).
list.files(pattern = "\\.(csv|CSV)$")

## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"                                                    
## [3] "Boston.csv"                                                         
## [4] "ccFraud.csv"                                                        
## [5] "test.csv"

# Read Boston.csv from the working directory into a data.frame and preview
# its first rows.
boston <- read.csv(file = "Boston.csv")
head(boston)

##   X    crim zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 1 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98   24
## 2  9.14   NA
## 3  4.03   NA
## 4  2.94   NA
## 5  5.33   NA
## 6  5.21   NA

# Preview the last rows of boston.
tail(boston)

##       X    crim zn indus chas   nox    rm  age    dis rad tax ptratio
## 501 501 0.22438  0  9.69    0 0.585 6.027 79.7 2.4982   6 391    19.2
## 502 502 0.06263  0 11.93    0 0.573 6.593 69.1 2.4786   1 273    21.0
## 503 503 0.04527  0 11.93    0 0.573 6.120 76.7 2.2875   1 273    21.0
## 504 504 0.06076  0 11.93    0 0.573 6.976 91.0 2.1675   1 273    21.0
## 505 505 0.10959  0 11.93    0 0.573 6.794 89.3 2.3889   1 273    21.0
## 506 506 0.04741  0 11.93    0 0.573 6.030 80.8 2.5050   1 273    21.0
##      black lstat medv
## 501 396.90 14.33 16.8
## 502 391.99  9.67 22.4
## 503 396.90  9.08 20.6
## 504 396.90  5.64 23.9
## 505 393.45  6.48 22.0
## 506 396.90  7.88 11.9

# Count missing (TRUE) versus present (FALSE) cells across the whole data frame.
table(is.na(boston))

## 
## FALSE  TRUE 
##  7551    39

# Drop every row that contains at least one NA, then inspect the structure of
# the result; na.omit() records the removed rows in the "na.action" attribute.
boston2 <- na.omit(boston)
str(boston2)

## 'data.frame':    467 obs. of  15 variables:
##  $ X      : int  1 41 42 43 44 45 46 47 48 49 ...
##  $ crim   : num  0.00632 0.03359 0.12744 0.1415 0.15936 ...
##  $ zn     : num  18 75 0 0 0 0 0 0 0 0 ...
##  $ indus  : num  2.31 2.95 6.91 6.91 6.91 6.91 6.91 6.91 6.91 6.91 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.428 0.448 0.448 0.448 0.448 0.448 0.448 0.448 0.448 ...
##  $ rm     : num  6.58 7.02 6.77 6.17 6.21 ...
##  $ age    : num  65.2 15.8 2.9 6.6 6.5 40 33.8 33.3 85.5 95.3 ...
##  $ dis    : num  4.09 5.4 5.72 5.72 5.72 ...
##  $ rad    : int  1 3 3 3 3 3 3 3 3 3 ...
##  $ tax    : int  296 252 233 233 233 233 233 233 233 233 ...
##  $ ptratio: num  15.3 18.3 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 ...
##  $ black  : num  397 396 385 383 394 ...
##  $ lstat  : num  4.98 1.98 4.84 5.81 7.44 ...
##  $ medv   : num  24 34.9 26.6 25.3 24.7 21.2 19.3 20 16.6 14.4 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:39] 2 3 4 5 6 7 8 9 10 11 ...
##   .. ..- attr(*, "names")= chr [1:39] "2" "3" "4" "5" ...

# Summary statistics for every column of the NA-free data frame.
summary(boston2)

##        X              crim                zn             indus      
##  Min.   :  1.0   Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46  
##  1st Qu.:156.5   1st Qu.: 0.08082   1st Qu.:  0.00   1st Qu.: 5.13  
##  Median :273.0   Median : 0.25199   Median :  0.00   Median :10.01  
##  Mean   :272.9   Mean   : 3.86626   Mean   : 11.96   Mean   :11.46  
##  3rd Qu.:389.5   3rd Qu.: 4.30505   3rd Qu.: 20.00   3rd Qu.:18.10  
##  Max.   :506.0   Max.   :88.97620   Max.   :100.00   Max.   :27.74  
##       chas              nox               rm             age        
##  Min.   :0.00000   Min.   :0.3850   Min.   :3.561   Min.   :  2.90  
##  1st Qu.:0.00000   1st Qu.:0.4480   1st Qu.:5.888   1st Qu.: 43.10  
##  Median :0.00000   Median :0.5440   Median :6.229   Median : 76.70  
##  Mean   :0.07495   Mean   :0.5577   Mean   :6.302   Mean   : 68.10  
##  3rd Qu.:0.00000   3rd Qu.:0.6470   3rd Qu.:6.635   3rd Qu.: 93.95  
##  Max.   :1.00000   Max.   :0.8710   Max.   :8.780   Max.   :100.00  
##       dis              rad             tax           ptratio     
##  Min.   : 1.130   Min.   : 1.00   Min.   :187.0   Min.   :12.60  
##  1st Qu.: 2.031   1st Qu.: 4.00   1st Qu.:277.0   1st Qu.:17.00  
##  Median : 2.894   Median : 5.00   Median :358.0   Median :18.90  
##  Mean   : 3.721   Mean   :10.01   Mean   :417.8   Mean   :18.38  
##  3rd Qu.: 5.118   3rd Qu.:24.00   3rd Qu.:666.0   3rd Qu.:20.20  
##  Max.   :12.127   Max.   :24.00   Max.   :711.0   Max.   :22.00  
##      black            lstat             medv      
##  Min.   :  0.32   Min.   : 1.730   Min.   : 5.00  
##  1st Qu.:374.71   1st Qu.: 6.865   1st Qu.:17.20  
##  Median :391.45   Median :11.100   Median :21.50  
##  Mean   :355.21   Mean   :12.563   Mean   :22.75  
##  3rd Qu.:396.26   3rd Qu.:16.820   3rd Qu.:25.15  
##  Max.   :396.90   Max.   :37.970   Max.   :50.00

# Summarise a single column, extracted as a vector with $.
summary(boston2$medv)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   17.20   21.50   22.75   25.15   50.00

# Single-bracket indexing keeps column 15 (medv) as a one-column data.frame,
# so summary() prints a table; the original data still reports its NA count.
summary(boston[15])

##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.20  
##  Median :21.50  
##  Mean   :22.75  
##  3rd Qu.:25.15  
##  Max.   :50.00  
##  NA's   :39

# Row/column indexing extracts column 15 as a plain vector, so summary()
# prints the one-line numeric form.
summary(boston[,15])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.00   17.20   21.50   22.75   25.15   50.00      39

# Summarise medv by bare column name without attach(). attach() puts a copy of
# the data frame on the search path that stays there for the rest of the
# session and is not updated when boston is reassigned later in this script.
# with() evaluates just this one expression against boston's columns.
with(boston, summary(medv))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5.00   17.20   21.50   22.75   25.15   50.00      39

#groupby
library(sqldf)

## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI

# Group-by with SQL: mean medv for each value of chas, computed on boston
# (the data frame that still contains missing medv values).
sqldf("select chas, avg(medv) from boston group by chas")

## Loading required package: tcltk

##   chas avg(medv)
## 1    0  22.28588
## 2    1  28.44000

# Same query on the NA-free boston2; the recorded result is identical to the
# one obtained from boston.
sqldf("select chas, avg(medv) from boston2 group by chas")

##   chas avg(medv)
## 1    0  22.28588
## 2    1  28.44000

library(Hmisc)

## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

# Group-wise mean of medv by chas using Hmisc's summarize().
# with() supplies boston2's columns for this one call, so nothing is attached
# to the search path and no masking message is emitted.
with(boston2, summarize(medv, chas, mean))

##   chas     medv
## 1    0 22.28588
## 2    1 28.44000

# List the CSV files before importing one of them. The pattern is anchored
# with $ so that archives such as "BigDiamonds.csv.zip" are not reported.
dir(pattern = "\\.(csv|CSV)$")

## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"                                                    
## [3] "Boston.csv"                                                         
## [4] "ccFraud.csv"                                                        
## [5] "test.csv"

library(data.table)
# Read BigDiamonds.csv with data.table's fread(); the result is a data.table.
BigDiamonds <- fread("BigDiamonds.csv")

## 
## Read 21.7% of 598024 rows
## Read 45.1% of 598024 rows
## Read 66.9% of 598024 rows
## Read 73.6% of 598024 rows
## Read 85.3% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:08

# List the data.tables in memory with their row/column counts, size, columns
# and key.
tables()

##      NAME           NROW NCOL MB
## [1,] BigDiamonds 598,024   13 75
##      COLS                                                                
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
##      KEY
## [1,]    
## Total: 75MB

# Key BigDiamonds on its color column, then confirm the key through tables().
setkey(BigDiamonds, color)
tables()

##      NAME           NROW NCOL MB
## [1,] BigDiamonds 598,024   13 75
##      COLS                                                                
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
##      KEY  
## [1,] color
## Total: 75MB

# Print the table; for a table this large data.table shows only the first and
# last five rows.
BigDiamonds

##             V1 carat    cut color clarity table depth cert
##      1:      4  0.21 V.Good     D      I1    60  60.6  GIA
##      2:      8  0.22 V.Good     D      I1    61  59.2  GIA
##      3:     13  0.22 V.Good     D     SI2    57  59.7  GIA
##      4:     22  0.22   Good     D     VS2    61  63.7  GIA
##      5:     24  0.21   Good     D     SI2    62  64.4  IGI
##     ---                                                   
## 598020: 596914  8.00 V.Good     L    VVS2    59  61.6  IGI
## 598021: 597020  6.03 V.Good     L      IF    63  58.4  GIA
## 598022: 597649  7.05  Ideal     L     SI1    56  62.5  GIA
## 598023: 597674  7.17 V.Good     L     VS2    63  56.1  GIA
## 598024: 597720  7.04  Ideal     L     SI1    59  59.8  GIA
##                     measurements price     x     y     z
##      1:       3.80 x 3.82 x 2.31    NA  3.80  3.82  2.31
##      2:       3.95 x 3.97 x 2.34    NA  3.95  3.97  2.34
##      3:       3.94 x 3.93 x 2.35    NA  3.94  3.93  2.35
##      4:       3.77 x 3.73 x 2.39    NA  3.77  3.73  2.39
##      5:       3.73 x 3.78 x 2.42    NA  3.73  3.78  2.42
##     ---                                                 
## 598020:     12.72 x 12.63 x 7.84 86650 12.72 12.63  7.84
## 598021:     11.97 x 11.94 x 6.98 87950 11.97 11.94  6.98
## 598022:     7.69 x 12.25 x 12.35 95000  7.69 12.25 12.35
## 598023: 12.90  x  12.97  x  7.25 95253 12.90 12.97  7.25
## 598024:     7.51 x 12.49 x 12.62 95814  7.51 12.49 12.62

# Row 4, column 14 of boston; the value matches lstat in row 4 of the head()
# output.
boston[4,14]

## [1] 2.94

# boston is still a plain data.frame at this point.
class(boston)

## [1] "data.frame"

# Mean price and row count (.N) per cert for diamonds heavier than 3 carats.
# mean() has no `rm` argument: `rm=T` was silently absorbed by `...`, so
# missing prices were never dropped. The argument that removes NAs is na.rm.
# The recorded result for this subset contains no NA means, so it is unchanged.
BigDiamonds[carat > 3, .(mean(price, na.rm = TRUE), .N), cert]

##          cert       V1     N
## 1:        EGL 37473.03  3583
## 2:      OTHER 38444.25   363
## 3:    EGL USA 40325.90  1886
## 4: EGL ISRAEL 35920.33  1100
## 5:  EGL Intl. 35876.87  1060
## 6:        HRD 50919.86  1797
## 7:        IGI 45074.78  1660
## 8:        GIA 53947.51 14296
## 9:        AGS 47334.37   297

# Mean price and row count (.N) per cert over all rows.
# `rm=T` is not an argument of mean(); it was absorbed by `...`, so missing
# prices were kept and several groups came out as NA. na.rm = TRUE drops them.
# NOTE: the output recorded below predates this fix (hence its NA rows);
# re-run the script to refresh it.
BigDiamonds[, .(mean(price, na.rm = TRUE), .N), cert]

##          cert        V1      N
## 1:        GIA        NA 463555
## 2:        IGI        NA  43667
## 3:        EGL        NA  33814
## 4:  EGL Intl.  8964.944  11447
## 5:    EGL USA        NA  16079
## 6:      OTHER        NA   5267
## 7: EGL ISRAEL  9781.358  11301
## 8:        AGS 14041.455   2958
## 9:        HRD 16951.688   9936

# A data.table carries both the data.table and the data.frame class.
class(BigDiamonds)

## [1] "data.table" "data.frame"

# Convert boston to a data.table as well, then list the tables again.
boston <- data.table(boston)
tables()

##      NAME           NROW NCOL MB
## [1,] BigDiamonds 598,024   13 75
## [2,] boston          506   15  1
##      COLS                                                                
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## [2,] X,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
##      KEY  
## [1,] color
## [2,]      
## Total: 76MB

# Exercise: repeat the aggregation grouped by cut and by color,
# once for carat > 3 and once for all rows,
# reporting mean price and mean carat.


# Mean price (V1), mean carat (V2), and row count (N) per cut for stones above
# 3 carats. TRUE is spelled out because T is an ordinary, reassignable variable.
BigDiamonds[
  carat > 3,
  .(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = cut
]

##       cut       V1       V2     N
## 1:   Good 43480.94 3.675146  1642
## 2: V.Good 45957.94 3.616123  6946
## 3:  Ideal 49433.78 3.548771 17454

# Mean price (V1), mean carat (V2), and row count (N) per cut over all rows.
# TRUE is spelled out because T is an ordinary, reassignable variable.
BigDiamonds[
  ,
  .(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = cut
]

##       cut       V1        V2      N
## 1: V.Good 7430.527 1.0247597 168896
## 2:   Good 5254.792 0.9003031  59680
## 3:  Ideal 9919.277 1.1201936 369448

# Mean price (V1), mean carat (V2), and row count (N) per color for stones
# above 3 carats. TRUE replaces T, which can be reassigned.
BigDiamonds[
  carat > 3,
  .(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = color
]

##    color       V1       V2    N
## 1:     D 53688.65 3.523589  705
## 2:     E 56865.67 3.416525 1249
## 3:     F 54854.90 3.405824 2354
## 4:     G 53586.24 3.446023 3586
## 5:     H 51792.50 3.535999 4971
## 6:     I 47932.87 3.616689 4521
## 7:     J 43105.64 3.668633 4601
## 8:     K 38597.37 3.713372 3034
## 9:     L 32981.18 3.812008 1021

# Mean price (V1), mean carat (V2), and row count (N) per color over all rows.
# TRUE replaces T, which can be reassigned.
BigDiamonds[
  ,
  .(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = color
]

##    color       V1        V2     N
## 1:     D 8266.346 0.8266182 73630
## 2:     E 7282.990 0.8318824 93483
## 3:     F 8234.730 0.9410532 93573
## 4:     G 8984.200 1.0638408 96204
## 5:     H 9941.795 1.2099407 86619
## 6:     I 9541.319 1.2712823 70282
## 7:     J 9423.581 1.3475399 48709
## 8:     K 9694.257 1.4950646 25868
## 9:     L 7109.228 1.3632705  9656

# Mean price (V1), mean carat (V2), and row count (N) for every cut/color
# combination among stones above 3 carats. TRUE replaces the reassignable T.
BigDiamonds[
  carat > 3,
  .(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = .(cut, color)
]

##        cut color       V1       V2    N
##  1:   Good     D 46690.23 3.541250   48
##  2: V.Good     D 51962.22 3.517608  255
##  3:  Ideal     D 55619.40 3.525274  402
##  4: V.Good     E 54031.27 3.409032  372
##  5:   Good     E 46289.66 3.462113   71
##  6:  Ideal     E 59105.49 3.415968  806
##  7:  Ideal     F 57837.19 3.401535 1518
##  8:   Good     F 48276.57 3.513913  161
##  9: V.Good     F 49717.12 3.389689  675
## 10:  Ideal     G 55205.09 3.449066 2366
## 11:   Good     G 48977.63 3.442813  192
## 12: V.Good     G 50721.12 3.439621 1028
## 13:   Good     H 46664.41 3.581705  264
## 14:  Ideal     H 53656.42 3.524833 3569
## 15: V.Good     H 47136.52 3.560413 1138
## 16:   Good     I 40890.21 3.729776  313
## 17: V.Good     I 45864.52 3.632090 1062
## 18:  Ideal     I 49331.78 3.600238 3146
## 19: V.Good     J 42397.76 3.759270 1233
## 20:   Good     J 40872.16 3.794743  272
## 21:  Ideal     J 43583.78 3.621457 3096
## 22: V.Good     K 39664.82 3.845306  882
## 23:  Ideal     K 37794.88 3.640901 1953
## 24:   Good     K 41741.92 3.839849  199
## 25:  Ideal     L 31706.87 3.706706  598
## 26: V.Good     L 35093.76 3.962458  301
## 27:   Good     L 34015.24 3.956967  122
##        cut color       V1       V2    N

# Mean price (V1), mean carat (V2), and row count (N) for every cut/color
# combination over all rows. TRUE replaces the reassignable T.
BigDiamonds[
  ,
  .(mean(price, na.rm = TRUE), mean(carat, na.rm = TRUE), .N),
  by = .(cut, color)
]

##        cut color        V1        V2     N
##  1: V.Good     D  6443.311 0.7976032 21591
##  2:   Good     D  4656.791 0.7382723  6604
##  3:  Ideal     D  9654.869 0.8532475 45435
##  4:   Good     E  4185.086 0.7284208  9733
##  5:  Ideal     E  8341.317 0.8541993 55547
##  6: V.Good     E  6259.823 0.8236333 28203
##  7:   Good     F  5070.773 0.8157051  9141
##  8: V.Good     F  7003.693 0.9147793 26284
##  9:  Ideal     F  9285.867 0.9726345 58148
## 10:   Good     G  5661.293 0.9046262  8923
## 11: V.Good     G  8075.610 1.0461379 25214
## 12:  Ideal     G  9826.261 1.0939217 62067
## 13: V.Good     H  8114.545 1.1350359 22993
## 14:   Good     H  6152.773 1.0027921  7600
## 15:  Ideal     H 11203.491 1.2687815 56026
## 16:   Good     I  5540.319 1.0024743  7380
## 17: V.Good     I  8031.530 1.1803588 19902
## 18:  Ideal     I 10923.938 1.3595000 43000
## 19:   Good     J  5609.094 1.0590106  5357
## 20: V.Good     J  8394.493 1.2758884 13912
## 21:  Ideal     J 10599.596 1.4339008 29440
## 22: V.Good     K  8865.398 1.4151655  7672
## 23:   Good     K  5752.414 1.1153879  3467
## 24:  Ideal     K 11050.691 1.6260527 14729
## 25:   Good     L  5156.013 1.1368339  1475
## 26: V.Good     L  6542.255 1.2831232  3125
## 27:  Ideal     L  8027.597 1.4788667  5056
##        cut color        V1        V2     N

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep only the five columns used in the dplyr examples, then list the
# data.tables in memory to confirm the new object.
diamonds2 <- select(BigDiamonds, carat, price, cut, color, cert)
tables()

##      NAME           NROW NCOL MB
## [1,] BigDiamonds 598,024   13 75
## [2,] boston          506   15  1
## [3,] diamonds2   598,024    5 14
##      COLS                                                                
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## [2,] X,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
## [3,] carat,price,cut,color,cert                                          
##      KEY  
## [1,] color
## [2,]      
## [3,] color
## Total: 90MB

# dplyr version of the earlier aggregation: restrict to stones above 3 carats
# and group the result by color.
diamonds3 <- filter(diamonds2, carat > 3)
by_color <- group_by(diamonds3, color)

# Per-color row count, mean price, and mean carat.
summarise(
  by_color,
  count = n(),
  avg_price = mean(price, na.rm = TRUE),
  avg_size = mean(carat, na.rm = TRUE)
)

## Source: local data table [9 x 4]
## 
##   color count avg_price avg_size
## 1     D   705  53688.65 3.523589
## 2     E  1249  56865.67 3.416525
## 3     F  2354  54854.90 3.405824
## 4     G  3586  53586.24 3.446023
## 5     H  4971  51792.50 3.535999
## 6     I  4521  47932.87 3.616689
## 7     J  4601  43105.64 3.668633
## 8     K  3034  38597.37 3.713372
## 9     L  1021  32981.18 3.812008

# Add price per carat as a derived column, then print the result.
diamonds4 <- mutate(diamonds3, price_by_carat = price / carat)
diamonds4

##        carat price    cut color  cert price_by_carat
##     1:  4.72  2850   Good     D   EGL       603.8136
##     2:  3.01 10588   Good     D   EGL      3517.6080
##     3:  3.02 10883   Good     D   EGL      3603.6424
##     4:  3.01 11350 V.Good     D OTHER      3770.7641
##     5:  3.01 11811 V.Good     D   EGL      3923.9203
##    ---                                              
## 26038:  8.00 86650 V.Good     L   IGI     10831.2500
## 26039:  6.03 87950 V.Good     L   GIA     14585.4063
## 26040:  7.05 95000  Ideal     L   GIA     13475.1773
## 26041:  7.17 95253 V.Good     L   GIA     13284.9372
## 26042:  7.04 95814  Ideal     L   GIA     13609.9432

# Regroup by color, this time on the table that carries price_by_carat.
by_color <- group_by(diamonds4, color)

# Per-color row count and mean price per carat.
summarise(
  by_color,
  count = n(),
  avg_price_carat = mean(price_by_carat, na.rm = TRUE)
)

## Source: local data table [9 x 3]
## 
##   color count avg_price_carat
## 1     D   705       15810.288
## 2     E  1249       17328.455
## 3     F  2354       16474.422
## 4     G  3586       15873.049
## 5     H  4971       14733.579
## 6     I  4521       13205.101
## 7     J  4601       11555.528
## 8     K  3034       10155.675
## 9     L  1021        8378.305

# Store chas as a factor so it can be mapped to a discrete line colour.
boston2$factor_chas <- as.factor(boston2$chas)
library(ggplot2)

# Density of age, one curve per level of factor_chas.
ggplot() +
  geom_density(
    data = boston2,
    mapping = aes(x = age, y = ..density.., colour = factor_chas)
  )

# Density of medv, one curve per level of factor_chas.
ggplot() +
  geom_density(
    data = boston2,
    mapping = aes(x = medv, y = ..density.., colour = factor_chas)
  )

#qplot(medv, data=boston2, geom="density",fill=as.factor(chas),alpha=I(0.5))

# Try it on the Big Diamonds data set: plot price and carat across cut, color, and cert separately.

#qplot(price, data=diamonds4,     geom="density",   fill=as.factor(cut),  alpha=I(0.2))

# Print diamonds4 again as a reminder of its columns before plotting it.
diamonds4

##        carat price    cut color  cert price_by_carat
##     1:  4.72  2850   Good     D   EGL       603.8136
##     2:  3.01 10588   Good     D   EGL      3517.6080
##     3:  3.02 10883   Good     D   EGL      3603.6424
##     4:  3.01 11350 V.Good     D OTHER      3770.7641
##     5:  3.01 11811 V.Good     D   EGL      3923.9203
##    ---                                              
## 26038:  8.00 86650 V.Good     L   IGI     10831.2500
## 26039:  6.03 87950 V.Good     L   GIA     14585.4063
## 26040:  7.05 95000  Ideal     L   GIA     13475.1773
## 26041:  7.17 95253 V.Good     L   GIA     13284.9372
## 26042:  7.04 95814  Ideal     L   GIA     13609.9432

# Price density with one curve per cut, faceted into one column per color.
ggplot(diamonds4, aes(x = price, color = cut)) +
  geom_density() +
  facet_grid(. ~ color)

# The same densities, with the color facets stacked as rows instead.
ggplot(diamonds4, aes(x = price, color = cut)) +
  geom_density() +
  facet_grid(color ~ .)

# Price density with one curve per clarity, one facet row per cut.
# NOTE(review): this plots `diamonds`, not BigDiamonds/diamonds4 -- presumably
# the sample data set that ships with ggplot2; confirm that is intended.
ggplot(diamonds, aes(x = price, color = clarity)) +
  geom_density() +
  facet_grid(cut ~ .)

# List every data.table currently in memory (name, rows, columns, size, key).
tables()

##      NAME           NROW NCOL MB
## [1,] BigDiamonds 598,024   13 75
## [2,] boston          506   15  1
## [3,] by_color     26,042    6  1
## [4,] diamonds2   598,024    5 14
## [5,] diamonds3    26,042    5  1
## [6,] diamonds4    26,042    6  1
##      COLS                                                                
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## [2,] X,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
## [3,] carat,price,cut,color,cert,price_by_carat                           
## [4,] carat,price,cut,color,cert                                          
## [5,] carat,price,cut,color,cert                                          
## [6,] carat,price,cut,color,cert,price_by_carat                           
##      KEY  
## [1,] color
## [2,]      
## [3,] color
## [4,] color
## [5,] color
## [6,] color
## Total: 93MB

# Scatter plot of carat against price, coloured by clarity; facet_grid splits
# the panels by cut (the plotting analogue of a group-by).
ggplot(BigDiamonds, aes(x = price, y = carat, color = clarity)) +
  geom_point() +
  facet_grid(cut ~ .)

## Warning: Removed 265 rows containing missing values (geom_point).

## Warning: Removed 102 rows containing missing values (geom_point).

## Warning: Removed 346 rows containing missing values (geom_point).

# 15-minute quiz on the Big Diamonds data. Answer these questions:

# Which color is the most expensive?
# Which cut is the least expensive?
# Which clarity gives the best price per carat?
# Which cert gives the lowest price per carat?
# But answer them ONLY using ggplot, NOT using tables.

revisedR.R

dell

Sat Oct 17 20:01:31 2015