memory.size()
## [1] 30.81
memory.limit()
## [1] 8096
ls()
## character(0)
rm(x)
## Warning in rm(x): object 'x' not found
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 363605 19.5     592000 31.7   460000 24.6
## Vcells 557409  4.3    1023718  7.9   861327  6.6
memory.size()
## [1] 30.24
getwd()
## [1] "C:/Users/Dell/Desktop"
setwd("C:\\Users\\Dell\\Downloads")
dir()
##   [1] "140749_2017.pdf"                                                        
##   [2] "2011-F01-0700-Rev4-MDDS.XLSX"                                           
##   [3] "20150817143155.pdf"                                                     
##   [4] "20160111060911.pdf"                                                     
##   [5] "20170214052225.pdf"                                                     
##   [6] "7z1604-x64.exe"                                                         
##   [7] "7z1604.exe"                                                             
##   [8] "861415_10151432783238421_2124270505_o (1).jpg"                          
##   [9] "861415_10151432783238421_2124270505_o.jpg"                              
##  [10] "AirPassengers.csv"                                                      
##  [11] "ajayo.jpg"                                                              
##  [12] "Alison Python  Invoice   - Sheet1.pdf"                                  
##  [13] "Alison SAS  Invoice   - Sheet1.pdf"                                     
##  [14] "All+CSV+Files+in+a+Folder.ipynb"                                        
##  [15] "Allison Interview Jones Invoice   - Sheet1.pdf"                         
##  [16] "Anaconda3-4.2.0-Windows-x86_64.exe"                                     
##  [17] "anscombe+dataset.ipynb"                                                 
##  [18] "apachehttpd.exe"                                                        
##  [19] "April invoice adaptive analytics   - Sheet1.pdf"                        
##  [20] "Assignment14_BusinessAnalytics (1).docx"                                
##  [21] "Assignment14_BusinessAnalytics.docx"                                    
##  [22] "Assignment15_BusinessAnalytics.docx"                                    
##  [23] "Assignment16_BusinessAnalytics (1).docx"                                
##  [24] "Assignment16_BusinessAnalytics (2).docx"                                
##  [25] "Assignment16_BusinessAnalytics.docx"                                    
##  [26] "aug ust 2008.JPG"                                                       
##  [27] "avast_free_antivirus_setup_online.exe"                                  
##  [28] "avinash_ltv.zip"                                                        
##  [29] "BigDiamonds.csv"                                                        
##  [30] "BigDiamonds.csv (1).zip"                                                
##  [31] "BigDiamonds.csv (2)"                                                    
##  [32] "BigDiamonds.csv (2).zip"                                                
##  [33] "BigDiamonds.csv (3).zip"                                                
##  [34] "BigDiamonds.csv.zip"                                                    
##  [35] "Boston (1).csv"                                                         
##  [36] "Boston.csv"                                                             
##  [37] "CAM- Ajay Ohri (1).pdf"                                                 
##  [38] "CAM- Ajay Ohri.pdf"                                                     
##  [39] "camtasia.exe"                                                           
##  [40] "ccFraud.csv"                                                            
##  [41] "Certificate of Incorporation - U74999DL2015PTC282030 (26 June 2015).pdf"
##  [42] "CHAP1-6PythonforRUsersAnapproachforDataScience.docx"                    
##  [43] "chapter+3+_+spark.html"                                                 
##  [44] "chi+square+test.ipynb"                                                  
##  [45] "chromeinstall-8u111.exe"                                                
##  [46] "Cisco_WebEx_Add-On.exe"                                                 
##  [47] "class+exercise+data+viz.ipynb"                                          
##  [48] "class2.csv"                                                             
##  [49] "Collabera Invoice (1).pdf"                                              
##  [50] "Collabera Invoice.pdf"                                                  
##  [51] "Collectcent Invoice.pdf"                                                
##  [52] "college degrees.pdf"                                                    
##  [53] "DAP 1.pdf"                                                              
##  [54] "DAP 1.pptx"                                                             
##  [55] "DAP 6 RDBMS and SQL.pdf"                                                
##  [56] "DAP 6 RDBMS and SQL.pptx"                                               
##  [57] "Data Analysis (1).7z"                                                   
##  [58] "Data Analysis (1).rar"                                                  
##  [59] "Data Analysis (2).rar"                                                  
##  [60] "Data Analysis (3).rar"                                                  
##  [61] "Data Analysis.rar"                                                      
##  [62] "Data Viz.pptx"                                                          
##  [63] "data+exploration.ipynb"                                                 
##  [64] "data+manipulation.ipynb"                                                
##  [65] "data+munging+again.ipynb"                                               
##  [66] "data+viz.ipynb"                                                         
##  [67] "data+wrangling+titanic+dataset.ipynb"                                   
##  [68] "data1.csv"                                                              
##  [69] "datasets.csv"                                                           
##  [70] "Decision Trees.pdf"                                                     
##  [71] "DecisionStatsOfferLetter.docx"                                          
##  [72] "DecisionStatsRelievingLetter.docx"                                      
##  [73] "descriptive+stats+in+Python.ipynb"                                      
##  [74] "desktop.ini"                                                            
##  [75] "Diamond (1).csv"                                                        
##  [76] "Diamond (2).csv"                                                        
##  [77] "Diamond (3).csv"                                                        
##  [78] "Diamond (4).csv"                                                        
##  [79] "Diamond (5).csv"                                                        
##  [80] "Diamond (6).csv"                                                        
##  [81] "Diamond (7).csv"                                                        
##  [82] "Diamond (8).csv"                                                        
##  [83] "Diamond.csv"                                                            
##  [84] "DolbyVoiceClient.msi"                                                   
##  [85] "DropboxInstaller.exe"                                                   
##  [86] "edb_npgsql.exe"                                                         
##  [87] "edb_pgjdbc.exe"                                                         
##  [88] "edb_psqlodbc.exe"                                                       
##  [89] "edb_psqlodbc.exe-20170203172812"                                        
##  [90] "edb_psqlodbc.exe-20170307203617"                                        
##  [91] "final invoice edureka  - Sheet1.pdf"                                    
##  [92] "final_webinar (1).pdf"                                                  
##  [93] "final_webinar.pdf"                                                      
##  [94] "FinalPythonforRUsersAnapproachforDataScience (1).docx"                  
##  [95] "FinalPythonforRUsersAnapproachforDataScience (2).docx"                  
##  [96] "FinalPythonforRUsersAnapproachforDataScience (3).docx"                  
##  [97] "FinalPythonforRUsersAnapproachforDataScience (4).docx"                  
##  [98] "FinalPythonforRUsersAnapproachforDataScience.docx"                      
##  [99] "Git-2.11.0-64-bit.exe"                                                  
## [100] "Git-2.12.0-64-bit.exe"                                                  
## [101] "GitHubSetup (1).exe"                                                    
## [102] "GitHubSetup (2).exe"                                                    
## [103] "GitHubSetup.exe"                                                        
## [104] "GOMAUDIOGLOBALSETUP.EXE"                                                
## [105] "Hdma.csv"                                                               
## [106] "Hedonic.csv"                                                            
## [107] "HP Downloads"                                                           
## [108] "HPSupportSolutionsFramework-12.5.32.203.exe"                            
## [109] "image.png"                                                              
## [110] "IMS PROSCHOOL Workshop.pptx.pdf"                                        
## [111] "IMS PROSCHOOL Workshop.pptx.pptx"                                       
## [112] "internship.docx"                                                        
## [113] "Introduction to SAS (1).pdf"                                            
## [114] "Introduction to SAS Part 1 (1).pdf"                                     
## [115] "Introduction to SAS Part 1.pdf"                                         
## [116] "Introduction to SAS.pdf"                                                
## [117] "introductory+python.ipynb"                                              
## [118] "Invoice for Digital Vidya.pdf"                                          
## [119] "Invoice for Weekendr.pdf"                                               
## [120] "Invoice format - Ajay Ohri CONTATA (1).xls"                             
## [121] "Invoice format - Ajay Ohri CONTATA.xls"                                 
## [122] "invoice rapid miner.pdf"                                                
## [123] "Invoice trafla format.docx"                                             
## [124] "iris2 (1).ipynb"                                                        
## [125] "iris2 (2).ipynb"                                                        
## [126] "iris2.ipynb"                                                            
## [127] "January invoice Indicus  .pdf"                                          
## [128] "June AV   Invoice   - Sheet1.pdf"                                       
## [129] "Lecture 6 - KNN & Naive Bayes.ppt"                                      
## [130] "Local Disk (C) - Shortcut.lnk"                                          
## [131] "logistic regression - script for ppt.R"                                 
## [132] "logistic_regression_-_script_for_ppt.html"                              
## [133] "lyncentry.exe"                                                          
## [134] "Machine+Learning++Part+1 (1).ipynb"                                     
## [135] "Machine+Learning++Part+1.ipynb"                                         
## [136] "March invoice Indicus   - Sheet1.pdf"                                   
## [137] "matplotlib+cars.ipynb"                                                  
## [138] "matplotlib+line+graph.ipynb"                                            
## [139] "mongodb-win32-x86_64-2008plus-ssl-3.4.2-signed.msi"                     
## [140] "mongodb-win32-x86_64-3.4.2-signed.msi"                                  
## [141] "mortDefault"                                                            
## [142] "mortDefault.zip"                                                        
## [143] "mtcarslm.R"                                                             
## [144] "multiple+file+concat+in+pandas (1).ipynb"                               
## [145] "multiple+file+concat+in+pandas.ipynb"                                   
## [146] "my+first+class+in+python.ipynb"                                         
## [147] "nltk.ipynb"                                                             
## [148] "notebook-Copy1.html"                                                    
## [149] "Offer Letter - Ajay Ohri (1).pdf"                                       
## [150] "Offer Letter - Ajay Ohri.pdf"                                           
## [151] "Other Data Mining  Methods (1).pdf"                                     
## [152] "Other Data Mining  Methods.pdf"                                         
## [153] "output1 (1).xls"                                                        
## [154] "output1 (2).xls"                                                        
## [155] "output1.xls"                                                            
## [156] "pandas+11.ipynb"                                                        
## [157] "pandas+analysis+1.ipynb"                                                
## [158] "pandas+data+manipulation.ipynb"                                         
## [159] "passport image.pdf"                                                     
## [160] "Pawconinvoice2016.pdf"                                                  
## [161] "Pawconinvoice2017 (1).pdf"                                              
## [162] "Pawconinvoice2017 (2).pdf"                                              
## [163] "Pawconinvoice2017 (3).pdf"                                              
## [164] "Pawconinvoice2017.pdf"                                                  
## [165] "Payslip Feb 2016 - Sheet1.pdf"                                          
## [166] "Payslip Feb 2016.pdf"                                                   
## [167] "Payslip Format Decisionstats - Sheet1.pdf"                              
## [168] "Payslip Jan 2016 - Sheet1.pdf"                                          
## [169] "Payslip Jan 2016.pdf"                                                   
## [170] "Payslip March 2016 - Sheet1.pdf"                                        
## [171] "Payslip March 2016.pdf"                                                 
## [172] "pgd.csv"                                                                
## [173] "postgresql-9.6.1-1-windows-x64.exe"                                     
## [174] "Program 1-results.rtf"                                                  
## [175] "protein.csv"                                                            
## [176] "Python.docx"                                                            
## [177] "python+with+postgres (1).ipynb"                                         
## [178] "python+with+postgres.ipynb"                                             
## [179] "R-3.3.2-win.exe"                                                        
## [180] "R-3.3.3-win.exe"                                                        
## [181] "RCertificationExam.pdf"                                                 
## [182] "reg+model.ipynb"                                                        
## [183] "Revision -  Business Analytics (1).pdf"                                 
## [184] "Revision -  Business Analytics.pdf"                                     
## [185] "RidingMowers.csv"                                                       
## [186] "rsconnect"                                                              
## [187] "RStudio-1.0.136.exe"                                                    
## [188] "Salary Slip, Feb 2016.pdf"                                              
## [189] "Salary Slip, Jan 2016.pdf"                                              
## [190] "Salary Slip, March 2016 (1).pdf"                                        
## [191] "Salary Slip, March 2016 (2).pdf"                                        
## [192] "Salary Slip, March 2016.pdf"                                            
## [193] "sales-of-shampoo-over-a-three-ye.csv"                                   
## [194] "sas-university-edition-107140.pdf"                                      
## [195] "SAS part 2.pdf"                                                         
## [196] "SAS Part 3.pdf"                                                         
## [197] "Scan0095.pdf"                                                           
## [198] "Scanned Invoice for Collabera.pdf"                                      
## [199] "Screenshot 2017-01-23 12.36.55.png"                                     
## [200] "September invoice adaptive analytics   - Sheet1.pdf"                    
## [201] "simple+matplot+graph.ipynb"                                             
## [202] "Sollers January.pdf"                                                    
## [203] "sqlalchemy.ipynb"                                                       
## [204] "stackoverflow-dump-analysis.html"                                       
## [205] "Sunstone.pdf"                                                           
## [206] "Tableau.pdf"                                                            
## [207] "TableauPublicDesktop-64bit-10-1-3.exe"                                  
## [208] "TableauPublicDesktop-64bit-10-1-4.exe"                                  
## [209] "telecom.csv"                                                            
## [210] "TelecomServiceProviderCaseStudy.pdf"                                    
## [211] "test+web+scraping.ipynb"                                                
## [212] "Text Mining (1).pdf"                                                    
## [213] "Text Mining.pdf"                                                        
## [214] "third.sas7bdat"                                                         
## [215] "Time Series  Forecasting (1).pdf"                                       
## [216] "Time Series  Forecasting.pdf"                                           
## [217] "ts.html"                                                                
## [218] "ts.R"                                                                   
## [219] "Unconfirmed 373974.crdownload"                                          
## [220] "Unconfirmed 376991.crdownload"                                          
## [221] "Unconfirmed 950045.crdownload"                                          
## [222] "uTorrent.exe"                                                           
## [223] "VirtualBox-5.1.8-111374-Win (1).exe"                                    
## [224] "VirtualBox-5.1.8-111374-Win.exe"                                        
## [225] "visualcppbuildtools_full.exe"                                           
## [226] "Web+Scraping+Yelp+with+Beautiful+Soup.ipynb"                            
## [227] "Webinar for Business Analytics.pdf"                                     
## [228] "WhatsApp Image 2017-02-18 at 08.42.55 (1).jpeg"                         
## [229] "WhatsApp Image 2017-02-18 at 08.42.55.jpeg"
dir(pattern = 'csv')
##  [1] "AirPassengers.csv"                   
##  [2] "BigDiamonds.csv"                     
##  [3] "BigDiamonds.csv (1).zip"             
##  [4] "BigDiamonds.csv (2)"                 
##  [5] "BigDiamonds.csv (2).zip"             
##  [6] "BigDiamonds.csv (3).zip"             
##  [7] "BigDiamonds.csv.zip"                 
##  [8] "Boston (1).csv"                      
##  [9] "Boston.csv"                          
## [10] "ccFraud.csv"                         
## [11] "class2.csv"                          
## [12] "data1.csv"                           
## [13] "datasets.csv"                        
## [14] "Diamond (1).csv"                     
## [15] "Diamond (2).csv"                     
## [16] "Diamond (3).csv"                     
## [17] "Diamond (4).csv"                     
## [18] "Diamond (5).csv"                     
## [19] "Diamond (6).csv"                     
## [20] "Diamond (7).csv"                     
## [21] "Diamond (8).csv"                     
## [22] "Diamond.csv"                         
## [23] "Hdma.csv"                            
## [24] "Hedonic.csv"                         
## [25] "pgd.csv"                             
## [26] "protein.csv"                         
## [27] "RidingMowers.csv"                    
## [28] "sales-of-shampoo-over-a-three-ye.csv"
## [29] "telecom.csv"
#diamonds=read.csv("BigDiamonds.csv")

setwd("C:\\Users\\Dell\\Downloads\\BigDiamonds.csv")
dir()
## [1] "BigDiamonds.csv"
diamonds=read.csv("BigDiamonds.csv")
#install.packages("data.table")
library(data.table)
diamonds2=fread("BigDiamonds.csv")
## 
Read 18.4% of 598024 rows
Read 30.1% of 598024 rows
Read 41.8% of 598024 rows
Read 53.5% of 598024 rows
Read 65.2% of 598024 rows
Read 76.9% of 598024 rows
Read 88.6% of 598024 rows
Read 98.7% of 598024 rows
Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:10
str(diamonds)
## 'data.frame':    598024 obs. of  13 variables:
##  $ X           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ carat       : num  0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
##  $ cut         : Factor w/ 3 levels "Good","Ideal",..: 3 1 1 3 3 1 1 3 3 1 ...
##  $ color       : Factor w/ 9 levels "D","E","F","G",..: 8 4 7 1 8 4 4 1 8 3 ...
##  $ clarity     : Factor w/ 9 levels "I1","I2","IF",..: 1 1 2 1 1 5 5 1 5 4 ...
##  $ table       : num  59 61 58 60 59 60 63 61 57.5 65 ...
##  $ depth       : num  63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
##  $ cert        : Factor w/ 9 levels "AGS","EGL","EGL Intl.",..: 6 6 6 6 2 6 6 6 8 6 ...
##  $ measurements: Factor w/ 241453 levels "","  3.99  x   3.95  x   2.44",..: 19960 21917 48457 15701 37341 14661 14400 19642 17115 16177 ...
##  $ price       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ x           : num  3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
##  $ y           : num  3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
##  $ z           : num  2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
str(diamonds2)
## Classes 'data.table' and 'data.frame':   598024 obs. of  13 variables:
##  $ V1          : chr  "1" "2" "3" "4" ...
##  $ carat       : num  0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
##  $ cut         : chr  "V.Good" "Good" "Good" "V.Good" ...
##  $ color       : chr  "K" "G" "J" "D" ...
##  $ clarity     : chr  "I1" "I1" "I2" "I1" ...
##  $ table       : num  59 61 58 60 59 60 63 61 57.5 65 ...
##  $ depth       : num  63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
##  $ cert        : chr  "GIA" "GIA" "GIA" "GIA" ...
##  $ measurements: chr  "3.96 x 3.95 x 2.52" "4.00 x 4.05 x 2.30" "4.56 x 4.53 x 2.67" "3.80 x 3.82 x 2.31" ...
##  $ price       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ x           : num  3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
##  $ y           : num  3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
##  $ z           : num  2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
##  - attr(*, ".internal.selfref")=<externalptr>
tables()
##      NAME         NROW NCOL  MB
## [1,] diamonds2 598,024   13 101
##      COLS                                                                
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
##      KEY
## [1,]    
## Total: 101MB
setkey(diamonds2,color)





tables()
##      NAME         NROW NCOL  MB
## [1,] diamonds2 598,024   13 101
##      COLS                                                                
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
##      KEY  
## [1,] color
## Total: 101MB
head(diamonds)
##   X carat    cut color clarity table depth cert       measurements price
## 1 1  0.25 V.Good     K      I1    59  63.7  GIA 3.96 x 3.95 x 2.52    NA
## 2 2  0.23   Good     G      I1    61  58.1  GIA 4.00 x 4.05 x 2.30    NA
## 3 3  0.34   Good     J      I2    58  58.7  GIA 4.56 x 4.53 x 2.67    NA
## 4 4  0.21 V.Good     D      I1    60  60.6  GIA 3.80 x 3.82 x 2.31    NA
## 5 5  0.31 V.Good     K      I1    59  62.2  EGL 4.35 x 4.26 x 2.68    NA
## 6 6  0.20   Good     G     SI2    60  64.4  GIA 3.74 x 3.67 x 2.38    NA
##      x    y    z
## 1 3.96 3.95 2.52
## 2 4.00 4.05 2.30
## 3 4.56 4.53 2.67
## 4 3.80 3.82 2.31
## 5 4.35 4.26 2.68
## 6 3.74 3.67 2.38
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#diamonds3=select(price,color,diamonds)
#?select
diamonds3=select(diamonds,price,color,cut)
#diamonds4=diamonds[color=="J",]
diamonds4=diamonds3[diamonds3$color=="J",]
summary(diamonds4)
##      price           color           cut       
##  Min.   :  300   J      :48709   Good  : 5357  
##  1st Qu.: 1575   D      :    0   Ideal :29440  
##  Median : 4697   E      :    0   V.Good:13912  
##  Mean   : 9424   F      :    0                 
##  3rd Qu.:11352   G      :    0                 
##  Max.   :99806   H      :    0                 
##  NA's   :64      (Other):    0
mean(diamonds4$price)
## [1] NA
mean(diamonds4$price,na.rm=T)
## [1] 9423.581
dim(diamonds)
## [1] 598024     13
diamonds=na.omit(diamonds)
dim(diamonds)
## [1] 593784     13
is.numeric(24)
## [1] TRUE
#is.na(23,45,56,NA,NA)
#ajay=(23,45,56,NA,NA)
ajay = c(23, 45, 56,78,NA,NA)
is.na(ajay)
## [1] FALSE FALSE FALSE FALSE  TRUE  TRUE
table(is.na(ajay))
## 
## FALSE  TRUE 
##     4     2
table(is.na(diamonds4))
## 
##  FALSE   TRUE 
## 146063     64
head(diamonds4)
##    price color    cut
## 3     NA     J   Good
## 16    NA     J V.Good
## 35    NA     J V.Good
## 51    NA     J   Good
## 56    NA     J V.Good
## 57    NA     J   Good
M=mean(diamonds4$price,na.rm=T)
#diamonds4$price=ifelse(is.na,M,diamonds4$price)
#diamonds4$price=ifelse(is.na(),M,diamonds4$price)
diamonds4$price=ifelse(is.na(diamonds4$price),M,diamonds4$price)
head(diamonds4)
##       price color    cut
## 3  9423.581     J   Good
## 16 9423.581     J V.Good
## 35 9423.581     J V.Good
## 51 9423.581     J   Good
## 56 9423.581     J V.Good
## 57 9423.581     J   Good
M
## [1] 9423.581
diamonds4$price=ifelse(is.na(diamonds4$price),M,diamonds4$price)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked _by_ '.GlobalEnv':
## 
##     diamonds
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
attach(diamonds3)
summarize(price,cut,mean)
##      cut price
## 1   Good    NA
## 2  Ideal    NA
## 3 V.Good    NA
names(diamonds)
##  [1] "X"            "carat"        "cut"          "color"       
##  [5] "clarity"      "table"        "depth"        "cert"        
##  [9] "measurements" "price"        "x"            "y"           
## [13] "z"
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
sqldf("select avg(price) as Average_price,cut from diamonds3 group by cut")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
##   Average_price    cut
## 1      5254.792   Good
## 2      9919.277  Ideal
## 3      7430.527 V.Good
#summarize(diamonds$price,diamonds$cut,min)
summary(diamonds)
##        X              carat           cut             color      
##  Min.   :   494   Min.   :0.200   Good  : 59149   G      :95363  
##  1st Qu.:149638   1st Qu.:0.500   Ideal :367026   E      :92859  
##  Median :299312   Median :0.900   V.Good:167609   F      :92772  
##  Mean   :299221   Mean   :1.073                   H      :85951  
##  3rd Qu.:448775   3rd Qu.:1.500                   D      :73201  
##  Max.   :598024   Max.   :9.250                   I      :69879  
##                                                   (Other):83759  
##     clarity           table           depth               cert       
##  SI1    :115898   Min.   : 0.00   Min.   : 0.00   GIA       :460036  
##  VS2    :110402   1st Qu.:56.00   1st Qu.:61.00   IGI       : 43339  
##  SI2    :103671   Median :58.00   Median :62.00   EGL       : 33722  
##  VS1    : 97113   Mean   :57.66   Mean   :61.09   EGL USA   : 16019  
##  VVS2   : 65002   3rd Qu.:59.00   3rd Qu.:62.70   EGL Intl. : 11439  
##  VVS1   : 54284   Max.   :75.90   Max.   :81.30   EGL ISRAEL: 11285  
##  (Other): 47414                                   (Other)   : 17944  
##              measurements        price             x         
##  4.3 x 4.27 x 2.67 :    97   Min.   :  300   Min.   : 0.150  
##  4.31 x 4.29 x 2.68:    87   1st Qu.: 1218   1st Qu.: 4.740  
##  4.29 x 4.26 x 2.67:    86   Median : 3503   Median : 5.780  
##  4.3 x 4.28 x 2.67 :    84   Mean   : 8756   Mean   : 5.992  
##  4.3 x 4.28 x 2.68 :    83   3rd Qu.:11186   3rd Qu.: 6.970  
##  4.29 x 4.26 x 2.66:    80   Max.   :99990   Max.   :13.890  
##  (Other)           :593267                                   
##        y                z         
##  Min.   : 1.000   Min.   : 0.040  
##  1st Qu.: 4.970   1st Qu.: 3.120  
##  Median : 6.050   Median : 3.860  
##  Mean   : 6.201   Mean   : 4.036  
##  3rd Qu.: 7.230   3rd Qu.: 4.610  
##  Max.   :13.890   Max.   :13.180  
## 
diamonds$pricepercarat=diamonds$price/diamonds$carat
summary(diamonds)
##        X              carat           cut             color      
##  Min.   :   494   Min.   :0.200   Good  : 59149   G      :95363  
##  1st Qu.:149638   1st Qu.:0.500   Ideal :367026   E      :92859  
##  Median :299312   Median :0.900   V.Good:167609   F      :92772  
##  Mean   :299221   Mean   :1.073                   H      :85951  
##  3rd Qu.:448775   3rd Qu.:1.500                   D      :73201  
##  Max.   :598024   Max.   :9.250                   I      :69879  
##                                                   (Other):83759  
##     clarity           table           depth               cert       
##  SI1    :115898   Min.   : 0.00   Min.   : 0.00   GIA       :460036  
##  VS2    :110402   1st Qu.:56.00   1st Qu.:61.00   IGI       : 43339  
##  SI2    :103671   Median :58.00   Median :62.00   EGL       : 33722  
##  VS1    : 97113   Mean   :57.66   Mean   :61.09   EGL USA   : 16019  
##  VVS2   : 65002   3rd Qu.:59.00   3rd Qu.:62.70   EGL Intl. : 11439  
##  VVS1   : 54284   Max.   :75.90   Max.   :81.30   EGL ISRAEL: 11285  
##  (Other): 47414                                   (Other)   : 17944  
##              measurements        price             x         
##  4.3 x 4.27 x 2.67 :    97   Min.   :  300   Min.   : 0.150  
##  4.31 x 4.29 x 2.68:    87   1st Qu.: 1218   1st Qu.: 4.740  
##  4.29 x 4.26 x 2.67:    86   Median : 3503   Median : 5.780  
##  4.3 x 4.28 x 2.67 :    84   Mean   : 8756   Mean   : 5.992  
##  4.3 x 4.28 x 2.68 :    83   3rd Qu.:11186   3rd Qu.: 6.970  
##  4.29 x 4.26 x 2.66:    80   Max.   :99990   Max.   :13.890  
##  (Other)           :593267                                   
##        y                z          pricepercarat  
##  Min.   : 1.000   Min.   : 0.040   Min.   :  525  
##  1st Qu.: 4.970   1st Qu.: 3.120   1st Qu.: 2667  
##  Median : 6.050   Median : 3.860   Median : 4173  
##  Mean   : 6.201   Mean   : 4.036   Mean   : 5789  
##  3rd Qu.: 7.230   3rd Qu.: 4.610   3rd Qu.: 7437  
##  Max.   :13.890   Max.   :13.180   Max.   :49519  
## 
summarize(diamonds$pricepercarat,diamonds$color,max)
##   diamonds$color diamonds$pricepercarat
## 1              D               49519.40
## 2              E               40871.01
## 3              F               37084.06
## 4              G               32998.33
## 5              H               31718.95
## 6              I               24982.45
## 7              J               22890.12
## 8              K               19516.77
## 9              L               14585.41
summarize(diamonds$pricepercarat,diamonds$color,median)
##   diamonds$color diamonds$pricepercarat
## 1              D               4114.000
## 2              E               3836.000
## 3              F               4124.876
## 4              G               4314.286
## 5              H               4551.111
## 6              I               4514.706
## 7              J               4330.348
## 8              K               3851.258
## 9              L               2800.000
a=summarize(diamonds$pricepercarat,diamonds$clarity,min)
summarize(diamonds$pricepercarat,diamonds$color,min)
##   diamonds$color diamonds$pricepercarat
## 1              D               603.8136
## 2              E               604.7809
## 3              F               671.1111
## 4              G               654.2373
## 5              H               660.8696
## 6              I               525.0000
## 7              J               630.6122
## 8              K               637.6812
## 9              L               683.3333
boxplot(diamonds$pricepercarat~diamonds$clarity)

library(data.table)

diamonds3=data.table(diamonds)
diamonds3[,max(pricepercarat),color]
##    color       V1
## 1:     G 32998.33
## 2:     K 19516.77
## 3:     J 22890.12
## 4:     H 31718.95
## 5:     F 37084.06
## 6:     I 24982.45
## 7:     D 49519.40
## 8:     E 40871.01
## 9:     L 14585.41
diamonds3[,max(pricepercarat),cut]
##       cut       V1
## 1: V.Good 49519.40
## 2:   Good 43410.00
## 3:  Ideal 49481.59
diamonds3[,.(.N,mean(price),max(pricepercarat)),.(cut,color)]
##        cut color     N        V2       V3
##  1: V.Good     G 24990  8086.758 32670.72
##  2: V.Good     K  7580  8882.727 19516.77
##  3:   Good     J  5316  5614.735 19881.67
##  4:  Ideal     G 61569  9834.169 32998.33
##  5:   Good     H  7542  6149.439 24412.53
##  6:   Good     F  9042  5076.173 33099.67
##  7: V.Good     I 19761  8019.495 22702.73
##  8:  Ideal     I 42779 10922.554 24982.45
##  9: V.Good     D 21460  6430.541 49519.40
## 10: V.Good     H 22821  8116.492 28426.67
## 11:   Good     I  7339  5543.148 19340.26
## 12: V.Good     E 28016  6272.180 33876.97
## 13: V.Good     F 26027  7001.099 32670.86
## 14:  Ideal     H 55588 11195.620 31718.95
## 15:   Good     E  9623  4182.597 33196.01
## 16:   Good     G  8804  5660.256 29395.68
## 17:  Ideal     F 57703  9294.957 37084.06
## 18:  Ideal     K 14631 11063.548 18615.38
## 19: V.Good     J 13840  8384.011 19616.40
## 20:  Ideal     E 55220  8354.683 40871.01
## 21:   Good     D  6566  4656.203 43410.00
## 22:   Good     K  3449  5760.076 18257.46
## 23:  Ideal     L  5039  8024.461 14280.08
## 24: V.Good     L  3114  6550.520 14585.41
## 25:  Ideal     J 29322 10598.306 22890.12
## 26:  Ideal     D 45175  9669.691 49481.59
## 27:   Good     L  1468  5158.339 13622.38
##        cut color     N        V2       V3
library(sqldf)
head(diamonds3)
##      X carat    cut color clarity table depth cert       measurements
## 1: 494  0.24 V.Good     G     SI1  61.0  58.9  GIA 4.09 x 4.10 x 2.41
## 2: 495  0.31 V.Good     K     SI2  59.0  60.2  GIA 4.40 x 4.42 x 2.65
## 3: 496  0.26   Good     J     VS2  56.5  64.1  IGI 4.01 x 4.05 x 2.58
## 4: 497  0.24  Ideal     G     SI1  55.0  61.3  GIA 4.01 x 4.03 x 2.47
## 5: 498  0.30   Good     H      I1  57.0  62.2  GIA 4.21 x 4.24 x 2.63
## 6: 499  0.34   Good     F      I1  66.0  55.0  GIA 4.75 x 4.61 x 2.57
##    price    x    y    z pricepercarat
## 1:   300 4.09 4.10 2.41     1250.0000
## 2:   300 4.40 4.42 2.65      967.7419
## 3:   300 4.01 4.05 2.58     1153.8462
## 4:   300 4.01 4.03 2.47     1250.0000
## 5:   300 4.21 4.24 2.63     1000.0000
## 6:   300 4.75 4.61 2.57      882.3529
diamonds$X=NULL

sqldf("select max(pricepercarat),color
from
diamonds group by color")
##   max(pricepercarat) color
## 1           49519.40     D
## 2           40871.01     E
## 3           37084.06     F
## 4           32998.33     G
## 5           31718.95     H
## 6           24982.45     I
## 7           22890.12     J
## 8           19516.77     K
## 9           14585.41     L
library(nycflights13)
dim(flights)
## [1] 336776     19
head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
# 
# filter() (and slice())
# arrange()
# select() (and rename())
# distinct()
# mutate() (and transmute())
# summarise()
# sample_n() (and sample_frac())


filter(flights,month==1,day==1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
d=slice(flights,1:100)

arrange(d, year, month, day)
## # A tibble: 100 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 90 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
f=select(flights,year,month,day)

summary(f) 
##       year          month             day       
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00  
##  Median :2013   Median : 7.000   Median :16.00  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :2013   Max.   :12.000   Max.   :31.00
rename(f,The_Day=day)
## # A tibble: 336,776 × 3
##     year month The_Day
##    <int> <int>   <int>
## 1   2013     1       1
## 2   2013     1       1
## 3   2013     1       1
## 4   2013     1       1
## 5   2013     1       1
## 6   2013     1       1
## 7   2013     1       1
## 8   2013     1       1
## 9   2013     1       1
## 10  2013     1       1
## # ... with 336,766 more rows
distinct(flights,day)
## # A tibble: 31 × 1
##      day
##    <int>
## 1      1
## 2      2
## 3      3
## 4      4
## 5      5
## 6      6
## 7      7
## 8      8
## 9      9
## 10    10
## # ... with 21 more rows
names(flights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
distinct(flights,origin)
## # A tibble: 3 × 1
##   origin
##    <chr>
## 1    EWR
## 2    LGA
## 3    JFK
flights2=mutate(flights,
                speed = distance / air_time * 60)

summary(flights2$speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    76.8   358.1   404.2   394.3   438.8   703.4    9430
summarise(flights2,delay=mean(dep_delay,na.rm=T))
## # A tibble: 1 × 1
##      delay
##      <dbl>
## 1 12.63907
mean(flights2$dep_delay,na.rm = T)
## [1] 12.63907
sample_n(flights,10)
## # A tibble: 10 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     8     3     1828           1830        -2     2118
## 2   2013    11    17     1736           1512       144     1929
## 3   2013     8    16     2232           2059        93        1
## 4   2013    10    14      839            843        -4      942
## 5   2013     8     6     1806           1800         6     1940
## 6   2013    10     1     1756           1800        -4     1904
## 7   2013     7    11     1654           1700        -6     1815
## 8   2013     7    15      731            730         1     1015
## 9   2013     4    26     1157           1135        22     1413
## 10  2013     3    23     1649           1652        -3     1908
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
sample_frac(flights,0.001)
## # A tibble: 337 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     8    21     1250           1250         0     1424
## 2   2013    11    13     1715           1706         9     1819
## 3   2013     6    19     1312           1300        12     1421
## 4   2013     6    27      717            720        -3      907
## 5   2013     7     3     1831           1731        60     2015
## 6   2013     5    23     1828           1830        -2     2122
## 7   2013     1     4      928            919         9     1039
## 8   2013    10     2     1459           1455         4     1745
## 9   2013    12    26     2104           2035        29        6
## 10  2013    10     3     1352           1400        -8     1508
## # ... with 327 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
by_origin=group_by(flights,origin)

delay = summarise(by_origin,mean(arr_delay, na.rm = TRUE))
delay
## # A tibble: 3 × 2
##   origin `mean(arr_delay, na.rm = TRUE)`
##    <chr>                           <dbl>
## 1    EWR                        9.107055
## 2    JFK                        5.551481
## 3    LGA                        5.783488
#class exercise

names(flights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
test=select(flights,origin,dep_delay,arr_delay)

summary(test)
##     origin            dep_delay         arr_delay       
##  Length:336776      Min.   : -43.00   Min.   : -86.000  
##  Class :character   1st Qu.:  -5.00   1st Qu.: -17.000  
##  Mode  :character   Median :  -2.00   Median :  -5.000  
##                     Mean   :  12.64   Mean   :   6.895  
##                     3rd Qu.:  11.00   3rd Qu.:  14.000  
##                     Max.   :1301.00   Max.   :1272.000  
##                     NA's   :8255      NA's   :9430
table(test$origin)
## 
##    EWR    JFK    LGA 
## 120835 111279 104662
attach(test)
boxplot(arr_delay)

describe(arr_delay)
## arr_delay 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   327346     9430      577        1    6.895    39.23      -32      -26 
##      .25      .50      .75      .90      .95 
##      -17       -5       14       52       91 
## 
## lowest :  -86  -79  -75  -74  -73, highest:  989 1007 1109 1127 1272
test_arr=filter(test,arr_delay>0)
test_dep=filter(test,dep_delay>0)
names(test_arr)
## [1] "origin"    "dep_delay" "arr_delay"
gtest_arr=group_by(test_arr,origin)
gtest_dep=group_by(test_dep,origin)

summarise(gtest_arr,count=n(),
          delay=mean(arr_delay,na.rm = T),
          sum=sum(arr_delay,na.rm = T)/(24*60))
## # A tibble: 3 × 4
##   origin count    delay      sum
##    <chr> <int>    <dbl>    <dbl>
## 1    EWR 50099 41.75307 1452.630
## 2    JFK 42885 40.00993 1191.546
## 3    LGA 40020 38.93306 1082.015
summarise(gtest_dep,count=n(),
          delay=mean(dep_delay,na.rm = T),
          sum=sum(dep_delay,na.rm = T)/24*60)
## # A tibble: 3 × 4
##   origin count    delay     sum
##    <chr> <int>    <dbl>   <dbl>
## 1    EWR 52711 38.98792 5137730
## 2    JFK 42031 38.04677 3997860
## 3    LGA 33690 41.63096 3506368
glights=group_by(flights,origin)

summarise(glights,count=n())
## # A tibble: 3 × 2
##   origin  count
##    <chr>  <int>
## 1    EWR 120835
## 2    JFK 111279
## 3    LGA 104662