memory.size()
## [1] 30.81
memory.limit()
## [1] 8096
ls()
## character(0)
rm(x)
## Warning in rm(x): object 'x' not found
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 363605 19.5 592000 31.7 460000 24.6
## Vcells 557409 4.3 1023718 7.9 861327 6.6
memory.size()
## [1] 30.24
getwd()
## [1] "C:/Users/Dell/Desktop"
setwd("C:\\Users\\Dell\\Downloads")
dir()
## [1] "140749_2017.pdf"
## [2] "2011-F01-0700-Rev4-MDDS.XLSX"
## [3] "20150817143155.pdf"
## [4] "20160111060911.pdf"
## [5] "20170214052225.pdf"
## [6] "7z1604-x64.exe"
## [7] "7z1604.exe"
## [8] "861415_10151432783238421_2124270505_o (1).jpg"
## [9] "861415_10151432783238421_2124270505_o.jpg"
## [10] "AirPassengers.csv"
## [11] "ajayo.jpg"
## [12] "Alison Python Invoice - Sheet1.pdf"
## [13] "Alison SAS Invoice - Sheet1.pdf"
## [14] "All+CSV+Files+in+a+Folder.ipynb"
## [15] "Allison Interview Jones Invoice - Sheet1.pdf"
## [16] "Anaconda3-4.2.0-Windows-x86_64.exe"
## [17] "anscombe+dataset.ipynb"
## [18] "apachehttpd.exe"
## [19] "April invoice adaptive analytics - Sheet1.pdf"
## [20] "Assignment14_BusinessAnalytics (1).docx"
## [21] "Assignment14_BusinessAnalytics.docx"
## [22] "Assignment15_BusinessAnalytics.docx"
## [23] "Assignment16_BusinessAnalytics (1).docx"
## [24] "Assignment16_BusinessAnalytics (2).docx"
## [25] "Assignment16_BusinessAnalytics.docx"
## [26] "aug ust 2008.JPG"
## [27] "avast_free_antivirus_setup_online.exe"
## [28] "avinash_ltv.zip"
## [29] "BigDiamonds.csv"
## [30] "BigDiamonds.csv (1).zip"
## [31] "BigDiamonds.csv (2)"
## [32] "BigDiamonds.csv (2).zip"
## [33] "BigDiamonds.csv (3).zip"
## [34] "BigDiamonds.csv.zip"
## [35] "Boston (1).csv"
## [36] "Boston.csv"
## [37] "CAM- Ajay Ohri (1).pdf"
## [38] "CAM- Ajay Ohri.pdf"
## [39] "camtasia.exe"
## [40] "ccFraud.csv"
## [41] "Certificate of Incorporation - U74999DL2015PTC282030 (26 June 2015).pdf"
## [42] "CHAP1-6PythonforRUsersAnapproachforDataScience.docx"
## [43] "chapter+3+_+spark.html"
## [44] "chi+square+test.ipynb"
## [45] "chromeinstall-8u111.exe"
## [46] "Cisco_WebEx_Add-On.exe"
## [47] "class+exercise+data+viz.ipynb"
## [48] "class2.csv"
## [49] "Collabera Invoice (1).pdf"
## [50] "Collabera Invoice.pdf"
## [51] "Collectcent Invoice.pdf"
## [52] "college degrees.pdf"
## [53] "DAP 1.pdf"
## [54] "DAP 1.pptx"
## [55] "DAP 6 RDBMS and SQL.pdf"
## [56] "DAP 6 RDBMS and SQL.pptx"
## [57] "Data Analysis (1).7z"
## [58] "Data Analysis (1).rar"
## [59] "Data Analysis (2).rar"
## [60] "Data Analysis (3).rar"
## [61] "Data Analysis.rar"
## [62] "Data Viz.pptx"
## [63] "data+exploration.ipynb"
## [64] "data+manipulation.ipynb"
## [65] "data+munging+again.ipynb"
## [66] "data+viz.ipynb"
## [67] "data+wrangling+titanic+dataset.ipynb"
## [68] "data1.csv"
## [69] "datasets.csv"
## [70] "Decision Trees.pdf"
## [71] "DecisionStatsOfferLetter.docx"
## [72] "DecisionStatsRelievingLetter.docx"
## [73] "descriptive+stats+in+Python.ipynb"
## [74] "desktop.ini"
## [75] "Diamond (1).csv"
## [76] "Diamond (2).csv"
## [77] "Diamond (3).csv"
## [78] "Diamond (4).csv"
## [79] "Diamond (5).csv"
## [80] "Diamond (6).csv"
## [81] "Diamond (7).csv"
## [82] "Diamond (8).csv"
## [83] "Diamond.csv"
## [84] "DolbyVoiceClient.msi"
## [85] "DropboxInstaller.exe"
## [86] "edb_npgsql.exe"
## [87] "edb_pgjdbc.exe"
## [88] "edb_psqlodbc.exe"
## [89] "edb_psqlodbc.exe-20170203172812"
## [90] "edb_psqlodbc.exe-20170307203617"
## [91] "final invoice edureka - Sheet1.pdf"
## [92] "final_webinar (1).pdf"
## [93] "final_webinar.pdf"
## [94] "FinalPythonforRUsersAnapproachforDataScience (1).docx"
## [95] "FinalPythonforRUsersAnapproachforDataScience (2).docx"
## [96] "FinalPythonforRUsersAnapproachforDataScience (3).docx"
## [97] "FinalPythonforRUsersAnapproachforDataScience (4).docx"
## [98] "FinalPythonforRUsersAnapproachforDataScience.docx"
## [99] "Git-2.11.0-64-bit.exe"
## [100] "Git-2.12.0-64-bit.exe"
## [101] "GitHubSetup (1).exe"
## [102] "GitHubSetup (2).exe"
## [103] "GitHubSetup.exe"
## [104] "GOMAUDIOGLOBALSETUP.EXE"
## [105] "Hdma.csv"
## [106] "Hedonic.csv"
## [107] "HP Downloads"
## [108] "HPSupportSolutionsFramework-12.5.32.203.exe"
## [109] "image.png"
## [110] "IMS PROSCHOOL Workshop.pptx.pdf"
## [111] "IMS PROSCHOOL Workshop.pptx.pptx"
## [112] "internship.docx"
## [113] "Introduction to SAS (1).pdf"
## [114] "Introduction to SAS Part 1 (1).pdf"
## [115] "Introduction to SAS Part 1.pdf"
## [116] "Introduction to SAS.pdf"
## [117] "introductory+python.ipynb"
## [118] "Invoice for Digital Vidya.pdf"
## [119] "Invoice for Weekendr.pdf"
## [120] "Invoice format - Ajay Ohri CONTATA (1).xls"
## [121] "Invoice format - Ajay Ohri CONTATA.xls"
## [122] "invoice rapid miner.pdf"
## [123] "Invoice trafla format.docx"
## [124] "iris2 (1).ipynb"
## [125] "iris2 (2).ipynb"
## [126] "iris2.ipynb"
## [127] "January invoice Indicus .pdf"
## [128] "June AV Invoice - Sheet1.pdf"
## [129] "Lecture 6 - KNN & Naive Bayes.ppt"
## [130] "Local Disk (C) - Shortcut.lnk"
## [131] "logistic regression - script for ppt.R"
## [132] "logistic_regression_-_script_for_ppt.html"
## [133] "lyncentry.exe"
## [134] "Machine+Learning++Part+1 (1).ipynb"
## [135] "Machine+Learning++Part+1.ipynb"
## [136] "March invoice Indicus - Sheet1.pdf"
## [137] "matplotlib+cars.ipynb"
## [138] "matplotlib+line+graph.ipynb"
## [139] "mongodb-win32-x86_64-2008plus-ssl-3.4.2-signed.msi"
## [140] "mongodb-win32-x86_64-3.4.2-signed.msi"
## [141] "mortDefault"
## [142] "mortDefault.zip"
## [143] "mtcarslm.R"
## [144] "multiple+file+concat+in+pandas (1).ipynb"
## [145] "multiple+file+concat+in+pandas.ipynb"
## [146] "my+first+class+in+python.ipynb"
## [147] "nltk.ipynb"
## [148] "notebook-Copy1.html"
## [149] "Offer Letter - Ajay Ohri (1).pdf"
## [150] "Offer Letter - Ajay Ohri.pdf"
## [151] "Other Data Mining Methods (1).pdf"
## [152] "Other Data Mining Methods.pdf"
## [153] "output1 (1).xls"
## [154] "output1 (2).xls"
## [155] "output1.xls"
## [156] "pandas+11.ipynb"
## [157] "pandas+analysis+1.ipynb"
## [158] "pandas+data+manipulation.ipynb"
## [159] "passport image.pdf"
## [160] "Pawconinvoice2016.pdf"
## [161] "Pawconinvoice2017 (1).pdf"
## [162] "Pawconinvoice2017 (2).pdf"
## [163] "Pawconinvoice2017 (3).pdf"
## [164] "Pawconinvoice2017.pdf"
## [165] "Payslip Feb 2016 - Sheet1.pdf"
## [166] "Payslip Feb 2016.pdf"
## [167] "Payslip Format Decisionstats - Sheet1.pdf"
## [168] "Payslip Jan 2016 - Sheet1.pdf"
## [169] "Payslip Jan 2016.pdf"
## [170] "Payslip March 2016 - Sheet1.pdf"
## [171] "Payslip March 2016.pdf"
## [172] "pgd.csv"
## [173] "postgresql-9.6.1-1-windows-x64.exe"
## [174] "Program 1-results.rtf"
## [175] "protein.csv"
## [176] "Python.docx"
## [177] "python+with+postgres (1).ipynb"
## [178] "python+with+postgres.ipynb"
## [179] "R-3.3.2-win.exe"
## [180] "R-3.3.3-win.exe"
## [181] "RCertificationExam.pdf"
## [182] "reg+model.ipynb"
## [183] "Revision - Business Analytics (1).pdf"
## [184] "Revision - Business Analytics.pdf"
## [185] "RidingMowers.csv"
## [186] "rsconnect"
## [187] "RStudio-1.0.136.exe"
## [188] "Salary Slip, Feb 2016.pdf"
## [189] "Salary Slip, Jan 2016.pdf"
## [190] "Salary Slip, March 2016 (1).pdf"
## [191] "Salary Slip, March 2016 (2).pdf"
## [192] "Salary Slip, March 2016.pdf"
## [193] "sales-of-shampoo-over-a-three-ye.csv"
## [194] "sas-university-edition-107140.pdf"
## [195] "SAS part 2.pdf"
## [196] "SAS Part 3.pdf"
## [197] "Scan0095.pdf"
## [198] "Scanned Invoice for Collabera.pdf"
## [199] "Screenshot 2017-01-23 12.36.55.png"
## [200] "September invoice adaptive analytics - Sheet1.pdf"
## [201] "simple+matplot+graph.ipynb"
## [202] "Sollers January.pdf"
## [203] "sqlalchemy.ipynb"
## [204] "stackoverflow-dump-analysis.html"
## [205] "Sunstone.pdf"
## [206] "Tableau.pdf"
## [207] "TableauPublicDesktop-64bit-10-1-3.exe"
## [208] "TableauPublicDesktop-64bit-10-1-4.exe"
## [209] "telecom.csv"
## [210] "TelecomServiceProviderCaseStudy.pdf"
## [211] "test+web+scraping.ipynb"
## [212] "Text Mining (1).pdf"
## [213] "Text Mining.pdf"
## [214] "third.sas7bdat"
## [215] "Time Series Forecasting (1).pdf"
## [216] "Time Series Forecasting.pdf"
## [217] "ts.html"
## [218] "ts.R"
## [219] "Unconfirmed 373974.crdownload"
## [220] "Unconfirmed 376991.crdownload"
## [221] "Unconfirmed 950045.crdownload"
## [222] "uTorrent.exe"
## [223] "VirtualBox-5.1.8-111374-Win (1).exe"
## [224] "VirtualBox-5.1.8-111374-Win.exe"
## [225] "visualcppbuildtools_full.exe"
## [226] "Web+Scraping+Yelp+with+Beautiful+Soup.ipynb"
## [227] "Webinar for Business Analytics.pdf"
## [228] "WhatsApp Image 2017-02-18 at 08.42.55 (1).jpeg"
## [229] "WhatsApp Image 2017-02-18 at 08.42.55.jpeg"
dir(pattern = 'csv')
## [1] "AirPassengers.csv"
## [2] "BigDiamonds.csv"
## [3] "BigDiamonds.csv (1).zip"
## [4] "BigDiamonds.csv (2)"
## [5] "BigDiamonds.csv (2).zip"
## [6] "BigDiamonds.csv (3).zip"
## [7] "BigDiamonds.csv.zip"
## [8] "Boston (1).csv"
## [9] "Boston.csv"
## [10] "ccFraud.csv"
## [11] "class2.csv"
## [12] "data1.csv"
## [13] "datasets.csv"
## [14] "Diamond (1).csv"
## [15] "Diamond (2).csv"
## [16] "Diamond (3).csv"
## [17] "Diamond (4).csv"
## [18] "Diamond (5).csv"
## [19] "Diamond (6).csv"
## [20] "Diamond (7).csv"
## [21] "Diamond (8).csv"
## [22] "Diamond.csv"
## [23] "Hdma.csv"
## [24] "Hedonic.csv"
## [25] "pgd.csv"
## [26] "protein.csv"
## [27] "RidingMowers.csv"
## [28] "sales-of-shampoo-over-a-three-ye.csv"
## [29] "telecom.csv"
#diamonds=read.csv("BigDiamonds.csv")
setwd("C:\\Users\\Dell\\Downloads\\BigDiamonds.csv")
dir()
## [1] "BigDiamonds.csv"
diamonds=read.csv("BigDiamonds.csv")
#install.packages("data.table")
library(data.table)
diamonds2=fread("BigDiamonds.csv")
##
Read 18.4% of 598024 rows
Read 30.1% of 598024 rows
Read 41.8% of 598024 rows
Read 53.5% of 598024 rows
Read 65.2% of 598024 rows
Read 76.9% of 598024 rows
Read 88.6% of 598024 rows
Read 98.7% of 598024 rows
Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:10
str(diamonds)
## 'data.frame': 598024 obs. of 13 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ carat : num 0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
## $ cut : Factor w/ 3 levels "Good","Ideal",..: 3 1 1 3 3 1 1 3 3 1 ...
## $ color : Factor w/ 9 levels "D","E","F","G",..: 8 4 7 1 8 4 4 1 8 3 ...
## $ clarity : Factor w/ 9 levels "I1","I2","IF",..: 1 1 2 1 1 5 5 1 5 4 ...
## $ table : num 59 61 58 60 59 60 63 61 57.5 65 ...
## $ depth : num 63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
## $ cert : Factor w/ 9 levels "AGS","EGL","EGL Intl.",..: 6 6 6 6 2 6 6 6 8 6 ...
## $ measurements: Factor w/ 241453 levels ""," 3.99 x 3.95 x 2.44",..: 19960 21917 48457 15701 37341 14661 14400 19642 17115 16177 ...
## $ price : int NA NA NA NA NA NA NA NA NA NA ...
## $ x : num 3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
## $ y : num 3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
## $ z : num 2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
str(diamonds2)
## Classes 'data.table' and 'data.frame': 598024 obs. of 13 variables:
## $ V1 : chr "1" "2" "3" "4" ...
## $ carat : num 0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
## $ cut : chr "V.Good" "Good" "Good" "V.Good" ...
## $ color : chr "K" "G" "J" "D" ...
## $ clarity : chr "I1" "I1" "I2" "I1" ...
## $ table : num 59 61 58 60 59 60 63 61 57.5 65 ...
## $ depth : num 63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
## $ cert : chr "GIA" "GIA" "GIA" "GIA" ...
## $ measurements: chr "3.96 x 3.95 x 2.52" "4.00 x 4.05 x 2.30" "4.56 x 4.53 x 2.67" "3.80 x 3.82 x 2.31" ...
## $ price : int NA NA NA NA NA NA NA NA NA NA ...
## $ x : num 3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
## $ y : num 3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
## $ z : num 2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
## - attr(*, ".internal.selfref")=<externalptr>
tables()
## NAME NROW NCOL MB
## [1,] diamonds2 598,024 13 101
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## KEY
## [1,]
## Total: 101MB
setkey(diamonds2,color)
tables()
## NAME NROW NCOL MB
## [1,] diamonds2 598,024 13 101
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## KEY
## [1,] color
## Total: 101MB
head(diamonds)
## X carat cut color clarity table depth cert measurements price
## 1 1 0.25 V.Good K I1 59 63.7 GIA 3.96 x 3.95 x 2.52 NA
## 2 2 0.23 Good G I1 61 58.1 GIA 4.00 x 4.05 x 2.30 NA
## 3 3 0.34 Good J I2 58 58.7 GIA 4.56 x 4.53 x 2.67 NA
## 4 4 0.21 V.Good D I1 60 60.6 GIA 3.80 x 3.82 x 2.31 NA
## 5 5 0.31 V.Good K I1 59 62.2 EGL 4.35 x 4.26 x 2.68 NA
## 6 6 0.20 Good G SI2 60 64.4 GIA 3.74 x 3.67 x 2.38 NA
## x y z
## 1 3.96 3.95 2.52
## 2 4.00 4.05 2.30
## 3 4.56 4.53 2.67
## 4 3.80 3.82 2.31
## 5 4.35 4.26 2.68
## 6 3.74 3.67 2.38
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#diamonds3=select(price,color,diamonds)
#?select
diamonds3=select(diamonds,price,color,cut)
#diamonds4=diamonds[color=="J",]
diamonds4=diamonds3[diamonds3$color=="J",]
summary(diamonds4)
## price color cut
## Min. : 300 J :48709 Good : 5357
## 1st Qu.: 1575 D : 0 Ideal :29440
## Median : 4697 E : 0 V.Good:13912
## Mean : 9424 F : 0
## 3rd Qu.:11352 G : 0
## Max. :99806 H : 0
## NA's :64 (Other): 0
mean(diamonds4$price)
## [1] NA
mean(diamonds4$price,na.rm=T)
## [1] 9423.581
dim(diamonds)
## [1] 598024 13
diamonds=na.omit(diamonds)
dim(diamonds)
## [1] 593784 13
is.numeric(24)
## [1] TRUE
#is.na(23,45,56,NA,NA)
#ajay=(23,45,56,NA,NA)
ajay = c(23, 45, 56,78,NA,NA)
is.na(ajay)
## [1] FALSE FALSE FALSE FALSE TRUE TRUE
table(is.na(ajay))
##
## FALSE TRUE
## 4 2
table(is.na(diamonds4))
##
## FALSE TRUE
## 146063 64
head(diamonds4)
## price color cut
## 3 NA J Good
## 16 NA J V.Good
## 35 NA J V.Good
## 51 NA J Good
## 56 NA J V.Good
## 57 NA J Good
M=mean(diamonds4$price,na.rm=T)
#diamonds4$price=ifelse(is.na,M,diamonds4$price)
#diamonds4$price=ifelse(is.na(),M,diamonds4$price)
diamonds4$price=ifelse(is.na(diamonds4$price),M,diamonds4$price)
head(diamonds4)
## price color cut
## 3 9423.581 J Good
## 16 9423.581 J V.Good
## 35 9423.581 J V.Good
## 51 9423.581 J Good
## 56 9423.581 J V.Good
## 57 9423.581 J Good
M
## [1] 9423.581
diamonds4$price=ifelse(is.na(diamonds4$price),M,diamonds4$price)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked _by_ '.GlobalEnv':
##
## diamonds
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
attach(diamonds3)
summarize(price,cut,mean)
## cut price
## 1 Good NA
## 2 Ideal NA
## 3 V.Good NA
names(diamonds)
## [1] "X" "carat" "cut" "color"
## [5] "clarity" "table" "depth" "cert"
## [9] "measurements" "price" "x" "y"
## [13] "z"
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
sqldf("select avg(price) as Average_price,cut from diamonds3 group by cut")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
## Average_price cut
## 1 5254.792 Good
## 2 9919.277 Ideal
## 3 7430.527 V.Good
#summarize(diamonds$price,diamonds$cut,min)
summary(diamonds)
## X carat cut color
## Min. : 494 Min. :0.200 Good : 59149 G :95363
## 1st Qu.:149638 1st Qu.:0.500 Ideal :367026 E :92859
## Median :299312 Median :0.900 V.Good:167609 F :92772
## Mean :299221 Mean :1.073 H :85951
## 3rd Qu.:448775 3rd Qu.:1.500 D :73201
## Max. :598024 Max. :9.250 I :69879
## (Other):83759
## clarity table depth cert
## SI1 :115898 Min. : 0.00 Min. : 0.00 GIA :460036
## VS2 :110402 1st Qu.:56.00 1st Qu.:61.00 IGI : 43339
## SI2 :103671 Median :58.00 Median :62.00 EGL : 33722
## VS1 : 97113 Mean :57.66 Mean :61.09 EGL USA : 16019
## VVS2 : 65002 3rd Qu.:59.00 3rd Qu.:62.70 EGL Intl. : 11439
## VVS1 : 54284 Max. :75.90 Max. :81.30 EGL ISRAEL: 11285
## (Other): 47414 (Other) : 17944
## measurements price x
## 4.3 x 4.27 x 2.67 : 97 Min. : 300 Min. : 0.150
## 4.31 x 4.29 x 2.68: 87 1st Qu.: 1218 1st Qu.: 4.740
## 4.29 x 4.26 x 2.67: 86 Median : 3503 Median : 5.780
## 4.3 x 4.28 x 2.67 : 84 Mean : 8756 Mean : 5.992
## 4.3 x 4.28 x 2.68 : 83 3rd Qu.:11186 3rd Qu.: 6.970
## 4.29 x 4.26 x 2.66: 80 Max. :99990 Max. :13.890
## (Other) :593267
## y z
## Min. : 1.000 Min. : 0.040
## 1st Qu.: 4.970 1st Qu.: 3.120
## Median : 6.050 Median : 3.860
## Mean : 6.201 Mean : 4.036
## 3rd Qu.: 7.230 3rd Qu.: 4.610
## Max. :13.890 Max. :13.180
##
diamonds$pricepercarat=diamonds$price/diamonds$carat
summary(diamonds)
## X carat cut color
## Min. : 494 Min. :0.200 Good : 59149 G :95363
## 1st Qu.:149638 1st Qu.:0.500 Ideal :367026 E :92859
## Median :299312 Median :0.900 V.Good:167609 F :92772
## Mean :299221 Mean :1.073 H :85951
## 3rd Qu.:448775 3rd Qu.:1.500 D :73201
## Max. :598024 Max. :9.250 I :69879
## (Other):83759
## clarity table depth cert
## SI1 :115898 Min. : 0.00 Min. : 0.00 GIA :460036
## VS2 :110402 1st Qu.:56.00 1st Qu.:61.00 IGI : 43339
## SI2 :103671 Median :58.00 Median :62.00 EGL : 33722
## VS1 : 97113 Mean :57.66 Mean :61.09 EGL USA : 16019
## VVS2 : 65002 3rd Qu.:59.00 3rd Qu.:62.70 EGL Intl. : 11439
## VVS1 : 54284 Max. :75.90 Max. :81.30 EGL ISRAEL: 11285
## (Other): 47414 (Other) : 17944
## measurements price x
## 4.3 x 4.27 x 2.67 : 97 Min. : 300 Min. : 0.150
## 4.31 x 4.29 x 2.68: 87 1st Qu.: 1218 1st Qu.: 4.740
## 4.29 x 4.26 x 2.67: 86 Median : 3503 Median : 5.780
## 4.3 x 4.28 x 2.67 : 84 Mean : 8756 Mean : 5.992
## 4.3 x 4.28 x 2.68 : 83 3rd Qu.:11186 3rd Qu.: 6.970
## 4.29 x 4.26 x 2.66: 80 Max. :99990 Max. :13.890
## (Other) :593267
## y z pricepercarat
## Min. : 1.000 Min. : 0.040 Min. : 525
## 1st Qu.: 4.970 1st Qu.: 3.120 1st Qu.: 2667
## Median : 6.050 Median : 3.860 Median : 4173
## Mean : 6.201 Mean : 4.036 Mean : 5789
## 3rd Qu.: 7.230 3rd Qu.: 4.610 3rd Qu.: 7437
## Max. :13.890 Max. :13.180 Max. :49519
##
summarize(diamonds$pricepercarat,diamonds$color,max)
## diamonds$color diamonds$pricepercarat
## 1 D 49519.40
## 2 E 40871.01
## 3 F 37084.06
## 4 G 32998.33
## 5 H 31718.95
## 6 I 24982.45
## 7 J 22890.12
## 8 K 19516.77
## 9 L 14585.41
summarize(diamonds$pricepercarat,diamonds$color,median)
## diamonds$color diamonds$pricepercarat
## 1 D 4114.000
## 2 E 3836.000
## 3 F 4124.876
## 4 G 4314.286
## 5 H 4551.111
## 6 I 4514.706
## 7 J 4330.348
## 8 K 3851.258
## 9 L 2800.000
a=summarize(diamonds$pricepercarat,diamonds$clarity,min)
summarize(diamonds$pricepercarat,diamonds$color,min)
## diamonds$color diamonds$pricepercarat
## 1 D 603.8136
## 2 E 604.7809
## 3 F 671.1111
## 4 G 654.2373
## 5 H 660.8696
## 6 I 525.0000
## 7 J 630.6122
## 8 K 637.6812
## 9 L 683.3333
boxplot(diamonds$pricepercarat~diamonds$clarity)

library(data.table)
diamonds3=data.table(diamonds)
diamonds3[,max(pricepercarat),color]
## color V1
## 1: G 32998.33
## 2: K 19516.77
## 3: J 22890.12
## 4: H 31718.95
## 5: F 37084.06
## 6: I 24982.45
## 7: D 49519.40
## 8: E 40871.01
## 9: L 14585.41
diamonds3[,max(pricepercarat),cut]
## cut V1
## 1: V.Good 49519.40
## 2: Good 43410.00
## 3: Ideal 49481.59
diamonds3[,.(.N,mean(price),max(pricepercarat)),.(cut,color)]
## cut color N V2 V3
## 1: V.Good G 24990 8086.758 32670.72
## 2: V.Good K 7580 8882.727 19516.77
## 3: Good J 5316 5614.735 19881.67
## 4: Ideal G 61569 9834.169 32998.33
## 5: Good H 7542 6149.439 24412.53
## 6: Good F 9042 5076.173 33099.67
## 7: V.Good I 19761 8019.495 22702.73
## 8: Ideal I 42779 10922.554 24982.45
## 9: V.Good D 21460 6430.541 49519.40
## 10: V.Good H 22821 8116.492 28426.67
## 11: Good I 7339 5543.148 19340.26
## 12: V.Good E 28016 6272.180 33876.97
## 13: V.Good F 26027 7001.099 32670.86
## 14: Ideal H 55588 11195.620 31718.95
## 15: Good E 9623 4182.597 33196.01
## 16: Good G 8804 5660.256 29395.68
## 17: Ideal F 57703 9294.957 37084.06
## 18: Ideal K 14631 11063.548 18615.38
## 19: V.Good J 13840 8384.011 19616.40
## 20: Ideal E 55220 8354.683 40871.01
## 21: Good D 6566 4656.203 43410.00
## 22: Good K 3449 5760.076 18257.46
## 23: Ideal L 5039 8024.461 14280.08
## 24: V.Good L 3114 6550.520 14585.41
## 25: Ideal J 29322 10598.306 22890.12
## 26: Ideal D 45175 9669.691 49481.59
## 27: Good L 1468 5158.339 13622.38
## cut color N V2 V3
library(sqldf)
head(diamonds3)
## X carat cut color clarity table depth cert measurements
## 1: 494 0.24 V.Good G SI1 61.0 58.9 GIA 4.09 x 4.10 x 2.41
## 2: 495 0.31 V.Good K SI2 59.0 60.2 GIA 4.40 x 4.42 x 2.65
## 3: 496 0.26 Good J VS2 56.5 64.1 IGI 4.01 x 4.05 x 2.58
## 4: 497 0.24 Ideal G SI1 55.0 61.3 GIA 4.01 x 4.03 x 2.47
## 5: 498 0.30 Good H I1 57.0 62.2 GIA 4.21 x 4.24 x 2.63
## 6: 499 0.34 Good F I1 66.0 55.0 GIA 4.75 x 4.61 x 2.57
## price x y z pricepercarat
## 1: 300 4.09 4.10 2.41 1250.0000
## 2: 300 4.40 4.42 2.65 967.7419
## 3: 300 4.01 4.05 2.58 1153.8462
## 4: 300 4.01 4.03 2.47 1250.0000
## 5: 300 4.21 4.24 2.63 1000.0000
## 6: 300 4.75 4.61 2.57 882.3529
diamonds$X=NULL
sqldf("select max(pricepercarat),color
from
diamonds group by color")
## max(pricepercarat) color
## 1 49519.40 D
## 2 40871.01 E
## 3 37084.06 F
## 4 32998.33 G
## 5 31718.95 H
## 6 24982.45 I
## 7 22890.12 J
## 8 19516.77 K
## 9 14585.41 L
library(nycflights13)
dim(flights)
## [1] 336776 19
head(flights)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
#
# filter() (and slice())
# arrange()
# select() (and rename())
# distinct()
# mutate() (and transmute())
# summarise()
# sample_n() (and sample_frac())
filter(flights,month==1,day==1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
d=slice(flights,1:100)
arrange(d, year, month, day)
## # A tibble: 100 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 90 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
f=select(flights,year,month,day)
summary(f)
## year month day
## Min. :2013 Min. : 1.000 Min. : 1.00
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00
## Median :2013 Median : 7.000 Median :16.00
## Mean :2013 Mean : 6.549 Mean :15.71
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00
## Max. :2013 Max. :12.000 Max. :31.00
rename(f,The_Day=day)
## # A tibble: 336,776 × 3
## year month The_Day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
distinct(flights,day)
## # A tibble: 31 × 1
## day
## <int>
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
## # ... with 21 more rows
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
distinct(flights,origin)
## # A tibble: 3 × 1
## origin
## <chr>
## 1 EWR
## 2 LGA
## 3 JFK
flights2=mutate(flights,
speed = distance / air_time * 60)
summary(flights2$speed)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 76.8 358.1 404.2 394.3 438.8 703.4 9430
summarise(flights2,delay=mean(dep_delay,na.rm=T))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.63907
mean(flights2$dep_delay,na.rm = T)
## [1] 12.63907
sample_n(flights,10)
## # A tibble: 10 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 8 3 1828 1830 -2 2118
## 2 2013 11 17 1736 1512 144 1929
## 3 2013 8 16 2232 2059 93 1
## 4 2013 10 14 839 843 -4 942
## 5 2013 8 6 1806 1800 6 1940
## 6 2013 10 1 1756 1800 -4 1904
## 7 2013 7 11 1654 1700 -6 1815
## 8 2013 7 15 731 730 1 1015
## 9 2013 4 26 1157 1135 22 1413
## 10 2013 3 23 1649 1652 -3 1908
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
sample_frac(flights,0.001)
## # A tibble: 337 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 8 21 1250 1250 0 1424
## 2 2013 11 13 1715 1706 9 1819
## 3 2013 6 19 1312 1300 12 1421
## 4 2013 6 27 717 720 -3 907
## 5 2013 7 3 1831 1731 60 2015
## 6 2013 5 23 1828 1830 -2 2122
## 7 2013 1 4 928 919 9 1039
## 8 2013 10 2 1459 1455 4 1745
## 9 2013 12 26 2104 2035 29 6
## 10 2013 10 3 1352 1400 -8 1508
## # ... with 327 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
by_origin=group_by(flights,origin)
delay = summarise(by_origin,mean(arr_delay, na.rm = TRUE))
delay
## # A tibble: 3 × 2
## origin `mean(arr_delay, na.rm = TRUE)`
## <chr> <dbl>
## 1 EWR 9.107055
## 2 JFK 5.551481
## 3 LGA 5.783488
#class exercise
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
test=select(flights,origin,dep_delay,arr_delay)
summary(test)
## origin dep_delay arr_delay
## Length:336776 Min. : -43.00 Min. : -86.000
## Class :character 1st Qu.: -5.00 1st Qu.: -17.000
## Mode :character Median : -2.00 Median : -5.000
## Mean : 12.64 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.: 14.000
## Max. :1301.00 Max. :1272.000
## NA's :8255 NA's :9430
table(test$origin)
##
## EWR JFK LGA
## 120835 111279 104662
attach(test)
boxplot(arr_delay)

describe(arr_delay)
## arr_delay
## n missing distinct Info Mean Gmd .05 .10
## 327346 9430 577 1 6.895 39.23 -32 -26
## .25 .50 .75 .90 .95
## -17 -5 14 52 91
##
## lowest : -86 -79 -75 -74 -73, highest: 989 1007 1109 1127 1272
test_arr=filter(test,arr_delay>0)
test_dep=filter(test,dep_delay>0)
names(test_arr)
## [1] "origin" "dep_delay" "arr_delay"
gtest_arr=group_by(test_arr,origin)
gtest_dep=group_by(test_dep,origin)
summarise(gtest_arr,count=n(),
delay=mean(arr_delay,na.rm = T),
sum=sum(arr_delay,na.rm = T)/(24*60))
## # A tibble: 3 × 4
## origin count delay sum
## <chr> <int> <dbl> <dbl>
## 1 EWR 50099 41.75307 1452.630
## 2 JFK 42885 40.00993 1191.546
## 3 LGA 40020 38.93306 1082.015
summarise(gtest_dep,count=n(),
delay=mean(dep_delay,na.rm = T),
sum=sum(dep_delay,na.rm = T)/24*60)
## # A tibble: 3 × 4
## origin count delay sum
## <chr> <int> <dbl> <dbl>
## 1 EWR 52711 38.98792 5137730
## 2 JFK 42031 38.04677 3997860
## 3 LGA 33690 41.63096 3506368
glights=group_by(flights,origin)
summarise(glights,count=n())
## # A tibble: 3 × 2
## origin count
## <chr> <int>
## 1 EWR 120835
## 2 JFK 111279
## 3 LGA 104662