# ---- Workspace and memory housekeeping ----
# List the objects currently visible in the global environment.
ls()
## character(0)
# memory.size() only reports real figures on Windows builds of R older than
# 4.2.0; everywhere else it is a stub that returns Inf with a warning. Call it
# only where it is still meaningful -- gc() below reports memory use on every
# platform and R version.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.size()
}
## [1] 14.29
# Remove every object that ls() reports. Names starting with a dot, attached
# packages and options() are NOT reset by this; restart R for a clean session.
rm(list = ls())
# Trigger a garbage collection and print the memory usage table.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 290908 7.8 592000 15.9 350000 9.4
## Vcells 321510 2.5 786432 6.0 677388 5.2
# ---- Working directory ----
# Show the directory the session is currently rooted in.
getwd()
## [1] "C:/Users/dell/Desktop"
# Point the session at the Desktop folder (machine-specific absolute path).
setwd(dir = "C:/Users/dell/Desktop")
# Print the names of the files and folders in the working directory.
list.files()
## [1] "1.png"
## [2] "2.png"
## [3] "5128OS_09_01.jpg"
## [4] "adult.data.txt"
## [5] "airline.sas7bdat"
## [6] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [7] "basicR.html"
## [8] "basicR.R"
## [9] "basicR.spin.R"
## [10] "basicR.spin.Rmd"
## [11] "BigDiamonds.csv"
## [12] "BigDiamonds.csv.zip"
## [13] "Boston.csv"
## [14] "Cars.sav"
## [15] "casestudy"
## [16] "ccFraud.csv"
## [17] "Certificate Doc.docx"
## [18] "CmapServer Download _ Cmap.html"
## [19] "CmapServer Download _ Cmap_files"
## [20] "Coxcombs.jpg"
## [21] "cricketparsing.R"
## [22] "data input.R"
## [23] "data_input.html"
## [24] "datatable"
## [25] "day8 session 4.fbr"
## [26] "day9 session 1.fbr"
## [27] "desktop.ini"
## [28] "Dropbox.lnk"
## [29] "exam.html"
## [30] "exam.R"
## [31] "ie_data.xls"
## [32] "lastsave.txt"
## [33] "lastsave2"
## [34] "library.docx"
## [35] "Minard.png"
## [36] "modules"
## [37] "modules.zip"
## [38] "my first code.R"
## [39] "mycode.docx"
## [40] "mycode.html"
## [41] "mycode.R"
## [42] "myfirstRcode.R"
## [43] "New folder"
## [44] "new1"
## [45] "Quiz 1 R.docx"
## [46] "rfmanalysis2.html"
## [47] "rfmanalysis2.R"
## [48] "rsconnect"
## [49] "SnowMap_Points.png"
## [50] "test.csv"
## [51] "Untitled (3).wma"
## [52] "Untitled (3).wma.wav"
## [53] "Untitled 88.wma"
library(data.table)
# ---- Load the diamonds data ----
# Read the CSV into a data.table with fread(). The path is absolute and
# specific to the machine this transcript was recorded on.
BigDiamonds <- fread("C:/Users/dell/Desktop/BigDiamonds.csv")
##
## Read 21.7% of 598024 rows
## Read 45.1% of 598024 rows
## Read 56.9% of 598024 rows
## Read 66.9% of 598024 rows
## Read 88.6% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:08
# Compact overview of every column: type plus the first few values.
str(BigDiamonds)
## Classes 'data.table' and 'data.frame': 598024 obs. of 13 variables:
## $ V1 : chr "1" "2" "3" "4" ...
## $ carat : num 0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
## $ cut : chr "V.Good" "Good" "Good" "V.Good" ...
## $ color : chr "K" "G" "J" "D" ...
## $ clarity : chr "I1" "I1" "I2" "I1" ...
## $ table : num 59 61 58 60 59 60 63 61 57.5 65 ...
## $ depth : num 63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
## $ cert : chr "GIA" "GIA" "GIA" "GIA" ...
## $ measurements: chr "3.96 x 3.95 x 2.52" "4.00 x 4.05 x 2.30" "4.56 x 4.53 x 2.67" "3.80 x 3.82 x 2.31" ...
## $ price : int NA NA NA NA NA NA NA NA NA NA ...
## $ x : num 3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
## $ y : num 3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
## $ z : num 2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
## - attr(*, ".internal.selfref")=<externalptr>
# ---- Basic shape and type checks ----
# Number of rows and columns as a length-2 vector.
dim(BigDiamonds)
## [1] 598024 13
# The object read by fread() is a data.table that also inherits from
# data.frame.
class(BigDiamonds)
## [1] "data.table" "data.frame"
# Row count on its own.
nrow(BigDiamonds)
## [1] 598024
# Column count on its own.
ncol(BigDiamonds)
## [1] 13
# Per-column summary: quartiles, mean and NA counts for numeric columns,
# length/class/mode for character columns.
summary(BigDiamonds)
## V1 carat cut color
## Length:598024 Min. :0.200 Length:598024 Length:598024
## Class :character 1st Qu.:0.500 Class :character Class :character
## Mode :character Median :0.900 Mode :character Mode :character
## Mean :1.071
## 3rd Qu.:1.500
## Max. :9.250
##
## clarity table depth cert
## Length:598024 Min. : 0.00 Min. : 0.00 Length:598024
## Class :character 1st Qu.:56.00 1st Qu.:61.00 Class :character
## Mode :character Median :58.00 Median :62.10 Mode :character
## Mean :57.63 Mean :61.06
## 3rd Qu.:59.00 3rd Qu.:62.70
## Max. :75.90 Max. :81.30
##
## measurements price x y
## Length:598024 Min. : 300 Min. : 0.150 Min. : 1.000
## Class :character 1st Qu.: 1220 1st Qu.: 4.740 1st Qu.: 4.970
## Mode :character Median : 3503 Median : 5.780 Median : 6.050
## Mean : 8753 Mean : 5.991 Mean : 6.199
## 3rd Qu.:11174 3rd Qu.: 6.970 3rd Qu.: 7.230
## Max. :99990 Max. :13.890 Max. :13.890
## NA's :713 NA's :1815 NA's :1852
## z
## Min. : 0.040
## 1st Qu.: 3.120
## Median : 3.860
## Mean : 4.033
## 3rd Qu.: 4.610
## Max. :13.180
## NA's :2544
# Summary of the price column alone: min, quartiles, mean, max and the number
# of missing values.
summary(BigDiamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 300 1220 3503 8753 11170 99990 713
# Column names of the table, in file order.
names(BigDiamonds)
## [1] "V1" "carat" "cut" "color"
## [5] "clarity" "table" "depth" "cert"
## [9] "measurements" "price" "x" "y"
## [13] "z"
# ---- Means and missing values ----
# Average carat size (carat has no missing values, so the plain mean works).
mean(BigDiamonds$carat)
## [1] 1.071297
# price contains NAs, so the plain mean propagates NA.
mean(BigDiamonds$price)
## [1] NA
# Ignore missing prices. TRUE is spelled out because T is an ordinary,
# reassignable variable rather than a reserved word.
mean(BigDiamonds$price, na.rm = TRUE)
## [1] 8753.018
# length() counts every element, missing prices included.
length(BigDiamonds$price)
## [1] 598024
# Delete incomplete rows. na.omit() drops a row when ANY column is NA, not
# only price, which is why 4240 rows disappear rather than the 713 rows with a
# missing price.
BigDiamonds2 <- na.omit(BigDiamonds)
dim(BigDiamonds2)
## [1] 593784 13
# Count present (FALSE) versus missing (TRUE) prices in the original table.
table(is.na(BigDiamonds$price))
##
## FALSE TRUE
## 597311 713
library(magrittr)
# Pie chart of present versus missing prices, written as a magrittr pipeline:
# flag the NAs, tabulate the flags, then draw the two counts.
BigDiamonds$price %>%
  is.na() %>%
  table() %>%
  pie()
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units

# ---- Group means on the complete-case table (Hmisc) ----
# Overall mean carat after incomplete rows were removed.
mean(BigDiamonds2$carat)
## [1] 1.072593
# Mean carat per color. The call is namespace-qualified because dplyr also
# exports a summarize() with a different signature; whichever package is
# attached last would otherwise win.
Hmisc::summarize(BigDiamonds2$carat, BigDiamonds2$color, mean)
## BigDiamonds2$color BigDiamonds2$carat
## 1 D 0.8274407
## 2 E 0.8331336
## 3 F 0.9424257
## 4 G 1.0658902
## 5 H 1.2106406
## 6 I 1.2720231
## 7 J 1.3484265
## 8 K 1.4984115
## 9 L 1.3632803
# Mean price per cut, same approach.
Hmisc::summarize(BigDiamonds2$price, BigDiamonds2$cut, mean)
## BigDiamonds2$cut BigDiamonds2$price
## 1 Good 5256.226
## 2 Ideal 9924.824
## 3 V.Good 7430.927
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
# ---- Mean price per cut: SQL and data.table ----
# SQL version, run against the complete-case table BigDiamonds2.
sqldf("select cut,avg(price) from BigDiamonds2 group by cut")
## Loading required package: tcltk
## cut avg(price)
## 1 Good 5256.226
## 2 Ideal 9924.824
## 3 V.Good 7430.927
# data.table version, run against the full table and skipping only missing
# prices. Rows that na.omit() dropped for NAs in other columns are still
# included here, so the means differ slightly from the SQL result above.
# The unnamed j expression is reported as column V1.
BigDiamonds[, mean(price, na.rm = TRUE), by = cut]
## cut V1
## 1: V.Good 7430.527
## 2: Good 5254.792
## 3: Ideal 9919.277
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# ---- Mean price per cut: dplyr ----
# Group the full table by cut and average price, skipping missing prices.
# The verbs are namespace-qualified because Hmisc and dplyr both export
# summarize(). The result column is unnamed, so its printed label is the
# expression text itself.
BigDiamonds %>%
  dplyr::group_by(cut) %>%
  dplyr::summarize(mean(price, na.rm = TRUE))
## Source: local data table [3 x 2]
##
## cut mean(price, na.rm = TRUE)
## 1 V.Good 7430.527
## 2 Good 5254.792
## 3 Ideal 9919.277