# Start from an empty workspace: remove every non-hidden object from the
# current environment before the analysis begins.
rm(list = ls(all.names = FALSE))
To enable PDF rendering, run the following command (make sure to put it inside an R chunk):
# Install the TinyTeX LaTeX distribution only when it is not already the one
# in use; an unconditional install_tinytex() call attempts a fresh
# installation every time this script is run.
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex()
}

# psych provides describe() for descriptive statistics.
library(psych)

# Summary of all variables in iris.
describe(iris)
## vars n mean sd median trimmed mad min max range skew
## Sepal.Length 1 150 5.84 0.83 5.80 5.81 1.04 4.3 7.9 3.6 0.31
## Sepal.Width 2 150 3.06 0.44 3.00 3.04 0.44 2.0 4.4 2.4 0.31
## Petal.Length 3 150 3.76 1.77 4.35 3.76 1.85 1.0 6.9 5.9 -0.27
## Petal.Width 4 150 1.20 0.76 1.30 1.18 1.04 0.1 2.5 2.4 -0.10
## Species* 5 150 2.00 0.82 2.00 2.00 1.48 1.0 3.0 2.0 0.00
## kurtosis se
## Sepal.Length -0.61 0.07
## Sepal.Width 0.14 0.04
## Petal.Length -1.42 0.14
## Petal.Width -1.36 0.06
## Species* -1.52 0.07

# Summary of a single variable.
describe(iris$Sepal.Length)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 150 5.84 0.83 5.8 5.81 1.04 4.3 7.9 3.6 0.31 -0.61 0.07
Histogram
# Distribution of sepal length. Explicit title/label so the plot does not
# show the raw expression `iris$Sepal.Length`.
hist(
  iris$Sepal.Length,
  main = "Histogram of Sepal.Length",
  xlab = "Sepal.Length"
)

# Number of observations per species.
barplot(table(iris$Species))

# Sepal length against sepal width. Passing `data = iris` lets the formula
# use bare column names, which also become the axis labels.
plot(
  Sepal.Length ~ Sepal.Width,
  data = iris,
  xlab = "Sepal.Width",
  ylab = "Sepal.Length"
)

# Same scatter plot, coloured by species. A factor used as `col` is mapped
# through its integer codes (1, 2, 3, ...) onto the current palette.
plot(
  Sepal.Length ~ Sepal.Width,
  data = iris,
  col = iris$Species,
  xlab = "Sepal.Width",
  ylab = "Sepal.Length"
)

# Build the legend from the factor levels: level i has integer code i, so
# seq_along() yields exactly the colour indices used by the points above.
species_levels <- levels(iris$Species)
legend(
  "topright",
  legend = species_levels,
  fill = seq_along(species_levels)
)
# readxl provides read_excel() for reading .xlsx workbooks.
library(readxl)

# Read the "Avocado" sheet straight into `Avocado` (no temporary object that
# has to be removed afterwards).
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
Avocado <- read_excel(
  "C:/Users/atomi/Desktop/Micro/Avocado_with notes.xlsx",
  sheet = "Avocado"
)
## New names:
## • `` -> `...1`

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the script is knitted or run in batch mode.
if (interactive()) {
  View(Avocado)
}

# Column names and types of the imported data.
str(Avocado)
## tibble [18,249 × 14] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:18249] 0 1 2 3 4 5 6 7 8 9 ...
## $ Date : POSIXct[1:18249], format: "2015-12-27" "2015-12-20" ...
## $ AveragePrice: num [1:18249] 1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
## $ Total Volume: num [1:18249] 64237 54877 118220 78992 51040 ...
## $ 4046 : num [1:18249] 1037 674 795 1132 941 ...
## $ 4225 : num [1:18249] 54455 44639 109150 71976 43838 ...
## $ 4770 : num [1:18249] 48.2 58.3 130.5 72.6 75.8 ...
## $ Total Bags : num [1:18249] 8697 9506 8145 5811 6184 ...
## $ Small Bags : num [1:18249] 8604 9408 8042 5677 5986 ...
## $ Large Bags : num [1:18249] 93.2 97.5 103.1 133.8 197.7 ...
## $ XLarge Bags : num [1:18249] NA NA NA NA NA NA NA NA NA NA ...
## $ type : chr [1:18249] "conventional" "conventional" "conventional" "conventional" ...
## $ year : num [1:18249] 2015 2015 2015 2015 2015 ...
## $ region : chr [1:18249] "Albany" "Albany" "Albany" "Albany" ...
# Descriptive statistics for every column of the avocado data
# (describe() comes from the psych package).
library(psych)
describe(x = Avocado)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad
## ...1 1 18249 24.23 15.48 24.00 23.96 20.76
## Date 2 18249 NaN NA NA NaN NA
## AveragePrice 3 18249 1.41 0.40 1.37 1.38 0.42
## Total Volume 4 18249 850644.01 3453545.36 107376.76 232479.38 152652.16
## 4046 5 18249 293008.42 1264989.08 8645.30 58604.80 12775.10
## 4225 6 18249 295154.57 1204120.40 29061.02 80079.89 42285.68
## 4770 7 18249 22839.74 107464.07 184.99 3375.02 274.27
## Total Bags 8 18249 239639.20 986242.40 39743.83 67480.14 55300.92
## Small Bags 9 18249 182194.69 746178.51 26362.82 49175.03 37953.80
## Large Bags 10 18249 54338.09 243965.96 2647.71 12057.05 3925.49
## XLarge Bags 11 6201 9141.94 29430.56 1130.00 2916.42 1644.87
## type* 12 18249 1.50 0.50 1.00 1.50 0.00
## year 13 18249 2016.15 0.94 2016.00 2016.10 1.48
## region* 14 18249 27.50 15.58 27.00 27.50 19.27
## min max range skew kurtosis se
## ...1 0.00 52.00 52.00 0.11 -1.25 0.11
## Date Inf -Inf -Inf NA NA NA
## AveragePrice 0.44 3.25 2.81 0.58 0.32 0.00
## Total Volume 84.56 62505646.52 62505561.96 9.01 92.07 25564.99
## 4046 0.00 22743616.17 22743616.17 8.65 86.78 9364.13
## 4225 0.00 20470572.61 20470572.61 8.94 91.91 8913.54
## 4770 0.00 2546439.11 2546439.11 10.16 132.51 795.51
## Total Bags 0.00 19373134.37 19373134.37 9.75 112.23 7300.69
## Small Bags 0.00 13384586.80 13384586.80 9.54 106.97 5523.61
## Large Bags 0.00 5719096.61 5719096.61 9.79 117.95 1805.97
## XLarge Bags 1.00 551693.65 551692.65 7.78 81.28 373.74
## type* 1.00 2.00 1.00 0.00 -2.00 0.00
## year 2015.00 2018.00 3.00 0.22 -1.03 0.01
## region* 1.00 54.00 53.00 0.00 -1.20 0.12
# Check for missing values.
sum(is.na(Avocado)) # count NAs across the entire dataset
## [1] 12048
sum(is.na(Avocado$`XLarge Bags`)) # count NAs in one variable
## [1] 12048
# Work on a copy so the NAs in the original `Avocado` data are left intact.
Avocado_impute <- Avocado

# Fill every missing `XLarge Bags` entry with the median of the observed
# (non-missing) values of that column.
Avocado_impute[["XLarge Bags"]] <- replace(
  Avocado_impute[["XLarge Bags"]],
  is.na(Avocado_impute[["XLarge Bags"]]),
  median(Avocado_impute[["XLarge Bags"]], na.rm = TRUE)
)

# No missing values should remain after imputation.
sum(is.na(Avocado_impute[["XLarge Bags"]]))
## [1] 0

# Compare the distribution before and after imputation.
summary(Avocado[["XLarge Bags"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 109.7 1130.0 9141.9 4915.0 551693.7 12048
summary(Avocado_impute[["XLarge Bags"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1130 1130 3852 1130 551694
Replace missing values with the median of the variable
# Add a new column that starts as a copy of `XLarge Bags` (NAs included).
Avocado[["XLimputeMedian"]] <- Avocado[["XLarge Bags"]]
sum(is.na(Avocado[["XLimputeMedian"]]))
## [1] 12048

# Overwrite the missing entries of the new column with the median of the
# observed `XLarge Bags` values.
Avocado[["XLimputeMedian"]] <- replace(
  Avocado[["XLimputeMedian"]],
  is.na(Avocado[["XLimputeMedian"]]),
  median(Avocado[["XLarge Bags"]], na.rm = TRUE)
)
sum(is.na(Avocado[["XLimputeMedian"]]))
## [1] 0
# Replace missing values with the mean of the variable.
# Start from a copy of `XLarge Bags` (NAs included).
Avocado$XLimputeMean <- Avocado$`XLarge Bags`
sum(is.na(Avocado$XLimputeMean))
## [1] 12048

# Fill the gaps with mean(), not median(): this column is meant to hold the
# mean-imputed values, and using median() here would make it an exact
# duplicate of XLimputeMedian.
Avocado$XLimputeMean[is.na(Avocado$XLimputeMean)] <-
  mean(Avocado$`XLarge Bags`, na.rm = TRUE)
sum(is.na(Avocado$XLimputeMean))
## [1] 0
# Cross-tabulate row counts: one row per region, one column per year.
table(Avocado[["region"]], Avocado[["year"]])
##
## 2015 2016 2017 2018
## Albany 104 104 106 24
## Atlanta 104 104 106 24
## BaltimoreWashington 104 104 106 24
## Boise 104 104 106 24
## Boston 104 104 106 24
## BuffaloRochester 104 104 106 24
## California 104 104 106 24
## Charlotte 104 104 106 24
## Chicago 104 104 106 24
## CincinnatiDayton 104 104 106 24
## Columbus 104 104 106 24
## DallasFtWorth 104 104 106 24
## Denver 104 104 106 24
## Detroit 104 104 106 24
## GrandRapids 104 104 106 24
## GreatLakes 104 104 106 24
## HarrisburgScranton 104 104 106 24
## HartfordSpringfield 104 104 106 24
## Houston 104 104 106 24
## Indianapolis 104 104 106 24
## Jacksonville 104 104 106 24
## LasVegas 104 104 106 24
## LosAngeles 104 104 106 24
## Louisville 104 104 106 24
## MiamiFtLauderdale 104 104 106 24
## Midsouth 104 104 106 24
## Nashville 104 104 106 24
## NewOrleansMobile 104 104 106 24
## NewYork 104 104 106 24
## Northeast 104 104 106 24
## NorthernNewEngland 104 104 106 24
## Orlando 104 104 106 24
## Philadelphia 104 104 106 24
## PhoenixTucson 104 104 106 24
## Pittsburgh 104 104 106 24
## Plains 104 104 106 24
## Portland 104 104 106 24
## RaleighGreensboro 104 104 106 24
## RichmondNorfolk 104 104 106 24
## Roanoke 104 104 106 24
## Sacramento 104 104 106 24
## SanDiego 104 104 106 24
## SanFrancisco 104 104 106 24
## Seattle 104 104 106 24
## SouthCarolina 104 104 106 24
## SouthCentral 104 104 106 24
## Southeast 104 104 106 24
## Spokane 104 104 106 24
## StLouis 104 104 106 24
## Syracuse 104 104 106 24
## Tampa 104 104 106 24
## TotalUS 104 104 106 24
## West 104 104 106 24
## WestTexNewMexico 103 104 104 24
# Scatter plot of average price against total volume. `data = Avocado` lets
# the formula use bare column names (backticks are needed for the
# non-syntactic name `Total Volume`), and the explicit labels replace the raw
# `Avocado$...` expressions on the axes.
plot(
  AveragePrice ~ `Total Volume`,
  data = Avocado,
  xlab = "Total Volume",
  ylab = "AveragePrice"
)