# Start from a clean slate: remove every object that ls() currently lists.
rm(list = ls())

PDF setup

To enable PDF rendering, run the following once (make sure to put it inside an R chunk):

# One-time setup for PDF output: install the TinyTeX LaTeX distribution.
# The install is skipped when the LaTeX distribution in use is already
# TinyTeX, so re-running or re-knitting this document does not download and
# install it again each time.
if (!tinytex::is_tinytex()) {
  tinytex::install_tinytex()
}

Describing Data

# psych provides describe(), used throughout this document.
library("psych")
# Passing the whole data frame prints one row of summary statistics per column.
describe(x = iris)
##              vars   n mean   sd median trimmed  mad min max range  skew
## Sepal.Length    1 150 5.84 0.83   5.80    5.81 1.04 4.3 7.9   3.6  0.31
## Sepal.Width     2 150 3.06 0.44   3.00    3.04 0.44 2.0 4.4   2.4  0.31
## Petal.Length    3 150 3.76 1.77   4.35    3.76 1.85 1.0 6.9   5.9 -0.27
## Petal.Width     4 150 1.20 0.76   1.30    1.18 1.04 0.1 2.5   2.4 -0.10
## Species*        5 150 2.00 0.82   2.00    2.00 1.48 1.0 3.0   2.0  0.00
##              kurtosis   se
## Sepal.Length    -0.61 0.07
## Sepal.Width      0.14 0.04
## Petal.Length    -1.42 0.14
## Petal.Width     -1.36 0.06
## Species*        -1.52 0.07
# Passing a single column prints the summary statistics for that variable only.
describe(iris[["Sepal.Length"]])
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 150 5.84 0.83    5.8    5.81 1.04 4.3 7.9   3.6 0.31    -0.61 0.07

Graphs

Histogram

# Distribution of sepal length.
hist(iris$Sepal.Length)

# Number of observations per species.
barplot(table(iris$Species))

# Sepal length (y axis) against sepal width (x axis).
plot(iris$Sepal.Length ~ iris$Sepal.Width)

# Same scatter plot with points coloured by species, plus a legend that maps
# each species to its colour.
plot(iris$Sepal.Length ~ iris$Sepal.Width, col = iris$Species)
legend(
  "topright",
  legend = unique(iris$Species),
  fill = unique(iris$Species)
)

Avocado Example

Import Data

# readxl provides read_excel() for reading .xlsx workbooks.
library("readxl")
# Read the "Avocado" sheet of the workbook.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this on another computer.
Avocado_with_notes <- read_excel(
  path = "C:/Users/atomi/Desktop/Micro/Avocado_with notes.xlsx",
  sheet = "Avocado"
)
## New names:
## • `` -> `...1`
# View() opens a GUI data viewer, which is only meaningful when someone is
# sitting at an interactive R session. Skip it when the document is rendered
# non-interactively so knitting does not depend on a viewer being available.
if (interactive()) {
  View(Avocado_with_notes)
}

# Continue under the shorter name `Avocado`, then remove the import-time name
# so only one name for the data remains in the workspace.
Avocado <- Avocado_with_notes
rm(list = "Avocado_with_notes")

Examine Data

# Print the structure of the data: dimensions, column names, column types and
# the first few values of each column.
utils::str(object = Avocado)
## tibble [18,249 × 14] (S3: tbl_df/tbl/data.frame)
##  $ ...1        : num [1:18249] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Date        : POSIXct[1:18249], format: "2015-12-27" "2015-12-20" ...
##  $ AveragePrice: num [1:18249] 1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
##  $ Total Volume: num [1:18249] 64237 54877 118220 78992 51040 ...
##  $ 4046        : num [1:18249] 1037 674 795 1132 941 ...
##  $ 4225        : num [1:18249] 54455 44639 109150 71976 43838 ...
##  $ 4770        : num [1:18249] 48.2 58.3 130.5 72.6 75.8 ...
##  $ Total Bags  : num [1:18249] 8697 9506 8145 5811 6184 ...
##  $ Small Bags  : num [1:18249] 8604 9408 8042 5677 5986 ...
##  $ Large Bags  : num [1:18249] 93.2 97.5 103.1 133.8 197.7 ...
##  $ XLarge Bags : num [1:18249] NA NA NA NA NA NA NA NA NA NA ...
##  $ type        : chr [1:18249] "conventional" "conventional" "conventional" "conventional" ...
##  $ year        : num [1:18249] 2015 2015 2015 2015 2015 ...
##  $ region      : chr [1:18249] "Albany" "Albany" "Albany" "Albany" ...
# Attaching psych again is harmless if it is already loaded.
library("psych")
# Summary statistics for every column of the Avocado data.
describe(x = Avocado)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##              vars     n      mean         sd    median   trimmed       mad
## ...1            1 18249     24.23      15.48     24.00     23.96     20.76
## Date            2 18249       NaN         NA        NA       NaN        NA
## AveragePrice    3 18249      1.41       0.40      1.37      1.38      0.42
## Total Volume    4 18249 850644.01 3453545.36 107376.76 232479.38 152652.16
## 4046            5 18249 293008.42 1264989.08   8645.30  58604.80  12775.10
## 4225            6 18249 295154.57 1204120.40  29061.02  80079.89  42285.68
## 4770            7 18249  22839.74  107464.07    184.99   3375.02    274.27
## Total Bags      8 18249 239639.20  986242.40  39743.83  67480.14  55300.92
## Small Bags      9 18249 182194.69  746178.51  26362.82  49175.03  37953.80
## Large Bags     10 18249  54338.09  243965.96   2647.71  12057.05   3925.49
## XLarge Bags    11  6201   9141.94   29430.56   1130.00   2916.42   1644.87
## type*          12 18249      1.50       0.50      1.00      1.50      0.00
## year           13 18249   2016.15       0.94   2016.00   2016.10      1.48
## region*        14 18249     27.50      15.58     27.00     27.50     19.27
##                  min         max       range  skew kurtosis       se
## ...1            0.00       52.00       52.00  0.11    -1.25     0.11
## Date             Inf        -Inf        -Inf    NA       NA       NA
## AveragePrice    0.44        3.25        2.81  0.58     0.32     0.00
## Total Volume   84.56 62505646.52 62505561.96  9.01    92.07 25564.99
## 4046            0.00 22743616.17 22743616.17  8.65    86.78  9364.13
## 4225            0.00 20470572.61 20470572.61  8.94    91.91  8913.54
## 4770            0.00  2546439.11  2546439.11 10.16   132.51   795.51
## Total Bags      0.00 19373134.37 19373134.37  9.75   112.23  7300.69
## Small Bags      0.00 13384586.80 13384586.80  9.54   106.97  5523.61
## Large Bags      0.00  5719096.61  5719096.61  9.79   117.95  1805.97
## XLarge Bags     1.00   551693.65   551692.65  7.78    81.28   373.74
## type*           1.00        2.00        1.00  0.00    -2.00     0.00
## year         2015.00     2018.00        3.00  0.22    -1.03     0.01
## region*         1.00       54.00       53.00  0.00    -1.20     0.12
# Count missing cells: first across the entire data set, then in the
# `XLarge Bags` column alone.

sum(is.na(Avocado))
## [1] 12048
sum(is.na(Avocado[["XLarge Bags"]]))
## [1] 12048

Examine Missing Data

# Total number of NA cells in the whole data set.
sum(is.na(Avocado))
## [1] 12048
# Number of NA cells in the single variable `XLarge Bags`.
sum(is.na(Avocado[["XLarge Bags"]]))
## [1] 12048

Impute Missing Data with new Data Frame

# Impute on a copy so the original `Avocado` data frame keeps its NA values.
Avocado_impute <- Avocado
# Fill every missing `XLarge Bags` value with the median of the observed ones.
Avocado_impute[["XLarge Bags"]] <- replace(
  Avocado_impute[["XLarge Bags"]],
  is.na(Avocado_impute[["XLarge Bags"]]),
  median(Avocado_impute[["XLarge Bags"]], na.rm = TRUE)
)

# Confirm that no missing values remain in the imputed column.
sum(is.na(Avocado_impute[["XLarge Bags"]]))
## [1] 0
# Five-number summary (plus mean and NA count) of the original column ...
summary(Avocado[["XLarge Bags"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      1.0    109.7   1130.0   9141.9   4915.0 551693.7    12048
# ... and of the imputed copy, for comparison.
summary(Avocado_impute[["XLarge Bags"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    1130    1130    3852    1130  551694

Impute Missing Data within the Data Frame

Replace Missing with Median of the variable

# Copy the raw column into a new one so `XLarge Bags` itself stays untouched.
Avocado[["XLimputeMedian"]] <- Avocado[["XLarge Bags"]]
sum(is.na(Avocado[["XLimputeMedian"]]))
## [1] 12048
# Fill the missing entries of the copy with the median of the observed values.
Avocado[["XLimputeMedian"]][is.na(Avocado[["XLimputeMedian"]])] <-
  median(Avocado[["XLarge Bags"]], na.rm = TRUE)
# Confirm that no missing values remain in the new column.
sum(is.na(Avocado[["XLimputeMedian"]]))
## [1] 0
# Replace missing values with the mean of the observed values.
# Copy the raw column first so `XLarge Bags` itself stays untouched.
Avocado$XLimputeMean <- Avocado$`XLarge Bags`
sum(is.na(Avocado$XLimputeMean))
## [1] 12048
# mean(), not median(): this column is the mean-imputed counterpart of
# XLimputeMedian, so it must be filled with the mean of the observed values.
Avocado$XLimputeMean[is.na(Avocado$XLimputeMean)] <-
  mean(Avocado$`XLarge Bags`, na.rm = TRUE)
# Confirm that no missing values remain in the new column.
sum(is.na(Avocado$XLimputeMean))
## [1] 0
# Cross-tabulate the number of rows for each region (rows) and year (columns).
table(Avocado[["region"]], Avocado[["year"]])
##                      
##                       2015 2016 2017 2018
##   Albany               104  104  106   24
##   Atlanta              104  104  106   24
##   BaltimoreWashington  104  104  106   24
##   Boise                104  104  106   24
##   Boston               104  104  106   24
##   BuffaloRochester     104  104  106   24
##   California           104  104  106   24
##   Charlotte            104  104  106   24
##   Chicago              104  104  106   24
##   CincinnatiDayton     104  104  106   24
##   Columbus             104  104  106   24
##   DallasFtWorth        104  104  106   24
##   Denver               104  104  106   24
##   Detroit              104  104  106   24
##   GrandRapids          104  104  106   24
##   GreatLakes           104  104  106   24
##   HarrisburgScranton   104  104  106   24
##   HartfordSpringfield  104  104  106   24
##   Houston              104  104  106   24
##   Indianapolis         104  104  106   24
##   Jacksonville         104  104  106   24
##   LasVegas             104  104  106   24
##   LosAngeles           104  104  106   24
##   Louisville           104  104  106   24
##   MiamiFtLauderdale    104  104  106   24
##   Midsouth             104  104  106   24
##   Nashville            104  104  106   24
##   NewOrleansMobile     104  104  106   24
##   NewYork              104  104  106   24
##   Northeast            104  104  106   24
##   NorthernNewEngland   104  104  106   24
##   Orlando              104  104  106   24
##   Philadelphia         104  104  106   24
##   PhoenixTucson        104  104  106   24
##   Pittsburgh           104  104  106   24
##   Plains               104  104  106   24
##   Portland             104  104  106   24
##   RaleighGreensboro    104  104  106   24
##   RichmondNorfolk      104  104  106   24
##   Roanoke              104  104  106   24
##   Sacramento           104  104  106   24
##   SanDiego             104  104  106   24
##   SanFrancisco         104  104  106   24
##   Seattle              104  104  106   24
##   SouthCarolina        104  104  106   24
##   SouthCentral         104  104  106   24
##   Southeast            104  104  106   24
##   Spokane              104  104  106   24
##   StLouis              104  104  106   24
##   Syracuse             104  104  106   24
##   Tampa                104  104  106   24
##   TotalUS              104  104  106   24
##   West                 104  104  106   24
##   WestTexNewMexico     103  104  104   24

Scatter Graph

# Average price (y axis) against total volume (x axis).
plot(Avocado$AveragePrice ~ Avocado$`Total Volume`)