library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)

# Point the session at the exercise folder, then load the M&M data set.
# NOTE(review): this absolute path is machine-specific and setwd() changes the
# working directory for the rest of the session; consider a project-relative
# path once it is confirmed that nothing else relies on the working directory.
setwd("C:/Users/StarKid/Desktop/Data_Science/Data_101/week_4/IC8/")
mnms <- readr::read_csv(file = "MnM_Data.csv")
## Rows: 382 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): student_id, color, defect
## dbl (3): id, total, weight_grams
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Basic EDA: Look Around

# Number of rows and columns, in that order.
c(nrow(mnms), ncol(mnms))
## [1] 382   6
# Compact structure listing: one line per column with its type and first values.
utils::str(mnms)
## spc_tbl_ [382 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ student_id  : chr [1:382] "AP_LV" "AP_LV" "AP_LV" "AP_LV" ...
##  $ id          : num [1:382] 1 2 3 4 5 6 7 8 9 10 ...
##  $ color       : chr [1:382] "r" "r" "r" "r" ...
##  $ defect      : chr [1:382] "c" "l" "z" "z" ...
##  $ total       : num [1:382] 27 27 27 27 27 27 27 27 27 27 ...
##  $ weight_grams: num [1:382] 40 40 40 40 40 40 40 40 40 40 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   student_id = col_character(),
##   ..   id = col_double(),
##   ..   color = col_character(),
##   ..   defect = col_character(),
##   ..   total = col_double(),
##   ..   weight_grams = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Column-wise preview of the data (one row of output per column).
mnms %>% glimpse()
## Rows: 382
## Columns: 6
## $ student_id   <chr> "AP_LV", "AP_LV", "AP_LV", "AP_LV", "AP_LV", "AP_LV", "AP…
## $ id           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ color        <chr> "r", "r", "r", "r", "r", "o", "o", "o", "o", "o", "y", "y…
## $ defect       <chr> "c", "l", "z", "z", "z", "l", "z", "z", "z", "z", "z", "z…
## $ total        <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2…
## $ weight_grams <dbl> 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 4…
# Per-column summary: quantiles for numeric columns, length/class for text.
mnms %>% summary()
##   student_id              id          color              defect         
##  Length:382         Min.   : 1.0   Length:382         Length:382        
##  Class :character   1st Qu.:10.0   Class :character   Class :character  
##  Mode  :character   Median :20.0   Mode  :character   Mode  :character  
##                     Mean   :22.8                                        
##                     3rd Qu.:35.0                                        
##                     Max.   :55.0                                        
##      total        weight_grams  
##  Min.   :18.00   Min.   :25.00  
##  1st Qu.:27.00   1st Qu.:40.00  
##  Median :54.00   Median :48.00  
##  Mean   :45.01   Mean   :44.25  
##  3rd Qu.:55.00   3rd Qu.:50.00  
##  Max.   :56.00   Max.   :50.00
# Count the missing values in each column.
vapply(mnms, function(column) sum(is.na(column)), numeric(1))
##   student_id           id        color       defect        total weight_grams 
##            0            0            0            0            0            0
# Largest, smallest and average value of weight_grams.
# na.rm = TRUE is spelled out on every call: `T` is an ordinary variable that
# can be reassigned, and min() previously had no NA handling at all, unlike
# max() and mean().
max(mnms$weight_grams, na.rm = TRUE)
## [1] 50
min(mnms$weight_grams, na.rm = TRUE)
## [1] 25
mean(mnms$weight_grams, na.rm = TRUE)
## [1] 44.24607
# Frequency of each defect code.
table(mnms[["defect"]])
## 
##   c   l   m   z 
##  33  42   9 298
# Cross-tabulation: color codes in the rows, defect codes in the columns.
table(mnms[["color"]], mnms[["defect"]])
##     
##       c  l  m  z
##   bl  7 13  4 58
##   br  2  3  0 51
##   g   5 10  1 35
##   o   6  8  2 71
##   r   6  4  1 36
##   y   7  4  1 47

Added a density estimate. `freq = FALSE` draws the histogram on the density scale (instead of counts), so the density line can be overlaid on the same y-axis.

# Histogram on the density scale (freq = FALSE) so a density curve shares its
# y-axis.
hist(mnms$weight_grams, freq = FALSE)
# Kernel density estimate of the same variable, overlaid on the histogram.
dens <- density(mnms$weight_grams)
lines(dens)

# Bar chart of the number of rows recorded for each color code.
# The fill colors are listed in the same order as the table's color codes
# (bl, br, g, o, r, y), so every bar is drawn in its own color.
barplot(
  table(mnms$color),
  col = c("blue", "brown", "green", "orange", "red", "yellow"),
  main = "my M&M Color Distribution"
)

Two categorical variables cross-tabulated (a contingency table)

# Contingency table: defect codes in the rows, color codes in the columns.
table(mnms[["defect"]], mnms[["color"]])
##    
##     bl br  g  o  r  y
##   c  7  2  5  6  6  7
##   l 13  3 10  8  4  4
##   m  4  0  1  2  1  1
##   z 58 51 35 71 36 47

bar chart

# Stacked bar chart built from the defect-by-color contingency table.
bar_data <- table(mnms$defect, mnms$color)
# Fill colors, in the same order as the color codes (bl, br, g, o, r, y).
mnm_col <- c("blue", "brown", "green", "orange", "red", "yellow")
# barplot() applies `col` to the rows of a matrix (the stacked segments).
# bar_data has the four defect codes in its rows, so passing it directly
# would paint the defects with the first four M&M colors. Transposing puts the
# six color codes in the rows: one bar per defect, each segment drawn and
# listed in the legend with its matching M&M color.
barplot(
  t(bar_data),
  col = mnm_col,
  legend.text = TRUE,
  args.legend = list(x = "topright", horiz = TRUE)
)

MnMs Weight Distribution

# Histogram of weight_grams with explicit axis labels and title
# (the y-axis label previously read "Frequencey").
hist(
  mnms$weight_grams,
  col = "blue",
  xlab = "Weight in Grams",
  ylab = "Frequency",
  main = "MnMs Weight Distribution"
)

# Box plot of the same variable, all rows pooled together.
boxplot(mnms$weight_grams)

# Mean weight_grams for each color code; used to mark the means on the plot.
means <- with(mnms, tapply(weight_grams, color, mean))
# One box per color code, filled with the matching M&M color.
boxplot(
  weight_grams ~ color,
  data = mnms,
  col = mnm_col,
  xlab = "Color",
  ylab = "Grams",
  main = "MnM Box Plot"
)
# Overlay each group's mean as a solid black dot at that box's x position.
points(seq_along(means), means, pch = 19, col = "black")

Pie charts

# Pie chart of the share of each color code.
count_colors <- table(mnms$color)
# Percentage of all rows that each color accounts for, to one decimal place.
perc_val <- round(100 * count_colors / sum(count_colors), 1)
# perc_val was computed but never shown; it is now appended to each slice
# label so the chart reports the actual shares.
pie(
  count_colors,
  col = mnm_col,
  labels = paste0(mnm_col, " (", perc_val, "%)"),
  main = "MnMs Pie Chart"
)

#

Scatter Plots

# Scatter plot of temperature against ozone from the built-in airquality data.
data("airquality")
# The y variable is Temp, so the axis is labelled "Temp" (it previously read
# "Wind", which did not match the plotted column).
plot(airquality$Ozone, airquality$Temp, xlab = "Ozone", ylab = "Temp")

# Scatter-plot matrix of every column in airquality.
pairs(airquality)

# Scatter-plot matrix of four selected columns, upper triangle only.
# NOTE(review): `col` appears to be recycled across the plotted points (rows),
# not across variables, so these four colors cycle point by point - confirm
# that this is the intended effect.
pairs_cols <- c("blue", "green", "red", "black")
pairs(
  airquality[c("Ozone", "Solar.R", "Wind", "Temp")],
  lower.panel = NULL,
  col = pairs_cols
)