Learning Objectives

Students will learn how to use the tidyverse to clean and prepare real-world data so that it is ready for machine learning.

Importing Data

### read the laptop price CSV directly from its raw GitHub URL
laptop_price <- read.csv(
  file = "https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/laptop_price.csv"
)

Looking at Data Structure

## INSPECT THE DATA
### compact overview: dimensions, column names, column types, first values
str(object = laptop_price)
## 'data.frame':    1303 obs. of  13 variables:
##  $ laptop_ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Company         : chr  "Apple" "Apple" "HP" "Apple" ...
##  $ Product         : chr  "MacBook Pro" "Macbook Air" "250 G6" "MacBook Pro" ...
##  $ TypeName        : chr  "Ultrabook" "Ultrabook" "Notebook" "Ultrabook" ...
##  $ Inches          : num  13.3 13.3 15.6 15.4 13.3 15.6 15.4 13.3 14 14 ...
##  $ ScreenResolution: chr  "IPS Panel Retina Display 2560x1600" "1440x900" "Full HD 1920x1080" "IPS Panel Retina Display 2880x1800" ...
##  $ Cpu             : chr  "Intel Core i5 2.3GHz" "Intel Core i5 1.8GHz" "Intel Core i5 7200U 2.5GHz" "Intel Core i7 2.7GHz" ...
##  $ Ram             : chr  "8GB" "8GB" "8GB" "16GB" ...
##  $ Memory          : chr  "128GB SSD" "128GB Flash Storage" "256GB SSD" "512GB SSD" ...
##  $ Gpu             : chr  "Intel Iris Plus Graphics 640" "Intel HD Graphics 6000" "Intel HD Graphics 620" "AMD Radeon Pro 455" ...
##  $ OpSys           : chr  "macOS" "macOS" "No OS" "macOS" ...
##  $ Weight          : chr  "1.37kg" "1.34kg" "1.86kg" "1.83kg" ...
##  $ Price_euros     : num  1340 899 575 2537 1804 ...

Tidyverse

## TIDYVERSE
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Mutate

### add a dollar price column
# Price_dollar is Price_euros scaled by a fixed, hard-coded rate of 1.09.
laptop_price <- mutate(laptop_price, Price_dollar = Price_euros * 1.09)

Summary Statistics

### summary stats of the dollar price: mean, standard deviation, median
mean(laptop_price[["Price_dollar"]])
## [1] 1224.819
sd(laptop_price[["Price_dollar"]])
## [1] 761.9199
median(laptop_price[["Price_dollar"]])
## [1] 1064.93

Graphics

### plot in base
# Base-graphics histogram of the dollar prices. hist() derives its default
# title and x-axis label from the argument expression exactly as written,
# so the plot is labelled with "laptop_price$Price_dollar".
hist(laptop_price$Price_dollar)

### the same histogram with ggplot2
# No bins/binwidth is given, so stat_bin() falls back to its default binning.
ggplot(laptop_price, aes(Price_dollar)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Boxplot of the dollar prices (Price_dollar mapped to the x aesthetic)
ggplot(laptop_price, aes(Price_dollar)) +
  geom_boxplot()

# Jittered scatterplot: euro price (y) against Inches (x)
ggplot(laptop_price, aes(x = Inches, y = Price_euros)) +
  geom_jitter()

# Same jittered scatterplot, with points coloured by operating system
ggplot(
  laptop_price,
  aes(x = Inches, y = Price_euros, color = OpSys)
) +
  geom_jitter()

Stringr

# STRIP THE UNIT LABEL FROM RAM
# Look at the distinct raw values first: every one ends in "GB".
unique(laptop_price$Ram)
## [1] "8GB"  "16GB" "4GB"  "2GB"  "12GB" "6GB"  "32GB" "24GB" "64GB"
# str_remove() drops the "GB" text; as.numeric() converts what is left.
laptop_price$Ram_GB <- as.numeric(str_remove(laptop_price$Ram, pattern = "GB"))
unique(laptop_price$Ram_GB)
## [1]  8 16  4  2 12  6 32 24 64
# STRIP THE UNIT LABEL FROM WEIGHT
# List the distinct raw Weight strings before cleaning them.
unique(laptop_price[["Weight"]])
##   [1] "1.37kg"  "1.34kg"  "1.86kg"  "1.83kg"  "2.1kg"   "2.04kg"  "1.3kg"  
##   [8] "1.6kg"   "2.2kg"   "0.92kg"  "1.22kg"  "0.98kg"  "2.5kg"   "1.62kg" 
##  [15] "1.91kg"  "2.3kg"   "1.35kg"  "1.88kg"  "1.89kg"  "1.65kg"  "2.71kg" 
##  [22] "1.2kg"   "1.44kg"  "2.8kg"   "2kg"     "2.65kg"  "2.77kg"  "3.2kg"  
##  [29] "0.69kg"  "1.49kg"  "2.4kg"   "2.13kg"  "2.43kg"  "1.7kg"   "1.4kg"  
##  [36] "1.8kg"   "1.9kg"   "3kg"     "1.252kg" "2.7kg"   "2.02kg"  "1.63kg" 
##  [43] "1.96kg"  "1.21kg"  "2.45kg"  "1.25kg"  "1.5kg"   "2.62kg"  "1.38kg" 
##  [50] "1.58kg"  "1.85kg"  "1.23kg"  "1.26kg"  "2.16kg"  "2.36kg"  "2.05kg" 
##  [57] "1.32kg"  "1.75kg"  "0.97kg"  "2.9kg"   "2.56kg"  "1.48kg"  "1.74kg" 
##  [64] "1.1kg"   "1.56kg"  "2.03kg"  "1.05kg"  "4.4kg"   "1.90kg"  "1.29kg" 
##  [71] "2.0kg"   "1.95kg"  "2.06kg"  "1.12kg"  "1.42kg"  "3.49kg"  "3.35kg" 
##  [78] "2.23kg"  "4.42kg"  "2.69kg"  "2.37kg"  "4.7kg"   "3.6kg"   "2.08kg" 
##  [85] "4.3kg"   "1.68kg"  "1.41kg"  "4.14kg"  "2.18kg"  "2.24kg"  "2.67kg" 
##  [92] "2.14kg"  "1.36kg"  "2.25kg"  "2.15kg"  "2.19kg"  "2.54kg"  "3.42kg" 
##  [99] "1.28kg"  "2.33kg"  "1.45kg"  "2.79kg"  "1.84kg"  "2.6kg"   "2.26kg" 
## [106] "3.25kg"  "1.59kg"  "1.13kg"  "1.78kg"  "1.10kg"  "1.15kg"  "1.27kg" 
## [113] "1.43kg"  "2.31kg"  "1.16kg"  "1.64kg"  "2.17kg"  "1.47kg"  "3.78kg" 
## [120] "1.79kg"  "0.91kg"  "1.99kg"  "4.33kg"  "1.93kg"  "1.87kg"  "2.63kg" 
## [127] "3.4kg"   "3.14kg"  "1.94kg"  "1.24kg"  "4.6kg"   "4.5kg"   "2.73kg" 
## [134] "1.39kg"  "2.29kg"  "2.59kg"  "2.94kg"  "1.14kg"  "3.8kg"   "3.31kg" 
## [141] "1.09kg"  "3.21kg"  "1.19kg"  "1.98kg"  "1.17kg"  "4.36kg"  "1.71kg" 
## [148] "2.32kg"  "4.2kg"   "1.55kg"  "0.81kg"  "1.18kg"  "2.72kg"  "1.31kg" 
## [155] "0.920kg" "3.74kg"  "1.76kg"  "1.54kg"  "2.83kg"  "2.07kg"  "2.38kg" 
## [162] "3.58kg"  "1.08kg"  "2.20kg"  "2.75kg"  "1.70kg"  "2.99kg"  "1.11kg" 
## [169] "2.09kg"  "4kg"     "3.0kg"   "0.99kg"  "3.52kg"  "2.591kg" "2.21kg" 
## [176] "3.3kg"   "2.191kg" "2.34kg"  "4.0kg"
# Drop the "kg" text and convert the remainder to numeric in one step,
# then list the distinct cleaned values.
laptop_price$Weight_kg <- as.numeric(str_remove(laptop_price$Weight, pattern = "kg"))
unique(laptop_price$Weight_kg)
##   [1] 1.370 1.340 1.860 1.830 2.100 2.040 1.300 1.600 2.200 0.920 1.220 0.980
##  [13] 2.500 1.620 1.910 2.300 1.350 1.880 1.890 1.650 2.710 1.200 1.440 2.800
##  [25] 2.000 2.650 2.770 3.200 0.690 1.490 2.400 2.130 2.430 1.700 1.400 1.800
##  [37] 1.900 3.000 1.252 2.700 2.020 1.630 1.960 1.210 2.450 1.250 1.500 2.620
##  [49] 1.380 1.580 1.850 1.230 1.260 2.160 2.360 2.050 1.320 1.750 0.970 2.900
##  [61] 2.560 1.480 1.740 1.100 1.560 2.030 1.050 4.400 1.290 1.950 2.060 1.120
##  [73] 1.420 3.490 3.350 2.230 4.420 2.690 2.370 4.700 3.600 2.080 4.300 1.680
##  [85] 1.410 4.140 2.180 2.240 2.670 2.140 1.360 2.250 2.150 2.190 2.540 3.420
##  [97] 1.280 2.330 1.450 2.790 1.840 2.600 2.260 3.250 1.590 1.130 1.780 1.150
## [109] 1.270 1.430 2.310 1.160 1.640 2.170 1.470 3.780 1.790 0.910 1.990 4.330
## [121] 1.930 1.870 2.630 3.400 3.140 1.940 1.240 4.600 4.500 2.730 1.390 2.290
## [133] 2.590 2.940 1.140 3.800 3.310 1.090 3.210 1.190 1.980 1.170 4.360 1.710
## [145] 2.320 4.200 1.550 0.810 1.180 2.720 1.310 3.740 1.760 1.540 2.830 2.070
## [157] 2.380 3.580 1.080 2.750 2.990 1.110 2.090 4.000 0.990 3.520 2.591 2.210
## [169] 3.300 2.191 2.340

New Graphics

# Scatterplot of euro price against laptop weight.
# Use the numeric Weight_kg column rather than the raw Weight strings
# ("1.37kg", ...): a character column would be drawn as a discrete axis with
# one tick per distinct string, ordered alphabetically instead of by weight.
ggplot(data = laptop_price,
       aes(y = Price_euros, x = Weight_kg)) +
  geom_point()

# Scatterplot of euro price against the numeric RAM column
ggplot(laptop_price, aes(x = Ram_GB, y = Price_euros)) +
  geom_point()

Filter

# Keep only the rows whose Company is exactly "Apple"
apple <- filter(laptop_price, Company == "Apple")

# Apple laptops only: euro price against Inches, coloured by product name
ggplot(
  apple,
  aes(x = Inches, y = Price_euros, color = Product)
) +
  geom_jitter()

CHALLENGE: Separate

### separate
# Split Memory on single spaces into MemAmt (e.g. "128GB") and MemType
# (e.g. "SSD"). Values with more than two space-separated pieces keep only
# the first two, which is what triggers tidyr's "Additional pieces discarded"
# warning.
laptop_price2 <- separate(
  laptop_price,
  col = Memory,
  into = c("MemAmt", "MemType"),
  sep = " "
)
## Warning: Expected 2 pieces. Additional pieces discarded in 282 rows [2, 7, 8,
## 21, 22, 27, 29, 31, 32, 35, 36, 38, 42, 48, 51, 59, 61, 74, 78, 80, ...].
# Distinct memory-amount strings, e.g. "128GB" or "1TB"
mem <- unique(laptop_price2[["MemAmt"]])
# Everything except the last two characters is the number ...
amount <- str_sub(mem, start = 1, end = -3)
# ... and the last two characters are the unit.
unit <- str_sub(mem, start = -2, end = -1)

## LOOKUP TABLE: original string, numeric amount, unit
memDF <- data.frame(
  MemAmt = mem,
  Amount = as.numeric(amount),
  Units = unit
)

CHALLENGE: Join

## units
# Conversion table: number of GB represented by one of each unit
unitDF <- data.frame(
  Units = c("TB", "GB"),
  GBs = c(1000, 1)
)

# Natural join: dplyr matches on the column the two tables share
memGB <- left_join(memDF, unitDF)
## Joining, by = "Units"
memGB
##    MemAmt Amount Units  GBs
## 1   128GB    128    GB    1
## 2   256GB    256    GB    1
## 3   512GB    512    GB    1
## 4   500GB    500    GB    1
## 5     1TB      1    TB 1000
## 6    32GB     32    GB    1
## 7    64GB     64    GB    1
## 8     2TB      2    TB 1000
## 9   1.0TB      1    TB 1000
## 10   16GB     16    GB    1
## 11  180GB    180    GB    1
## 12  240GB    240    GB    1
## 13    8GB      8    GB    1
## 14  508GB    508    GB    1
# Express every memory amount in GB: numeric amount times GB-per-unit
convertGB <- mutate(memGB, MemGB = Amount * GBs)

convertGB
##    MemAmt Amount Units  GBs MemGB
## 1   128GB    128    GB    1   128
## 2   256GB    256    GB    1   256
## 3   512GB    512    GB    1   512
## 4   500GB    500    GB    1   500
## 5     1TB      1    TB 1000  1000
## 6    32GB     32    GB    1    32
## 7    64GB     64    GB    1    64
## 8     2TB      2    TB 1000  2000
## 9   1.0TB      1    TB 1000  1000
## 10   16GB     16    GB    1    16
## 11  180GB    180    GB    1   180
## 12  240GB    240    GB    1   240
## 13    8GB      8    GB    1     8
## 14  508GB    508    GB    1   508
# Attach the conversion columns (Amount, Units, GBs, MemGB) to every laptop
# through a natural join on the column the two tables share.
laptop_price_USE <- left_join(laptop_price2, convertGB)
## Joining, by = "MemAmt"
str(object = laptop_price_USE)
## 'data.frame':    1303 obs. of  21 variables:
##  $ laptop_ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Company         : chr  "Apple" "Apple" "HP" "Apple" ...
##  $ Product         : chr  "MacBook Pro" "Macbook Air" "250 G6" "MacBook Pro" ...
##  $ TypeName        : chr  "Ultrabook" "Ultrabook" "Notebook" "Ultrabook" ...
##  $ Inches          : num  13.3 13.3 15.6 15.4 13.3 15.6 15.4 13.3 14 14 ...
##  $ ScreenResolution: chr  "IPS Panel Retina Display 2560x1600" "1440x900" "Full HD 1920x1080" "IPS Panel Retina Display 2880x1800" ...
##  $ Cpu             : chr  "Intel Core i5 2.3GHz" "Intel Core i5 1.8GHz" "Intel Core i5 7200U 2.5GHz" "Intel Core i7 2.7GHz" ...
##  $ Ram             : chr  "8GB" "8GB" "8GB" "16GB" ...
##  $ MemAmt          : chr  "128GB" "128GB" "256GB" "512GB" ...
##  $ MemType         : chr  "SSD" "Flash" "SSD" "SSD" ...
##  $ Gpu             : chr  "Intel Iris Plus Graphics 640" "Intel HD Graphics 6000" "Intel HD Graphics 620" "AMD Radeon Pro 455" ...
##  $ OpSys           : chr  "macOS" "macOS" "No OS" "macOS" ...
##  $ Weight          : chr  "1.37kg" "1.34kg" "1.86kg" "1.83kg" ...
##  $ Price_euros     : num  1340 899 575 2537 1804 ...
##  $ Price_dollar    : num  1460 980 627 2766 1966 ...
##  $ Ram_GB          : num  8 8 8 16 8 4 16 8 16 8 ...
##  $ Weight_kg       : num  1.37 1.34 1.86 1.83 1.37 2.1 2.04 1.34 1.3 1.6 ...
##  $ Amount          : num  128 128 256 512 256 500 256 256 512 256 ...
##  $ Units           : chr  "GB" "GB" "GB" "GB" ...
##  $ GBs             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ MemGB           : num  128 128 256 512 256 500 256 256 512 256 ...