Students will learn how to use the tidyverse to clean and prepare real data so that it is ready for machine learning.
### Load the laptop price data from the raw CSV file hosted on GitHub
laptop_price <- read.csv(
  file = "https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/laptop_price.csv"
)
## LOOK AT THE DATA
### Structure: one line per column with its type and first few values
str(object = laptop_price)
## 'data.frame': 1303 obs. of 13 variables:
## $ laptop_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Company : chr "Apple" "Apple" "HP" "Apple" ...
## $ Product : chr "MacBook Pro" "Macbook Air" "250 G6" "MacBook Pro" ...
## $ TypeName : chr "Ultrabook" "Ultrabook" "Notebook" "Ultrabook" ...
## $ Inches : num 13.3 13.3 15.6 15.4 13.3 15.6 15.4 13.3 14 14 ...
## $ ScreenResolution: chr "IPS Panel Retina Display 2560x1600" "1440x900" "Full HD 1920x1080" "IPS Panel Retina Display 2880x1800" ...
## $ Cpu : chr "Intel Core i5 2.3GHz" "Intel Core i5 1.8GHz" "Intel Core i5 7200U 2.5GHz" "Intel Core i7 2.7GHz" ...
## $ Ram : chr "8GB" "8GB" "8GB" "16GB" ...
## $ Memory : chr "128GB SSD" "128GB Flash Storage" "256GB SSD" "512GB SSD" ...
## $ Gpu : chr "Intel Iris Plus Graphics 640" "Intel HD Graphics 6000" "Intel HD Graphics 620" "AMD Radeon Pro 455" ...
## $ OpSys : chr "macOS" "macOS" "No OS" "macOS" ...
## $ Weight : chr "1.37kg" "1.34kg" "1.86kg" "1.83kg" ...
## $ Price_euros : num 1340 899 575 2537 1804 ...
## TIDYVERSE
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
### Convert prices from euros to dollars
# 1.09 is the fixed euro-to-dollar multiplier used throughout this exercise.
laptop_price <- mutate(laptop_price, Price_dollar = Price_euros * 1.09)
### Summary statistics of the dollar price: mean, standard deviation, median
mean(laptop_price[["Price_dollar"]])
## [1] 1224.819
sd(laptop_price[["Price_dollar"]])
## [1] 761.9199
median(laptop_price[["Price_dollar"]])
## [1] 1064.93
### plot in base
# Base-graphics histogram of the dollar prices. hist() builds its default
# title and x-axis label from the expression passed in, so the exact
# `laptop_price$Price_dollar` spelling is what shows up on the plot.
hist(laptop_price$Price_dollar)
### The same histogram drawn with ggplot2
# geom_histogram() is left at its default binning, which is why ggplot2
# prints the `bins = 30` hint; supply `binwidth` or `bins` to pick your own.
ggplot(laptop_price, mapping = aes(x = Price_dollar)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Boxplot of the dollar price (a single box, no grouping variable)
ggplot(laptop_price, mapping = aes(x = Price_dollar)) +
  geom_boxplot()
# Euro price against Inches; geom_jitter() nudges points by a small random
# amount so that laptops sharing the same Inches value do not overlap.
ggplot(laptop_price, mapping = aes(x = Inches, y = Price_euros)) +
  geom_jitter()
# Same jittered scatter, with points coloured by the OpSys column
ggplot(
  laptop_price,
  mapping = aes(x = Inches, y = Price_euros, color = OpSys)
) +
  geom_jitter()
# TAKE OFF LABELS for RAM
unique(laptop_price[["Ram"]])
## [1] "8GB" "16GB" "4GB" "2GB" "12GB" "6GB" "32GB" "24GB" "64GB"
# str_remove() strips the "GB" suffix; what is left is converted to numeric
# and stored in a new Ram_GB column.
laptop_price <- laptop_price %>%
  mutate(Ram_GB = as.numeric(str_remove(Ram, "GB")))
unique(laptop_price[["Ram_GB"]])
## [1] 8 16 4 2 12 6 32 24 64
# TAKE OFF LABELS for Weight
# Weight is stored as text with a "kg" suffix; list the distinct raw values.
unique(laptop_price[["Weight"]])
## [1] "1.37kg" "1.34kg" "1.86kg" "1.83kg" "2.1kg" "2.04kg" "1.3kg"
## [8] "1.6kg" "2.2kg" "0.92kg" "1.22kg" "0.98kg" "2.5kg" "1.62kg"
## [15] "1.91kg" "2.3kg" "1.35kg" "1.88kg" "1.89kg" "1.65kg" "2.71kg"
## [22] "1.2kg" "1.44kg" "2.8kg" "2kg" "2.65kg" "2.77kg" "3.2kg"
## [29] "0.69kg" "1.49kg" "2.4kg" "2.13kg" "2.43kg" "1.7kg" "1.4kg"
## [36] "1.8kg" "1.9kg" "3kg" "1.252kg" "2.7kg" "2.02kg" "1.63kg"
## [43] "1.96kg" "1.21kg" "2.45kg" "1.25kg" "1.5kg" "2.62kg" "1.38kg"
## [50] "1.58kg" "1.85kg" "1.23kg" "1.26kg" "2.16kg" "2.36kg" "2.05kg"
## [57] "1.32kg" "1.75kg" "0.97kg" "2.9kg" "2.56kg" "1.48kg" "1.74kg"
## [64] "1.1kg" "1.56kg" "2.03kg" "1.05kg" "4.4kg" "1.90kg" "1.29kg"
## [71] "2.0kg" "1.95kg" "2.06kg" "1.12kg" "1.42kg" "3.49kg" "3.35kg"
## [78] "2.23kg" "4.42kg" "2.69kg" "2.37kg" "4.7kg" "3.6kg" "2.08kg"
## [85] "4.3kg" "1.68kg" "1.41kg" "4.14kg" "2.18kg" "2.24kg" "2.67kg"
## [92] "2.14kg" "1.36kg" "2.25kg" "2.15kg" "2.19kg" "2.54kg" "3.42kg"
## [99] "1.28kg" "2.33kg" "1.45kg" "2.79kg" "1.84kg" "2.6kg" "2.26kg"
## [106] "3.25kg" "1.59kg" "1.13kg" "1.78kg" "1.10kg" "1.15kg" "1.27kg"
## [113] "1.43kg" "2.31kg" "1.16kg" "1.64kg" "2.17kg" "1.47kg" "3.78kg"
## [120] "1.79kg" "0.91kg" "1.99kg" "4.33kg" "1.93kg" "1.87kg" "2.63kg"
## [127] "3.4kg" "3.14kg" "1.94kg" "1.24kg" "4.6kg" "4.5kg" "2.73kg"
## [134] "1.39kg" "2.29kg" "2.59kg" "2.94kg" "1.14kg" "3.8kg" "3.31kg"
## [141] "1.09kg" "3.21kg" "1.19kg" "1.98kg" "1.17kg" "4.36kg" "1.71kg"
## [148] "2.32kg" "4.2kg" "1.55kg" "0.81kg" "1.18kg" "2.72kg" "1.31kg"
## [155] "0.920kg" "3.74kg" "1.76kg" "1.54kg" "2.83kg" "2.07kg" "2.38kg"
## [162] "3.58kg" "1.08kg" "2.20kg" "2.75kg" "1.70kg" "2.99kg" "1.11kg"
## [169] "2.09kg" "4kg" "3.0kg" "0.99kg" "3.52kg" "2.591kg" "2.21kg"
## [176] "3.3kg" "2.191kg" "2.34kg" "4.0kg"
# Strip the "kg" suffix and convert to numeric in a new Weight_kg column.
# Different spellings of one weight (e.g. "0.92kg" and "0.920kg") collapse to
# the same number, so fewer distinct values remain after conversion.
laptop_price <- laptop_price %>%
  mutate(Weight_kg = as.numeric(str_remove(Weight, "kg")))
unique(laptop_price[["Weight_kg"]])
## [1] 1.370 1.340 1.860 1.830 2.100 2.040 1.300 1.600 2.200 0.920 1.220 0.980
## [13] 2.500 1.620 1.910 2.300 1.350 1.880 1.890 1.650 2.710 1.200 1.440 2.800
## [25] 2.000 2.650 2.770 3.200 0.690 1.490 2.400 2.130 2.430 1.700 1.400 1.800
## [37] 1.900 3.000 1.252 2.700 2.020 1.630 1.960 1.210 2.450 1.250 1.500 2.620
## [49] 1.380 1.580 1.850 1.230 1.260 2.160 2.360 2.050 1.320 1.750 0.970 2.900
## [61] 2.560 1.480 1.740 1.100 1.560 2.030 1.050 4.400 1.290 1.950 2.060 1.120
## [73] 1.420 3.490 3.350 2.230 4.420 2.690 2.370 4.700 3.600 2.080 4.300 1.680
## [85] 1.410 4.140 2.180 2.240 2.670 2.140 1.360 2.250 2.150 2.190 2.540 3.420
## [97] 1.280 2.330 1.450 2.790 1.840 2.600 2.260 3.250 1.590 1.130 1.780 1.150
## [109] 1.270 1.430 2.310 1.160 1.640 2.170 1.470 3.780 1.790 0.910 1.990 4.330
## [121] 1.930 1.870 2.630 3.400 3.140 1.940 1.240 4.600 4.500 2.730 1.390 2.290
## [133] 2.590 2.940 1.140 3.800 3.310 1.090 3.210 1.190 1.980 1.170 4.360 1.710
## [145] 2.320 4.200 1.550 0.810 1.180 2.720 1.310 3.740 1.760 1.540 2.830 2.070
## [157] 2.380 3.580 1.080 2.750 2.990 1.110 2.090 4.000 0.990 3.520 2.591 2.210
## [169] 3.300 2.191 2.340
# Euro price against weight. The x aesthetic uses the numeric Weight_kg
# column rather than the raw Weight strings: a character column would be
# treated as a discrete variable, giving one alphabetically ordered axis
# tick per distinct string instead of a continuous kilogram scale.
ggplot(laptop_price, aes(x = Weight_kg, y = Price_euros)) +
  geom_point()
# Euro price against RAM, using the numeric Ram_GB column
ggplot(laptop_price, mapping = aes(x = Ram_GB, y = Price_euros)) +
  geom_point()
# Keep only the rows whose Company is "Apple", then plot euro price against
# Inches with one colour per Product.
apple <- filter(laptop_price, Company == "Apple")
ggplot(apple, mapping = aes(x = Inches, y = Price_euros, color = Product)) +
  geom_jitter()
### Split Memory on spaces into an amount (MemAmt) and a type (MemType)
# Only the first two space-separated pieces are kept. Values with more pieces
# (e.g. "128GB Flash Storage" becomes "128GB" / "Flash") lose the remainder,
# and separate() warns about every row where that happens.
laptop_price2 <- separate(
  laptop_price,
  col = Memory,
  into = c("MemAmt", "MemType"),
  sep = " "
)
## Warning: Expected 2 pieces. Additional pieces discarded in 282 rows [2, 7, 8,
## 21, 22, 27, 29, 31, 32, 35, 36, 38, 42, 48, 51, 59, 61, 74, 78, 80, ...].
# Distinct memory amounts, each a number followed by a two-letter unit
mem <- unique(laptop_price2[["MemAmt"]])
# Negative positions count from the end of the string: everything up to the
# third-from-last character is the number, the last two characters the unit.
amount <- str_sub(mem, start = 1, end = -3)
unit <- str_sub(mem, start = -2, end = -1)
## MAKE A DATA FRAME
memDF <- data.frame(
  MemAmt = mem,
  Amount = as.numeric(amount),
  Units = unit
)
## units
# Lookup table giving the number of gigabytes in one of each unit
unitDF <- data.frame(
  Units = c("TB", "GB"),
  GBs = c(1000, 1)
)
# Attach the gigabytes-per-unit multiplier to each memory amount.
# The join key is named explicitly: the join can then never silently pick up
# additional shared columns, and dplyr has no "Joining, by" message to print.
memGB <- memDF %>%
  left_join(unitDF, by = "Units")
memGB
## MemAmt Amount Units GBs
## 1 128GB 128 GB 1
## 2 256GB 256 GB 1
## 3 512GB 512 GB 1
## 4 500GB 500 GB 1
## 5 1TB 1 TB 1000
## 6 32GB 32 GB 1
## 7 64GB 64 GB 1
## 8 2TB 2 TB 1000
## 9 1.0TB 1 TB 1000
## 10 16GB 16 GB 1
## 11 180GB 180 GB 1
## 12 240GB 240 GB 1
## 13 8GB 8 GB 1
## 14 508GB 508 GB 1
# Express every memory amount in gigabytes: the number times GBs per unit
convertGB <- mutate(memGB, MemGB = Amount * GBs)
convertGB
## MemAmt Amount Units GBs MemGB
## 1 128GB 128 GB 1 128
## 2 256GB 256 GB 1 256
## 3 512GB 512 GB 1 512
## 4 500GB 500 GB 1 500
## 5 1TB 1 TB 1000 1000
## 6 32GB 32 GB 1 32
## 7 64GB 64 GB 1 64
## 8 2TB 2 TB 1000 2000
## 9 1.0TB 1 TB 1000 1000
## 10 16GB 16 GB 1 16
## 11 180GB 180 GB 1 180
## 12 240GB 240 GB 1 240
## 13 8GB 8 GB 1 8
## 14 508GB 508 GB 1 508
# Bring the numeric memory size (Amount, Units, GBs, MemGB) back onto the
# laptop rows. Naming MemAmt as the key keeps the match on that column only,
# even if the two tables later come to share another column name, and it
# means dplyr no longer prints a "Joining, by" message.
laptop_price_USE <- laptop_price2 %>%
  left_join(convertGB, by = "MemAmt")
str(laptop_price_USE)
## 'data.frame': 1303 obs. of 21 variables:
## $ laptop_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Company : chr "Apple" "Apple" "HP" "Apple" ...
## $ Product : chr "MacBook Pro" "Macbook Air" "250 G6" "MacBook Pro" ...
## $ TypeName : chr "Ultrabook" "Ultrabook" "Notebook" "Ultrabook" ...
## $ Inches : num 13.3 13.3 15.6 15.4 13.3 15.6 15.4 13.3 14 14 ...
## $ ScreenResolution: chr "IPS Panel Retina Display 2560x1600" "1440x900" "Full HD 1920x1080" "IPS Panel Retina Display 2880x1800" ...
## $ Cpu : chr "Intel Core i5 2.3GHz" "Intel Core i5 1.8GHz" "Intel Core i5 7200U 2.5GHz" "Intel Core i7 2.7GHz" ...
## $ Ram : chr "8GB" "8GB" "8GB" "16GB" ...
## $ MemAmt : chr "128GB" "128GB" "256GB" "512GB" ...
## $ MemType : chr "SSD" "Flash" "SSD" "SSD" ...
## $ Gpu : chr "Intel Iris Plus Graphics 640" "Intel HD Graphics 6000" "Intel HD Graphics 620" "AMD Radeon Pro 455" ...
## $ OpSys : chr "macOS" "macOS" "No OS" "macOS" ...
## $ Weight : chr "1.37kg" "1.34kg" "1.86kg" "1.83kg" ...
## $ Price_euros : num 1340 899 575 2537 1804 ...
## $ Price_dollar : num 1460 980 627 2766 1966 ...
## $ Ram_GB : num 8 8 8 16 8 4 16 8 16 8 ...
## $ Weight_kg : num 1.37 1.34 1.86 1.83 1.37 2.1 2.04 1.34 1.3 1.6 ...
## $ Amount : num 128 128 256 512 256 500 256 256 512 256 ...
## $ Units : chr "GB" "GB" "GB" "GB" ...
## $ GBs : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MemGB : num 128 128 256 512 256 500 256 256 512 256 ...