Week 8: Apply it to your data 7

Import your data

data <- read_excel("../00_data/MyData.xlsx")

data_small <- data %>%
    
    select(Brand, Year, Kilometers_Driven) 

data_small

## # A tibble: 100 × 3
##    Brand       Year Kilometers_Driven
##    <chr>      <dbl>             <dbl>
##  1 Toyota      2018             50000
##  2 Honda       2019             40000
##  3 Ford        2017             20000
##  4 Maruti      2020             30000
##  5 Hyundai     2016             60000
##  6 Tata        2019             35000
##  7 Mahindra    2018             45000
##  8 Volkswagen  2020             25000
##  9 Audi        2017             30000
## 10 BMW         2019             20000
## # ℹ 90 more rows

Pivoting

wide to long form

data_small %>%
    
    pivot_wider(names_from = Year,
                values_from = Kilometers_Driven)

## Warning: Values from `Kilometers_Driven` are not uniquely identified; output will
## contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} |>
##   dplyr::summarise(n = dplyr::n(), .by = c(Brand, Year)) |>
##   dplyr::filter(n > 1L)

## # A tibble: 11 × 7
##    Brand      `2018`    `2019`    `2017`    `2020`    `2016`    `2021`   
##    <chr>      <list>    <list>    <list>    <list>    <list>    <list>   
##  1 Toyota     <dbl [4]> <NULL>    <dbl [2]> <dbl [2]> <dbl [2]> <NULL>   
##  2 Honda      <dbl [4]> <dbl [1]> <dbl [1]> <NULL>    <NULL>    <NULL>   
##  3 Ford       <dbl [2]> <dbl [4]> <dbl [4]> <dbl [1]> <NULL>    <NULL>   
##  4 Maruti     <NULL>    <dbl [2]> <NULL>    <dbl [4]> <NULL>    <NULL>   
##  5 Hyundai    <dbl [2]> <dbl [6]> <dbl [2]> <NULL>    <dbl [1]> <NULL>   
##  6 Tata       <dbl [5]> <dbl [1]> <NULL>    <dbl [5]> <NULL>    <NULL>   
##  7 Mahindra   <dbl [1]> <dbl [2]> <NULL>    <NULL>    <NULL>    <dbl [2]>
##  8 Volkswagen <dbl [1]> <dbl [2]> <dbl [4]> <dbl [3]> <NULL>    <NULL>   
##  9 Audi       <dbl [4]> <NULL>    <dbl [5]> <NULL>    <dbl [1]> <NULL>   
## 10 BMW        <dbl [2]> <dbl [6]> <NULL>    <dbl [2]> <NULL>    <NULL>   
## 11 Mercedes   <dbl [3]> <dbl [4]> <dbl [3]> <NULL>    <NULL>    <NULL>

data_small_unique <- data_small %>%
    
    slice(-10)

data_small_unique

## # A tibble: 99 × 3
##    Brand       Year Kilometers_Driven
##    <chr>      <dbl>             <dbl>
##  1 Toyota      2018             50000
##  2 Honda       2019             40000
##  3 Ford        2017             20000
##  4 Maruti      2020             30000
##  5 Hyundai     2016             60000
##  6 Tata        2019             35000
##  7 Mahindra    2018             45000
##  8 Volkswagen  2020             25000
##  9 Audi        2017             30000
## 10 Mercedes    2018             28000
## # ℹ 89 more rows

long to wide form

data_wide <- data_small_unique %>% 
    
    pivot_wider(names_from = Year, 
                values_from = Kilometers_Driven)

## Warning: Values from `Kilometers_Driven` are not uniquely identified; output will
## contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} |>
##   dplyr::summarise(n = dplyr::n(), .by = c(Brand, Year)) |>
##   dplyr::filter(n > 1L)

data_long <- data_wide %>%
    
    pivot_longer(`2018` : `2021`, names_to = "Year", values_to = "Kilometers_Driven", values_drop_na = TRUE)

data_long

## # A tibble: 36 × 3
##    Brand  Year  Kilometers_Driven
##    <chr>  <chr> <list>           
##  1 Toyota 2018  <dbl [4]>        
##  2 Toyota 2017  <dbl [2]>        
##  3 Toyota 2020  <dbl [2]>        
##  4 Toyota 2016  <dbl [2]>        
##  5 Honda  2018  <dbl [4]>        
##  6 Honda  2019  <dbl [1]>        
##  7 Honda  2017  <dbl [1]>        
##  8 Ford   2018  <dbl [2]>        
##  9 Ford   2019  <dbl [4]>        
## 10 Ford   2017  <dbl [4]>        
## # ℹ 26 more rows

##Note For Dr.Lee I belive all of the codes above are done correctly, since I was using the help from the videos. Im still very confused about the end result of some of the codes.

Separating and Uniting

Unite two columns

data_united <- data %>%
    
    unite(col = "YrKm", Year:Kilometers_Driven, sep = "/", remove = TRUE)

data_united

## # A tibble: 100 × 11
##    Brand      Model YrKm  Fuel_Type Transmission Owner_Type Mileage Engine Power
##    <chr>      <chr> <chr> <chr>     <chr>        <chr>        <dbl>  <dbl> <dbl>
##  1 Toyota     Coro… 2018… Petrol    Manual       First           15   1498   108
##  2 Honda      Civic 2019… Petrol    Automatic    Second          17   1597   140
##  3 Ford       Must… 2017… Petrol    Automatic    First           10   4951   395
##  4 Maruti     Swift 2020… Diesel    Manual       Third           23   1248    74
##  5 Hyundai    Sona… 2016… Diesel    Automatic    Second          18   1999   194
##  6 Tata       Nexon 2019… Petrol    Manual       First           17   1198   108
##  7 Mahindra   Scor… 2018… Diesel    Automatic    Second          15   2179   140
##  8 Volkswagen Polo  2020… Petrol    Automatic    First           18    999    76
##  9 Audi       A4    2017… Diesel    Automatic    First           18   1968   187
## 10 BMW        X1    2019… Diesel    Automatic    Second          20   1995   190
## # ℹ 90 more rows
## # ℹ 2 more variables: Seats <dbl>, Price <dbl>

Separate a column

data_united %>%
    
    separate(col = YrKm, into = c("Year", "Kilometers_Driven"), sep = "/")

## # A tibble: 100 × 12
##    Brand Model Year  Kilometers_Driven Fuel_Type Transmission Owner_Type Mileage
##    <chr> <chr> <chr> <chr>             <chr>     <chr>        <chr>        <dbl>
##  1 Toyo… Coro… 2018  50000             Petrol    Manual       First           15
##  2 Honda Civic 2019  40000             Petrol    Automatic    Second          17
##  3 Ford  Must… 2017  20000             Petrol    Automatic    First           10
##  4 Maru… Swift 2020  30000             Diesel    Manual       Third           23
##  5 Hyun… Sona… 2016  60000             Diesel    Automatic    Second          18
##  6 Tata  Nexon 2019  35000             Petrol    Manual       First           17
##  7 Mahi… Scor… 2018  45000             Diesel    Automatic    Second          15
##  8 Volk… Polo  2020  25000             Petrol    Automatic    First           18
##  9 Audi  A4    2017  30000             Diesel    Automatic    First           18
## 10 BMW   X1    2019  20000             Diesel    Automatic    Second          20
## # ℹ 90 more rows
## # ℹ 4 more variables: Engine <dbl>, Power <dbl>, Seats <dbl>, Price <dbl>

Missing Values

stocks <- tibble(
  year   = c(2019, 2019, 2019, 2019, 2020, 2020, 2020),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

stocks %>%
    
    pivot_wider(names_from = year, values_from = return)

## # A tibble: 4 × 3
##     qtr `2019` `2020`
##   <dbl>  <dbl>  <dbl>
## 1     1   1.88  NA   
## 2     2   0.59   0.92
## 3     3   0.35   0.17
## 4     4  NA      2.66

bikes <- tibble(
  bike_model  = c("A", "A", "B", "B", "C"),
  material   = c("steel", "aluminium", "steel", "aluminium", "steel"),
  price = c(100, 200, 300, 400, 500)
)
bikes

## # A tibble: 5 × 3
##   bike_model material  price
##   <chr>      <chr>     <dbl>
## 1 A          steel       100
## 2 A          aluminium   200
## 3 B          steel       300
## 4 B          aluminium   400
## 5 C          steel       500

bikes %>%
    
    pivot_wider(names_from = bike_model, values_from = price)

## # A tibble: 2 × 4
##   material      A     B     C
##   <chr>     <dbl> <dbl> <dbl>
## 1 steel       100   300   500
## 2 aluminium   200   400    NA

bikes %>%
    
    complete(bike_model, material)

## # A tibble: 6 × 3
##   bike_model material  price
##   <chr>      <chr>     <dbl>
## 1 A          aluminium   200
## 2 A          steel       100
## 3 B          aluminium   400
## 4 B          steel       300
## 5 C          aluminium    NA
## 6 C          steel       500

treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 6,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

treatment %>%
    
    fill(person, .direction = "up")

## # A tibble: 4 × 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         6        7
## 2 Katherine Burke          2       10
## 3 Katherine Burke          3        9
## 4 Katherine Burke          1        4

Week 8: Apply it to your data 7

Julius Mondschein

2024-06-11

Import your data

Pivoting

wide to long form

long to wide form

Separating and Uniting

Unite two columns

Separate a column

Missing Values