Assignment 4

# Numeric vector with missing values at positions 3 and 5.
data <- c(1, 3, NA_real_, 4, NA_real_, 6)

# Element-wise missingness flags: TRUE wherever the entry is NA.
is.na(data)

## [1] FALSE FALSE  TRUE FALSE  TRUE FALSE

# Small demo frame: one numeric and one character column, each with one NA.
df <- data.frame(
  var1 = c(1, 2, NA, 4),
  var2 = c("A", "B", "C", NA)
)

# Approach 1: build a logical mask of rows with no NA in any column and
# subset with it.
rows_without_na <- complete.cases(df)
clean_df <- df[rows_without_na, ]

# Approach 2: na.omit() drops the same rows; this assignment replaces the
# result of approach 1.
clean_df <- na.omit(df)

print(clean_df)

##   var1 var2
## 1    1    A
## 2    2    B

library(readr)
# Import the bike-sharing data; readr guesses the column types and reports
# them, along with any parsing problems, when the file is read.
bike_complete <- read_csv(
  file = "~/Desktop/BusinessIntelligence/bike_sharing_data.csv"
)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 17379 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): datetime, sources
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, winds...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(stringr)
# Recode the numeric weather codes 1-4 as a labelled factor; the position of
# each label in the vector is the code it replaces.
weather_labels <- c("Clear", "Mist", "Light Snow/Rain", "Heavy Snow/Rain")
bike_complete$weather <- factor(
  bike_complete$weather,
  levels = seq_along(weather_labels),
  labels = weather_labels
)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ ggplot2   3.4.4     ✔ tidyr     1.3.0
## ✔ lubridate 1.9.3     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the raw cast table.
cast <- read.csv(file = "~/Desktop/BusinessIntelligence/raw_cast.csv")

# Number of missing values in each column (computed once, reused below).
na_counts <- colSums(is.na(cast))
na_counts

##      X Name.1 Name.2 
##      0      0      0

# The same counts expressed as a percentage of all rows.
missing_percentage <- na_counts / nrow(cast) * 100
print(missing_percentage)

##      X Name.1 Name.2 
##      0      0      0

# Number of rows that exactly repeat an earlier row.
sum(duplicated(cast))

## [1] 0

library(tidyverse)

# Start again from the raw file so the split works on the original column.
cast <- read.csv(file = "~/Desktop/BusinessIntelligence/raw_cast.csv")

# Split Name.2 at the first newline: the text before it becomes Name2 and
# everything after it (extra = "merge") becomes Episode. Rows without a
# newline get NA in Episode (fill = "right").
cast <- separate(
  cast,
  col = Name.2,
  into = c("Name2", "Episode"),
  sep = "\n",
  extra = "merge",
  fill = "right"
)

head(cast)

##   X          Name.1                  Name2
## 1 1  Angela Bassett           Athena Grant
## 2 2    Peter Krause             Bobby Nash
## 3 3    Oliver Stark    Evan 'Buck' Buckley
## 4 4     Aisha Hinds Henrietta 'Hen' Wilson
## 5 5    Kenneth Choi    Howie 'Chimney' Han
## 6 6 Corinne Massiah              May Grant
##                                    Episode
## 1                   87 episodes, 2018-2022
## 2                   87 episodes, 2018-2022
## 3                   87 episodes, 2018-2022
## 4                   87 episodes, 2018-2022
## 5                   87 episodes, 2018-2022
## 6                   82 episodes, 2018-2022

library(stringr)

# Demonstrate the stringr alternative to separate(): split each Name.2 value
# at the first newline into a two-column character matrix.
#
# The separate() call earlier in this script removes Name.2 from `cast`, so
# `cast$Name.2` is NULL at this point and str_split_fixed() would return an
# empty 0 x 2 matrix. Read the raw file again and split its untouched column.
raw_cast <- read.csv("~/Desktop/BusinessIntelligence/raw_cast.csv")
split_data <- str_split_fixed(raw_cast$Name.2, "\n", n = 2)
print(split_data)

##      [,1] [,2]

library(stringr)

# Strip leading and trailing whitespace from the episode text
# (trimming both sides is str_trim()'s default).
cast[["Episode"]] <- str_trim(cast[["Episode"]])

head(cast[["Episode"]])

## [1] "87 episodes, 2018-2022" "87 episodes, 2018-2022" "87 episodes, 2018-2022"
## [4] "87 episodes, 2018-2022" "87 episodes, 2018-2022" "82 episodes, 2018-2022"

Assignment 4

Melissa Conti

2025-02-25