library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw cast listing. The str() output further down shows an index
# column `X` plus two character columns, `Name.1` and `Name.2`.
cast <- read.csv(file = "raw_cast.csv")

# Number of missing values in each column.
colSums(is.na(cast))
## X Name.1 Name.2
## 0 0 0

# The same per-column counts, expressed as a percentage of all rows.
missing_percentage <- (colSums(is.na(cast)) / nrow(cast)) * 100
missing_percentage
## X Name.1 Name.2
## 0 0 0

# Number of rows that exactly repeat an earlier row.
length(which(duplicated(cast)))
## [1] 0

# Total number of missing cells across the whole data frame.
length(which(is.na(cast)))
## [1] 0

# Per-column summary statistics.
summary(object = cast)
## X Name.1 Name.2
## Min. : 1.0 Length:1298 Length:1298
## 1st Qu.: 325.2 Class :character Class :character
## Median : 649.5 Mode :character Mode :character
## Mean : 649.5
## 3rd Qu.: 973.8
## Max. :1298.0

# Compact view of column types and the first few values.
str(object = cast)
## 'data.frame': 1298 obs. of 3 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name.1: chr "Angela Bassett" "Peter Krause" "Oliver Stark" "Aisha Hinds" ...
## $ Name.2: chr "Athena Grant\n 87 episodes, 2018-2022" "Bobby Nash\n 87 episodes, 2018-2022" "Evan 'Buck' Buckley\n 87 episodes, 2018-2022" "Henrietta 'Hen' Wilson\n 87 episodes, 2018-2022" ...
# Toy vector with two missing entries, used to demonstrate is.na().
data <- c(1, 3, NA, 4, NA, 6)
is.na(data)
## [1] FALSE FALSE TRUE FALSE TRUE FALSE

# Toy data frame with one NA per column (var1 in row 3, var2 in row 4).
df <- data.frame(var1 = c(1, 2, NA, 4), var2 = c("A", "B", "C", NA))

# Keep only the rows that contain no NA at all. `df[complete.cases(df), ]`
# selects the same rows, so a single call is enough; assigning both to
# `clean_df` back to back would just discard the first result.
clean_df <- na.omit(df)
print(clean_df)
## var1 var2
## 1 1 A
## 2 2 B
library(readr)
# Read the bike-sharing data with readr. The recorded output below reports
# 17379 rows and 13 columns (2 character, 11 double).
# NOTE(review): readr also warns about parsing issues in this file; inspect
# them with problems(bike_complete) before relying on the parsed values.
bike_complete <- readr::read_csv(file = "bike_sharing_data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 17379 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): datetime, sources
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, winds...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(stringr)
# Recode the numeric weather codes 1-4 as a labelled factor. Values that are
# not one of the listed levels become NA in the resulting factor.
bike_complete[["weather"]] <- factor(
  bike_complete[["weather"]],
  levels = 1:4,
  labels = c("Clear", "Mist", "Light Snow/Rain", "Heavy Snow/Rain")
)
library(tidyverse)
# NOTE(review): this repeats the load and missing-value checks already run on
# the same file earlier in the script.
cast <- read.csv(file = "raw_cast.csv")

# Missing values per column.
colSums(is.na(cast))
## X Name.1 Name.2
## 0 0 0

# Missing values per column as a percentage of the row count.
missing_percentage <- (colSums(is.na(cast)) / nrow(cast)) * 100
missing_percentage
## X Name.1 Name.2
## 0 0 0

# Count of rows that duplicate an earlier row.
length(which(duplicated(cast)))
## [1] 0
library(tidyverse)
# Start again from the raw file so `Name.2` is in its original, unsplit form.
cast <- read.csv(file = "raw_cast.csv")

# Split `Name.2` at the first newline: the text before it goes to `Name2`,
# everything after it (extra = "merge") goes to `Episode`; rows without a
# newline get NA in `Episode` (fill = "right").
# Note: as the head() output shows, `Name.2` itself is no longer a column of
# `cast` after this step.
cast <- separate(
  data = cast,
  col = Name.2,
  into = c("Name2", "Episode"),
  sep = "\n",
  extra = "merge",
  fill = "right"
)
head(cast)
## X Name.1 Name2
## 1 1 Angela Bassett Athena Grant
## 2 2 Peter Krause Bobby Nash
## 3 3 Oliver Stark Evan 'Buck' Buckley
## 4 4 Aisha Hinds Henrietta 'Hen' Wilson
## 5 5 Kenneth Choi Howie 'Chimney' Han
## 6 6 Corinne Massiah May Grant
## Episode
## 1 87 episodes, 2018-2022
## 2 87 episodes, 2018-2022
## 3 87 episodes, 2018-2022
## 4 87 episodes, 2018-2022
## 5 87 episodes, 2018-2022
## 6 82 episodes, 2018-2022
library(stringr)
# Alternative split of the raw `Name.2` text with stringr.
# `cast` no longer has a `Name.2` column once it has been separated into
# `Name2`/`Episode`, so `cast$Name.2` evaluates to NULL and the split returns
# an empty matrix. Take the column from a fresh read of the raw file instead.
raw_name2 <- read.csv("raw_cast.csv")[["Name.2"]]
stopifnot(!is.null(raw_name2))

# Two columns: text before the first newline, and everything after it.
split_data <- str_split_fixed(raw_name2, "\n", n = 2)

# Preview only; the matrix has one row per cast entry.
print(head(split_data))
library(stringr)
# Strip leading and trailing whitespace from the episode text
# (str_trim() trims both sides by default).
cast[["Episode"]] <- str_trim(cast[["Episode"]])
head(cast[["Episode"]])
## [1] "87 episodes, 2018-2022" "87 episodes, 2018-2022" "87 episodes, 2018-2022"
## [4] "87 episodes, 2018-2022" "87 episodes, 2018-2022" "82 episodes, 2018-2022"