suppressMessages(library(tidyverse))
# Load the Baltimore PD Part 1 victim-based crime records; readr guesses the
# column types (CrimeTime is read as character).
Baltimore_crime <- read_csv("./data/BPD_Part_1_Victim_Based_Crime_Data.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## CrimeDate = col_character(),
## CrimeTime = col_character(),
## CrimeCode = col_character(),
## Location = col_character(),
## Description = col_character(),
## `Inside/Outside` = col_character(),
## Weapon = col_character(),
## Post = col_double(),
## District = col_character(),
## Neighborhood = col_character(),
## Longitude = col_double(),
## Latitude = col_double(),
## `Location 1` = col_character(),
## Premise = col_character(),
## crimeCaseNumber = col_logical(),
## `Total Incidents` = col_double()
## )
# Preview the first rows of the raw crime data.
Baltimore_crime %>%
  head()
# Calculate how many records there are for each CrimeTime string length.
# str_length() gives the number of characters in each CrimeTime value.
Baltimore_crime %>%
  count(length_crimeTime = str_length(CrimeTime), name = "count")
# Show the rows whose CrimeTime string is exactly 7 characters long, to see
# what that format looks like.
filter(Baltimore_crime, str_length(CrimeTime) == 7)
# Two-character CrimeTime values are parsed with "%H" (hour only); the parts
# after the hour default to zero.
table_2 <- Baltimore_crime %>%
  filter(str_length(CrimeTime) == 2) %>%
  mutate(CrimeTime = parse_time(CrimeTime, "%H"))
# Three-character CrimeTime values get a leading "0" glued on (no separator)
# so that they can be parsed with the four-digit "%H%M" format.
table_3 <- Baltimore_crime %>%
  filter(str_length(CrimeTime) == 3) %>%
  mutate(CrimeTime = parse_time(paste0("0", CrimeTime), format = "%H%M"))
# Four-character CrimeTime values are parsed as "%H%M". "2400" is rewritten to
# "0000" first.
table_4 <- Baltimore_crime %>%
  filter(str_length(CrimeTime) == 4) %>%
  mutate(
    CrimeTime = if_else(CrimeTime == "2400", "0000", CrimeTime),
    CrimeTime = parse_time(CrimeTime, format = "%H%M")
  )
# Normalise five-character CrimeTime values to "HH:MM" text and parse them as
# times. The three mutate() steps run in order, each on the previous result.
table_5 <- Baltimore_crime %>%
filter(str_length(CrimeTime) == 5) %>%
# Step 1: values that already contain ":" are kept as they are; the others are
# replaced by the number parse_number() extracts from them, converted to text.
mutate(CrimeTime = if_else(str_detect(CrimeTime, ":"),
CrimeTime,
as.character(parse_number(CrimeTime))) ,
# Step 2: values still lacking ":" get one inserted after the second
# character (characters 1-2, then ":" and everything from character 3 on).
# NOTE(review): parse_number() drops leading zeros, so a raw value starting
# with "0" would presumably be split in the wrong place -- confirm against
# the actual five-character values in the data.
CrimeTime = if_else(str_detect(CrimeTime, ":"),
CrimeTime,
str_c(str_sub(CrimeTime,1,2),
str_sub(CrimeTime,3,), sep = ":")),
# Step 3: parse the "HH:MM" text into a time value.
CrimeTime = parse_time(CrimeTime, format = "%H:%M"))
# Seven-character CrimeTime values are parsed as hour:minute:second.
table_7 <- Baltimore_crime %>%
  filter(str_length(CrimeTime) == 7) %>%
  mutate(CrimeTime = parse_time(CrimeTime, "%H:%M:%S"))
# Eight-character CrimeTime values are parsed as hour:minute:second.
table_8 <- Baltimore_crime %>%
  filter(str_length(CrimeTime) == 8) %>%
  mutate(CrimeTime = parse_time(CrimeTime, "%H:%M:%S"))
# Ten-character CrimeTime values: judging by the example "0149 01:49", they
# hold a compact time, a space, and then the same time as HH:MM.
# Keep the trailing HH:MM part (character 6 onwards) of each row's own value.
# The slice must be taken from the CrimeTime column, not from a literal
# example string, otherwise every row ends up with the same time.
table_10 <- Baltimore_crime %>%
  filter(str_length(CrimeTime) == 10) %>%
  mutate(
    CrimeTime = str_sub(CrimeTime, 6),
    # "%H:%M" leaves the seconds at zero.
    CrimeTime = parse_time(CrimeTime, format = "%H:%M")
  )
# Rows with a missing CrimeTime. Parsing the NA values turns the column into a
# time column, the same type the other per-length tables have.
table_NA <- Baltimore_crime %>%
  filter(is.na(CrimeTime)) %>%
  mutate(CrimeTime = parse_time(CrimeTime, "%H:%M:%S"))
# Stack the per-format tables back into a single data set.
# The tables are disjoint slices of Baltimore_crime, so they are row-bound
# rather than joined: a full_join() keyed on every column can merge or
# multiply records that are identical after parsing, which changes the row
# count that later calculations rely on.
# Every table, including table_NA, must hold CrimeTime as a time column so the
# columns can be combined.
Baltimore_crime_data_exhausted <- bind_rows(
  table_2, table_3, table_4, table_5,
  table_7, table_8, table_10, table_NA
)
Baltimore_crime_data_exhausted
#Test area
#Baltimore_crime %>%
#filter(str_length(CrimeTime) == 4, CrimeTime == "2400") %>%
#mutate(CrimeTime = if_else(CrimeTime == "2400" , "0000", CrimeTime ))
### c. Split `Location 1` into two numeric columns, LocationLat and LocationLon
# The text is cut at the comma, then parse_number() pulls the number out of
# each half. Only the first rows are shown.
Baltimore_crime_data_exhausted %>%
  separate(`Location 1`, into = c("LocationLat", "LocationLon"), sep = ",") %>%
  mutate(across(c(LocationLat, LocationLon), parse_number)) %>%
  head()
# Crimes recorded between midnight and 04:00:00 (both ends included). Rows
# with a missing CrimeTime are dropped by filter().
Midnight_to_4 <- Baltimore_crime_data_exhausted %>%
  filter(
    CrimeTime >= parse_time("00:00:00", format = "%H:%M:%S"),
    CrimeTime <= parse_time("04:00:00", format = "%H:%M:%S")
  )
# Share of all records (including those without a time) that fall in that
# window, as a percentage.
percentage_of_midnight <-
  nrow(Midnight_to_4) / nrow(Baltimore_crime_data_exhausted) * 100
percentage_of_midnight
## [1] 14.03392
# Load the Billboard chart data; readr guesses the column types.
billboard <- read_csv("./data/billboard.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## artist = col_character(),
## track = col_character(),
## time = col_time(format = ""),
## date.entered = col_date(format = ""),
## wk66 = col_logical(),
## wk67 = col_logical(),
## wk68 = col_logical(),
## wk69 = col_logical(),
## wk70 = col_logical(),
## wk71 = col_logical(),
## wk72 = col_logical(),
## wk73 = col_logical(),
## wk74 = col_logical(),
## wk75 = col_logical(),
## wk76 = col_logical()
## )
## ℹ Use `spec()` for the full column specifications.
# Preview the first rows of the wide Billboard table.
billboard %>%
  head()
# Preview the long format: the "wk*" columns are stacked into a `week` label
# (with the "wk" prefix removed) and a `ranking` value; missing rankings are
# dropped.
pivot_longer(
  billboard,
  cols = starts_with("wk"),
  names_prefix = "wk",
  names_to = "week",
  values_to = "ranking",
  values_drop_na = TRUE
)
# Reshape the Billboard data to one row per track and week, and turn the entry
# date into the date of that particular week.
billboard_revised <- billboard %>%
  pivot_longer(
    cols = starts_with("wk"),
    names_prefix = "wk",
    names_to = "week",
    values_to = "ranking",
    values_drop_na = TRUE
  ) %>%
  # Keep the wanted columns in order; date.entered is renamed to date here.
  select(year:time, week, ranking, date = date.entered) %>%
  mutate(
    week = parse_number(week),
    # Week 1 is the entry date itself; each later week adds 7 days:
    #   week 1 -> date + 7 * 0
    #   week 2 -> date + 7 * 1
    #   week 3 -> date + 7 * 2
    date = date + 7 * (week - 1)
  )
billboard_revised
# Show the long table ordered by artist, then track, then week.
arrange(billboard_revised, artist, track, week)
# Read the iris measurements; the file has no header row, so columns come in
# as V1..V5 (read.csv already uses "," as the separator).
iris_data <- read.csv("./data/iris_data.csv", header = FALSE)
# Tidy the iris data: name the columns, stack the four measurement columns
# into long form, split each measurement name into flower part (`sp`) and
# `dimension`, and shorten the species labels.
iris_real_data <- iris_data %>%
  # rename(new = old). V2 is spelled "sepal_width" here, so the `sp` values
  # come out right without patching them afterwards.
  rename(
    sepal_length = V1, sepal_width = V2,
    petal_length = V3, petal_width = V4,
    species = V5
  ) %>%
  pivot_longer(
    cols = sepal_length:petal_width,
    names_to = "measure",
    values_to = "value"
  ) %>%
  # "sepal_length" -> sp = "sepal", dimension = "length"
  separate(col = measure, into = c("sp", "dimension"), sep = "_") %>%
  mutate(
    species = recode(
      species,
      "Iris-setosa" = "setosa",
      "Iris-versicolor" = "versicolor",
      "Iris-virginica" = "virginica"
    )
  )
# Box plots of the measurements per species, one panel per combination of
# flower part (rows) and dimension (columns). facet_grid() is used because the
# panels are laid out by two categorical variables.
ggplot(iris_real_data, aes(x = species, y = value)) +
  geom_boxplot() +
  facet_grid(rows = vars(sp), cols = vars(dimension)) +
  theme_bw() +
  theme(strip.background = element_rect(colour = "black", fill = "white"))