Dataset References
The following datasets were considered for this Project.
Mecklenburg County Great Schools Report - Untidy data with most of the values in one column.
Montgomery Count Crash Report - Contains Report of crashes across years.
[https://data.montgomerycountymd.gov/Public-Safety/Crash-Reporting-Incidents-Data/bhju-22kf/data]
AIRBNB NYC Data - Neighbourhood information of different types of accomodation options.
[https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/downloads/AB_NYC_2019.csv/3]
Approach
Below is the standard approach followed for all datasets used in this project.
Below are different Analysis carried out in the county schools database.
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
raw_meck_data <- data.frame(read.csv("https://raw.githubusercontent.com/jey1987/DATA607/master/Week6_Project/Mecklenburg_Elem_Data.csv"))
raw_meck_data$School <- str_trim(str_replace_all(raw_meck_data$School,"Homes for sale",""))
raw_meck_data <- raw_meck_data[!apply(is.na(raw_meck_data) | raw_meck_data == "" | raw_meck_data == " ", 1, all),]
raw_meck_data_split1 <- data.frame(raw_meck_data$School)
raw_meck_data_split1[,2]=""
raw_meck_data_split1[,3]=""
names(raw_meck_data_split1)[1]="val"
names(raw_meck_data_split1)[2]="attribute"
names(raw_meck_data_split1)[3]="id"
i=1
j=1
raw_meck_data_split1[1,2]="Date"
raw_meck_data_split1[2,2]="Rating"
raw_meck_data_split1[3,2]="School Name"
raw_meck_data_split1[4,2]="Address"
while(i<=nrow(raw_meck_data_split1))
{
raw_meck_data_split1[i,3]=j
if(i%%4==0 & i!=nrow(raw_meck_data_split1))
{
j=j+1
raw_meck_data_split1[i+1,2]="Date"
raw_meck_data_split1[i+2,2]="Rating"
raw_meck_data_split1[i+3,2]="School Name"
raw_meck_data_split1[i+4,2]="Address"
}
i=i+1
}
raw_meck_data_split2 <- raw_meck_data[2:7]
raw_meck_data_split2 <- raw_meck_data_split2[!apply(is.na(raw_meck_data_split2) | raw_meck_data_split2 == "" | raw_meck_data_split2 == " ", 1, all),]
raw_meck_data_split1 <- spread(raw_meck_data_split1,attribute,val)
raw_meck_data_final <- data.frame(cbind(raw_meck_data_split1,raw_meck_data_split2[!names(raw_meck_data_split2) %in% names(raw_meck_data_split1)]))
raw_meck_data_final$id <- seq(1,nrow(raw_meck_data_final))
Analysis 1 - School with More Number of Reviews
raw_meck_data_plot_reviews <- raw_meck_data_final %>%
mutate(no_of_reviews = as.numeric(ifelse(Reviews == "No reviews yet",0,str_extract_all(Reviews,"[[:digit:]]+")))) %>%
select(School.Name,no_of_reviews) %>%
filter(no_of_reviews > 50) %>%
arrange(desc(no_of_reviews)) %>%
do(head(.,n=5))
ggplot(raw_meck_data_plot_reviews, aes(x=School.Name, y=no_of_reviews)) +
geom_bar(stat="identity",fill="#992266") + coord_flip() + theme_minimal() + theme(axis.line = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666"))
Analysis 2 - Rating of Schools with Number of Students
raw_meck_data_plot_no_of_stu_by_Rating <- raw_meck_data_final %>%
mutate(no_of_stu = as.numeric(Total.students.enrolled)) %>%
mutate(zip_code = as.numeric(str_trim(sapply(str_extract_all(Address,"[[:digit:][:digit:][:digit:][:digit:][:digit:]]+"),tail,1)))) %>%
filter(zip_code=="28210") %>%
select(School.Name,no_of_stu,Rating) %>%
group_by(School.Name,Rating) %>%
summarise(total_by_Rating = sum(no_of_stu))
ggplot(raw_meck_data_plot_no_of_stu_by_Rating, aes(fill=School.Name, y=total_by_Rating, x=Rating)) + geom_bar( stat="identity", position="fill")
Analysis 3 - Number of Students by Zip Code
raw_meck_data_plot_no_of_stu <- raw_meck_data_final %>%
mutate(zip_code = as.numeric(str_trim(sapply(str_extract_all(Address,"[[:digit:][:digit:][:digit:][:digit:][:digit:]]+"),tail,1)))) %>%
mutate(no_of_stu = as.numeric(Total.students.enrolled)) %>%
select(zip_code,no_of_stu) %>%
group_by(zip_code) %>%
summarise(total_by_zip = sum(no_of_stu))
ggplot(data=raw_meck_data_plot_no_of_stu, aes(zip_code,total_by_zip))+geom_jitter()
Below are different Analysis carried out in the NYC BNB Database.
raw_bnb_prep <- data.table(read.csv("https://raw.githubusercontent.com/jey1987/DATA607/master/Week6_Project/AB_NYC_2019.csv")) %>%
select(id,host_id,host_name,neighbourhood,room_type,price,number_of_reviews)
Analysis 1 - Cheaper and Costlier Neighbourhoods
raw_bnb_cheap <- raw_bnb_prep %>%
select(neighbourhood,price,room_type) %>%
group_by(neighbourhood) %>%
summarise(avg_price = round(mean(price))) %>%
arrange(avg_price) %>%
do(head(.,n=5))
ggplot(raw_bnb_cheap, aes(x="", y=avg_price, fill=neighbourhood)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start=0) +
geom_text(aes(label = avg_price),
position = position_stack(vjust = 0.5)) +
labs(x = NULL, y = NULL, fill = NULL,
title = "Top 5 Cheaper Neighbourhoods") +
guides(fill = guide_legend(reverse = TRUE)) +
scale_colour_gradientn(colours=rainbow(5)) +
theme_classic() +
theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666"))
raw_bnb_costly <- raw_bnb_prep %>%
select(neighbourhood,price,id) %>%
group_by(neighbourhood) %>%
summarise(avg_price = round(mean(price))) %>%
arrange(desc(avg_price)) %>%
do(head(.,n=5))
ggplot(raw_bnb_costly, aes(x="", y=avg_price, fill=neighbourhood)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start=0) +
geom_text(aes(label = avg_price),
position = position_stack(vjust = 0.5)) +
labs(x = NULL, y = NULL, fill = NULL,
title = "Top 5 Costlier Neighbourhoods") +
guides(fill = guide_legend(reverse = TRUE)) +
scale_colour_gradientn(colours=rainbow(5)) +
theme_classic() +
theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666"))
Analysis 2 - Top 10 Neighbourhoods who provide more accomodation
raw_bnb_no_of_rooms <- raw_bnb_prep %>%
select(neighbourhood,id,room_type) %>%
group_by(neighbourhood) %>%
summarise(no_of_rooms = n()) %>%
arrange(desc(no_of_rooms)) %>%
do(head(.,n=10))
ggplot(raw_bnb_no_of_rooms, aes(x=neighbourhood, y=no_of_rooms)) +
geom_bar(stat="identity",fill="#002266") +coord_flip() +
geom_text(aes(label=no_of_rooms), vjust=-0.3, color="Black", size=3.5) + theme_minimal() + theme(axis.line = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666"))
Analysis 3 - Top 10 Host with More reviews
raw_bnb_hosts_reviews <- raw_bnb_prep %>%
select(host_name,number_of_reviews) %>%
group_by(host_name) %>%
summarise(no_of_reviews = sum(number_of_reviews)) %>%
arrange(desc(no_of_reviews)) %>%
do(head(.,n=10))
ggplot(raw_bnb_hosts_reviews, aes(x=host_name, y=no_of_reviews)) +
geom_bar(stat="identity",fill="#992266") + coord_flip() +
geom_text(aes(label=no_of_reviews), vjust=-0.3, color="Black", size=3.5) + theme_minimal() + theme(axis.line = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666"))
Below are different Analysis carried out in the Montgomery County Crash Database.
raw_crash_prep <- data.table(read.csv("https://raw.githubusercontent.com/jey1987/DATA607/master/Week6_Project/Crash_Reporting_-_Incidents_Data.csv")) %>%
select(Crash.Date.Time,Report.Number,Local.Case.Number,Hit.Run,Mile.Point.Direction,Light,Traffic.Control,Driver.Substance.Abuse) %>%
mutate(Crash.Year = format(as.Date(Crash.Date.Time, format="%d/%m/%Y"),"%Y")) %>%
mutate(Crash.Month = format(as.Date(Crash.Date.Time, format="%d/%m/%Y"),"%m"))
Analysis 1 - Number of Crashses by Year
raw_crash_by_year <- raw_crash_prep %>%
select(Local.Case.Number,Crash.Year) %>%
group_by(Crash.Year) %>%
summarise(no_by_year = n())
ggplot(raw_crash_by_year, aes(x=Crash.Year, y=no_by_year)) + geom_point() + geom_text(aes(label = no_by_year)) + theme_minimal() + ggtitle("Number of Crashes by Year")
Analysis 2 - Number of Crashses by Mile Point Directions
raw_crash_by_mile <- raw_crash_prep %>%
select(Local.Case.Number,Mile.Point.Direction) %>%
group_by(Mile.Point.Direction) %>%
summarise(no_by_mile = n())
ggplot(raw_crash_by_mile, aes(x="", y=no_by_mile, fill=Mile.Point.Direction)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start=0) +
geom_text(aes(label = no_by_mile),
position = position_stack(vjust = 0.5)) +
labs(x = NULL, y = NULL, fill = NULL,
title = "Crashes by Mile Point") +
guides(fill = guide_legend(reverse = TRUE)) +
scale_colour_gradientn(colours=rainbow(5)) +
theme_classic() +
theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666")) + ggtitle("Number of Crashes by Mile Point")
Analysis 3 - Number of Hit and Runs over months in 2019 under different light conditions
raw_crash_by_hit <- raw_crash_prep %>%
select(Local.Case.Number,Hit.Run,Crash.Month,Crash.Year,Light) %>%
filter(Hit.Run=="Yes") %>%
filter(Crash.Year=="2019") %>%
group_by(Crash.Month,Crash.Year,Light) %>%
summarise(no_by_hit = n())
ggplot(raw_crash_by_hit, aes(fill=Light, y=no_by_hit, x=Crash.Month)) + geom_bar( stat="identity", position="fill") + theme(axis.text.x = element_text(angle=90)) + xlab("Month") + ylab("Crashes") + ggtitle("Crashes in 2019 by Month and Light Condition")