# Work from the directory that holds the course data files.
# NOTE(review): this absolute path is machine-specific; the relative
# read_csv() path used later in the script depends on it.
setwd("/Users/devyanimardia/Downloads/DataWrangling")

# Install tidyverse only when it is missing, then always attach it.
# The previous `if (!require(...)) install.packages(...)` form installed the
# package on a fresh machine but never attached it afterwards, so the first
# run still failed at the first tidyverse call.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the 2023 restaurant inspection CSV from the working directory,
# letting readr guess the column types.
NYRestaurantInspection2023.tbl <- read_csv(
  file = "NYRestaurantInspection2023.csv"
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 210525 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): DBA, BORO, BUILDING, STREET, PHONE, CUISINE DESCRIPTION, INSPECTIO...
## dbl (8): CAMIS, ZIPCODE, SCORE, Latitude, Longitude, Community Board, BIN, BBL
## lgl (6): Location Point, Zip Codes, Community Districts, Borough Boundaries...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Explicitly coerce the loaded inspection data to a tibble before use.
NYRestaurantInspection2023.tbl <- NYRestaurantInspection2023.tbl %>%
  as_tibble()
1(a). Form a new data frame restricted to restaurants in Queens with cuisine equal to “Pizza”.
# Keep only rows for the borough "Queens" whose cuisine is recorded as "Pizza".
queenspizzarest <- NYRestaurantInspection2023.tbl %>%
  filter(
    BORO == "Queens",
    `CUISINE DESCRIPTION` == "Pizza"
  )
1(b). What are the 5 most frequently inspected restaurants (use the variable “DBA”) in the data frame?
# Flag each row whose inspection date is present.
# `is.null()` is not vectorised: applied to an existing column it returns a
# single FALSE, so the old `!is.null(...)` flag was TRUE on every row no matter
# what the data held. `is.na()` tests each row individually.
queenspizzarest <- queenspizzarest %>%
  mutate(Inspected = !is.na(`INSPECTION DATE`))
head(queenspizzarest)

# Count flagged rows per restaurant name (DBA), most frequent first.
# NOTE(review): if the file holds one row per violation rather than one row
# per inspection, n_distinct(`INSPECTION DATE`) would be the true inspection
# count -- confirm against the data dictionary.
freqinsp <- queenspizzarest %>%
  group_by(DBA) %>%
  summarise(freqrest = sum(Inspected)) %>%
  arrange(desc(freqrest))

# Show a few more than the requested five so near-duplicate names are visible.
head(freqinsp, 7)
# Sample of how the names could be normalised -- if we are sure the variants
# really are the same restaurant -- so that they group together better.
m <- queenspizzarest %>%
  mutate(
    new_DBA = str_remove(DBA, 'PIZZA$'),  # drop a trailing "PIZZA"
    new_DBA = gsub("'", "", new_DBA),     # drop apostrophes
    new_DBA = trimws(new_DBA),            # trim leftover edge whitespace
    # Per-row missingness test. The previous `!is.null(...)` evaluated to a
    # single TRUE for the whole column and was recycled to every row.
    Inspected = !is.na(`INSPECTION DATE`)
  )

# Count flagged rows per normalised name, most frequent first.
freqinspm <- m %>%
  group_by(new_DBA) %>%
  summarise(freqrest = sum(Inspected)) %>%
  arrange(desc(freqrest))
head(freqinspm, 5)
(1c) On what dates has pizza parlor “SUSANO’S PIZZERIA & RESTAURANT” been inspected?
# Rows for this one restaurant, reduced to the columns that describe each
# inspection record.
s <- queenspizzarest %>%
  filter(DBA == "SUSANO'S PIZZERIA & RESTAURANT") %>%
  select(
    DBA,
    `INSPECTION DATE`,
    `INSPECTION TYPE`,
    `VIOLATION DESCRIPTION`
  )
s

# Distinct inspection dates, in order of first appearance.
s %>%
  pull(`INSPECTION DATE`) %>%
  unique()
## [1] "01/12/2023" "05/05/2022"
# Read the tab-separated gapminder/Gini file and preview the first rows.
ginidata <- read.delim(
  "/Users/devyanimardia/Downloads/gapminder_2007_gini.tsv",
  sep = "\t"
)
head(ginidata)
(2a) Create a plot to compare the distributions of the Gini coefficient in different continents. [Hint: Use a boxplot]
# One box per continent showing the spread of the Gini coefficient;
# outliers are drawn as black crosses and the legend sits above the panel.
ginidata %>%
  ggplot(aes(x = continent, y = gini, fill = continent)) +
  geom_boxplot(
    outlier.colour = "black",
    outlier.shape = 3,
    outlier.size = 3
  ) +
  xlab("Continent") +
  ylab("gini") +
  theme(legend.position = "top")
(2b) Does the Gini coefficient appear to have any impact on the life
expectancy in 2007? Explain your answer using a plot, classified by
continents.
# 2007 only: life expectancy on x, Gini on y, with a smoothed trend line
# and a centred title.
ginidata %>%
  filter(year == 2007) %>%
  ggplot(aes(x = lifeExp, y = gini, color = gini)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Life Expectancy vs Gini in 2007") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Alternative orientation (Gini on x, life expectancy on y). The first plot
# shows the relationship more clearly: life expectancy increases as Gini
# decreases.
ginidata %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gini, y = lifeExp, color = gini)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Same scatter with points coloured by continent; this view does not seem to
# add much insight.
ginidata %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gini, y = lifeExp, color = continent)) +
  geom_point()
The Gini index does appear to have an impact on life expectancy: the two
appear to be inversely related.
library(gapminder)
options(dplyr.summarise.inform = FALSE)

# Total GDP per country-year: population times GDP per capita.
gapminder_new <- gapminder %>%
  mutate(gdp = pop * gdpPercap)

# Reference value: total GDP of the United States in 2007.
# The earlier version selected `gdpPercap` here, so the ratio below divided a
# total GDP by a per-capita figure; taking `gdp` keeps both sides of the
# division in the same unit. pull() returns a plain numeric directly, so no
# as.numeric() coercion of a one-cell tibble is needed.
gdpus <- gapminder_new %>%
  filter(country == "United States", year == 2007) %>%
  pull(gdp)

# Each country-year's GDP relative to the 2007 US GDP.
gdpratiodata <- gapminder_new %>%
  mutate(gdp_ratio = gdp / gdpus)

# Median ratio per continent and year, newest years first. Grouping is
# dropped so later steps operate on an ungrouped table.
mediangapminderdata <- gdpratiodata %>%
  group_by(continent, year) %>%
  summarise(gdpratio_median = median(gdp_ratio), .groups = "drop") %>%
  arrange(desc(year))
head(mediangapminderdata)

# One line per continent tracing the median ratio over time.
mediangapminderdata %>%
  ggplot(aes(x = year, y = gdpratio_median, color = continent)) +
  geom_point() +
  geom_line() +
  xlab("Year") +
  ylab("Median GDP_Ratio") +
  ggtitle("Median Gdp Ratio over time") +
  scale_colour_discrete(name = "Continents :") +
  theme(
    legend.position = "top",
    plot.title = element_text(size = 15, hjust = 0.5)
  )