# NOTE(review): machine-specific working directory; the relative CSV path used
# when reading the inspection data depends on it. Prefer opening the project
# from this folder (or a project-relative path) so the script runs elsewhere.
setwd("/Users/devyanimardia/Downloads/DataWrangling")

# Install tidyverse only when it is missing, then attach it unconditionally.
# `require()` merely returns FALSE when the package is absent, so the previous
# one-liner installed the package but never attached it on that first run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Read the inspection CSV from the working directory; column types are guessed
# by readr.
NYRestaurantInspection2023.tbl <- read_csv(
  file = "NYRestaurantInspection2023.csv"
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 210525 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): DBA, BORO, BUILDING, STREET, PHONE, CUISINE DESCRIPTION, INSPECTIO...
## dbl  (8): CAMIS, ZIPCODE, SCORE, Latitude, Longitude, Community Board, BIN, BBL
## lgl  (6): Location Point, Zip Codes, Community Districts, Borough Boundaries...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Coerce the loaded inspection data to a tibble.
NYRestaurantInspection2023.tbl <- NYRestaurantInspection2023.tbl %>%
  as_tibble()

1(a). Form a new data frame restricted to restaurants in Queens with cuisine equal to “Pizza”.

# Keep only the rows whose borough is Queens and whose cuisine is Pizza.
queenspizzarest <- NYRestaurantInspection2023.tbl %>%
  filter(BORO == "Queens" & `CUISINE DESCRIPTION` == "Pizza")

1(b). What are the 5 most frequently inspected restaurants (use the variable “DBA”) in the data frame?

# Flag rows that carry an inspection date. The test has to be the element-wise
# `is.na()`: `is.null()` on a column yields a single FALSE for the whole
# vector, which marked every row as inspected.
queenspizzarest <- queenspizzarest %>%
  mutate(Inspected = !is.na(`INSPECTION DATE`))
head(queenspizzarest)

# Count inspections per restaurant name as the number of DISTINCT inspection
# dates. Summing the flag only counts rows, and the same inspection date can
# appear on several rows of the data.
# NOTE(review): establishments that share a DBA (e.g. chain branches) are
# pooled here because the question asks for grouping by DBA.
# NOTE(review): the source data set may use a placeholder date for
# not-yet-inspected establishments -- confirm against the data dictionary.
freqinsp <- queenspizzarest %>%
  group_by(DBA) %>%
  summarise(freqrest = n_distinct(`INSPECTION DATE`[Inspected])) %>%
  arrange(desc(freqrest))

head(freqinsp, 7)
# Sample of how the names could be normalised -- if we are sure the variants
# really are duplicates -- so that the data group better.

# new_DBA: DBA with a trailing "PIZZA" removed, apostrophes stripped and
# surrounding whitespace trimmed.
# Inspected: TRUE for rows that carry an inspection date (element-wise
# `is.na()`; `is.null()` on a column is a single FALSE for the whole vector).
m <- queenspizzarest %>%
  mutate(
    new_DBA = trimws(gsub("'", "", str_remove(DBA, "PIZZA$"))),
    Inspected = !is.na(`INSPECTION DATE`)
  )

# Inspections per normalised name = number of distinct inspection dates; a
# plain row count would count the same inspection date once per row.
freqinspm <- m %>%
  group_by(new_DBA) %>%
  summarise(freqrest = n_distinct(`INSPECTION DATE`[Inspected])) %>%
  arrange(desc(freqrest))

head(freqinspm, 5)

(1c) On what dates has pizza parlor “SUSANO’S PIZZERIA & RESTAURANT” been inspected?

# Rows for the named restaurant, reduced to the inspection-related columns.
s <- queenspizzarest %>%
  filter(DBA == "SUSANO'S PIZZERIA & RESTAURANT") %>%
  select(
    DBA,
    `INSPECTION DATE`,
    `INSPECTION TYPE`,
    `VIOLATION DESCRIPTION`
  )
s

# The same date can appear on several rows, so list each date once.
unique(s[["INSPECTION DATE"]])
## [1] "01/12/2023" "05/05/2022"
  2. The file “gapminder_2007_gini.tsv” is in the Files > Lecture materials > Week 2 materials folder. It is a subset of the 2007 Gapminder data merged with recent Gini coefficient data (https://en.wikipedia.org/wiki/Gini_coefficient).
# Load the tab-separated Gapminder/Gini file into a data frame and preview it.
ginidata <- read.delim(
  file = "/Users/devyanimardia/Downloads/gapminder_2007_gini.tsv",
  sep = "\t"
)
head(ginidata)

(2a) Create a plot to compare the distributions of the Gini coefficient in different continents. [Hint: Use a boxplot]

# One box per continent showing the distribution of the Gini coefficient;
# outliers are drawn as black crosses.
ggplot(ginidata, aes(x = continent, y = gini, fill = continent)) +
  geom_boxplot(
    outlier.colour = "black",
    outlier.shape = 3,
    outlier.size = 3
  ) +
  labs(x = "Continent", y = "gini") +
  theme(legend.position = "top")

(2b) Does the Gini coefficient appear to have any impact on the life expectancy in 2007? Explain your answer using a plot, classified by continents.

# Scatter of Gini (y) against life expectancy (x) for 2007 with a smoothed
# trend line; points are shaded by Gini and the title is centred.
ginidata %>%
  filter(year == 2007) %>%
  ggplot(aes(x = lifeExp, y = gini, colour = gini)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Life Expectancy vs Gini in 2007") +
  theme(plot.title = element_text(size = 15, hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Alternatively, swap the axes. The first plot shows the relationship more clearly: lifeExp increases as gini decreases.

# Same 2007 data with the axes swapped: life expectancy (y) against Gini (x),
# with a smoothed trend line.
ginidata %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gini, y = lifeExp, colour = gini)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# A plot that also distinguishes the continents, although it does not seem to add much insight

# Life expectancy against Gini for 2007, with points coloured by continent.
ginidata %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gini, y = lifeExp, colour = continent)) +
  geom_point()

The Gini index appears to have an impact on life expectancy: the two appear to be inversely related, i.e. life expectancy tends to be lower in countries with a higher Gini coefficient.

  3. Using the original gapminder data frame, please generate a data frame with a new variable called gdp by multiplying the population size by the gdp per capita. To make those large numbers more understandable, please form an additional new variable called gdp_ratio equal to the gdp divided by the gdp of the United States in 2007. Find the median gdp_ratio by continent and year, and then plot the median gdp_ratio over time, distinguishing the continents. Please use both points and lines for the plot.
library(gapminder)
options(dplyr.summarise.inform = FALSE)

# Total GDP per country-year: population times GDP per capita.
gapminder_new <- gapminder %>%
  mutate(gdp = pop * gdpPercap)

# Reference value: the TOTAL gdp of the United States in 2007. Taking
# `gdpPercap` here would scale every ratio by the US per-capita figure
# instead of the US gdp.
gdpus <- gapminder_new %>%
  filter(country == "United States", year == 2007) %>%
  pull(gdp)
# Exactly one reference row is expected; fail loudly otherwise rather than
# recycling a longer (or empty) vector in the division below.
stopifnot(length(gdpus) == 1)

# gdp expressed as a fraction of the 2007 US gdp.
gdpratiodata <- gapminder_new %>%
  mutate(gdp_ratio = gdp / gdpus)

# Median gdp_ratio per continent and year, newest years first. The result is
# returned ungrouped so later steps do not inherit a leftover grouping.
mediangapminderdata <- gdpratiodata %>%
  group_by(continent, year) %>%
  summarise(gdpratio_median = median(gdp_ratio), .groups = "drop") %>%
  arrange(desc(year))

head(mediangapminderdata)

# Median gdp_ratio over time, one coloured line (with points) per continent;
# legend on top and a centred title.
ggplot(
  mediangapminderdata,
  aes(x = year, y = gdpratio_median, colour = continent)
) +
  geom_point() +
  geom_line() +
  labs(
    x = "Year",
    y = "Median GDP_Ratio",
    title = "Median Gdp Ratio over time"
  ) +
  scale_colour_discrete(name = "Continents :") +
  theme(
    legend.position = "top",
    plot.title = element_text(size = 15, hjust = 0.5)
  )