Name of the student: "Mary Chaitra"
Reg No: "2023MDTS07ALA033"
Assignment submitted to : "K A Venkatesh"
Program Name & Semester and University Name: "MSc DataScience, 3rd semester, Alliance University"
Date of submission: "2024-09-29"
The "Liquor Licenses" dataset provides information on the locations and owners of liquor license holders within Baltimore City. It includes details such as license types, and locations, and is intended for public access.
It is categorized as geospatial data and is maintained by the City of Baltimore.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ neuralnet::compute() masks dplyr::compute()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the dataset
LL<- read.csv(file.choose())
head(LL)
## LLKey CorpKey LicenseClass SubClass LicenseNumber LicenseDate
## 1 100005 201005 LBD7 BWL 171 2003/05/01 04:00:00+00
## 2 100005 201005 LBD7 BWL 171 2003/05/01 04:00:00+00
## 3 100026 201026 LBD7 BWL 403 2003/05/01 04:00:00+00
## 4 100034 201034 WB BW 6 2003/05/01 04:00:00+00
## 5 100034 201034 WB BW 6 2003/05/01 04:00:00+00
## 6 100035 201035 LBD7 BWL 159 2003/05/01 04:00:00+00
## LicenseEndDate LicenseYear LicenseFee CertificateNumber LicenseStatus
## 1 2004/04/30 04:00:00+00 2003 1320 1272 Renewed
## 2 2004/04/30 04:00:00+00 2003 1320 1272 Renewed
## 3 2004/04/30 04:00:00+00 2003 1320 1506 Renewed
## 4 2004/04/30 04:00:00+00 2003 165 0066 Renewed
## 5 2004/04/30 04:00:00+00 2003 165 0066 Renewed
## 6 2004/04/30 04:00:00+00 2003 1320 1260 Renewed
## LicenseeFirstName LicenseeLastName TradeName
## 1 JOSEPH O. MYNAR RED ROOSTER
## 2 STEPHEN MYNAR RED ROOSTER
## 3 JOY MARTIN (IND&PR) CLUB CHARLES-ZODIAC
## 4 BOK RYE OH CHARLIE & DEE CARRYOUT
## 5 YOUNG H. OH CHARLIE & DEE CARRYOUT
## 6 EDWARD BAZEMORE GLASS DOOR LOUNGE
## CorpName EstablishmentDesc DayPerWeek Description
## 1 RED ROOSTER, INC. Tavern 7 days Beer, Wine, & Liquor
## 2 RED ROOSTER, INC. Tavern 7 days Beer, Wine, & Liquor
## 3 MARTIN & KAHOE, INC. Tavern 7 days Beer, Wine, & Liquor
## 4 N/A Restaurant License 7 days Beer & Light Wine
## 5 N/A Restaurant License 7 days Beer & Light Wine
## 6 GLASS DOOR LOUNGE, INC. Tavern 7 days Beer, Wine, & Liquor
## AddrStreet AddrZip ESRI_OID
## 1 6412-16 FRANKFORD AVENUE 21206 3
## 2 6412-16 FRANKFORD AVENUE 21206 1
## 3 1724-26 CHARLES STREET NORTH 21201 2
## 4 1539 BUSH STREET 21230 4
## 5 1539 BUSH STREET 21230 5
## 6 1263 FAYETTE STREET EAST 21202 6
View(LL)
str(LL)
## 'data.frame': 29603 obs. of 21 variables:
## $ LLKey : int 100005 100005 100026 100034 100034 100035 100035 100038 100038 100039 ...
## $ CorpKey : int 201005 201005 201026 201034 201034 201035 201035 201038 201038 201039 ...
## $ LicenseClass : chr "LBD7" "LBD7" "LBD7" "WB" ...
## $ SubClass : chr "BWL" "BWL" "BWL" "BW" ...
## $ LicenseNumber : int 171 171 403 6 6 159 159 391 391 530 ...
## $ LicenseDate : chr "2003/05/01 04:00:00+00" "2003/05/01 04:00:00+00" "2003/05/01 04:00:00+00" "2003/05/01 04:00:00+00" ...
## $ LicenseEndDate : chr "2004/04/30 04:00:00+00" "2004/04/30 04:00:00+00" "2004/04/30 04:00:00+00" "2004/04/30 04:00:00+00" ...
## $ LicenseYear : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
## $ LicenseFee : num 1320 1320 1320 165 165 1320 1320 1320 1320 1320 ...
## $ CertificateNumber: chr "1272" "1272" "1506" "0066" ...
## $ LicenseStatus : chr "Renewed" "Renewed" "Renewed" "Renewed" ...
## $ LicenseeFirstName: chr "JOSEPH O." "STEPHEN" "JOY" "BOK RYE" ...
## $ LicenseeLastName : chr "MYNAR" "MYNAR" "MARTIN (IND&PR)" "OH" ...
## $ TradeName : chr "RED ROOSTER" "RED ROOSTER" "CLUB CHARLES-ZODIAC" "CHARLIE & DEE CARRYOUT" ...
## $ CorpName : chr "RED ROOSTER, INC." "RED ROOSTER, INC." "MARTIN & KAHOE, INC." "N/A" ...
## $ EstablishmentDesc: chr "Tavern" "Tavern" "Tavern" "Restaurant License" ...
## $ DayPerWeek : chr "7 days" "7 days" "7 days" "7 days" ...
## $ Description : chr "Beer, Wine, & Liquor" "Beer, Wine, & Liquor" "Beer, Wine, & Liquor" "Beer & Light Wine" ...
## $ AddrStreet : chr "6412-16 FRANKFORD AVENUE" "6412-16 FRANKFORD AVENUE" "1724-26 CHARLES STREET NORTH" "1539 BUSH STREET" ...
## $ AddrZip : int 21206 21206 21201 21230 21230 21202 21202 21231 21231 21201 ...
## $ ESRI_OID : int 3 1 2 4 5 6 7 8 9 10 ...
# Check for NA values
sum(is.na(LL))
## [1] 1
glimpse(LL)
## Rows: 29,603
## Columns: 21
## $ LLKey <int> 100005, 100005, 100026, 100034, 100034, 100035, 1000…
## $ CorpKey <int> 201005, 201005, 201026, 201034, 201034, 201035, 2010…
## $ LicenseClass <chr> "LBD7", "LBD7", "LBD7", "WB", "WB", "LBD7", "LBD7", …
## $ SubClass <chr> "BWL", "BWL", "BWL", "BW", "BW", "BWL", "BWL", "BWL"…
## $ LicenseNumber <int> 171, 171, 403, 6, 6, 159, 159, 391, 391, 530, 530, 4…
## $ LicenseDate <chr> "2003/05/01 04:00:00+00", "2003/05/01 04:00:00+00", …
## $ LicenseEndDate <chr> "2004/04/30 04:00:00+00", "2004/04/30 04:00:00+00", …
## $ LicenseYear <int> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003…
## $ LicenseFee <dbl> 1320.0, 1320.0, 1320.0, 165.0, 165.0, 1320.0, 1320.0…
## $ CertificateNumber <chr> "1272", "1272", "1506", "0066", "0066", "1260", "126…
## $ LicenseStatus <chr> "Renewed", "Renewed", "Renewed", "Renewed", "Renewed…
## $ LicenseeFirstName <chr> "JOSEPH O.", "STEPHEN", "JOY", "BOK RYE", "YOUNG H."…
## $ LicenseeLastName <chr> "MYNAR", "MYNAR", "MARTIN (IND&PR)", "OH", "OH", "BA…
## $ TradeName <chr> "RED ROOSTER", "RED ROOSTER", "CLUB CHARLES-ZODIAC",…
## $ CorpName <chr> "RED ROOSTER, INC.", "RED ROOSTER, INC.", "MARTIN & …
## $ EstablishmentDesc <chr> "Tavern", "Tavern", "Tavern", "Restaurant License", …
## $ DayPerWeek <chr> "7 days", "7 days", "7 days", "7 days", "7 days", "7…
## $ Description <chr> "Beer, Wine, & Liquor", "Beer, Wine, & Liquor", "Bee…
## $ AddrStreet <chr> "6412-16 FRANKFORD AVENUE", "6412-16 FRANKFORD AVENU…
## $ AddrZip <int> 21206, 21206, 21201, 21230, 21230, 21202, 21202, 212…
## $ ESRI_OID <int> 3, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
summary(LL)
## LLKey CorpKey LicenseClass SubClass
## Min. :100005 Min. :201000 Length:29603 Length:29603
## 1st Qu.:310363 1st Qu.:201469 Class :character Class :character
## Median :319017 Median :201942 Mode :character Mode :character
## Mean :312444 Mean :201966
## 3rd Qu.:325450 3rd Qu.:202423
## Max. :330520 Max. :203173
##
## LicenseNumber LicenseDate LicenseEndDate LicenseYear
## Min. : 1.0 Length:29603 Length:29603 Min. :2002
## 1st Qu.: 25.0 Class :character Class :character 1st Qu.:2010
## Median : 91.0 Mode :character Mode :character Median :2016
## Mean :129.3 Mean :2015
## 3rd Qu.:210.0 3rd Qu.:2021
## Max. :538.0 Max. :2024
## NA's :1
## LicenseFee CertificateNumber LicenseStatus LicenseeFirstName
## Min. : 0 Length:29603 Length:29603 Length:29603
## 1st Qu.: 550 Class :character Class :character Class :character
## Median : 1100 Mode :character Mode :character Mode :character
## Mean : 1242
## 3rd Qu.: 1320
## Max. :22280
##
## LicenseeLastName TradeName CorpName EstablishmentDesc
## Length:29603 Length:29603 Length:29603 Length:29603
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## DayPerWeek Description AddrStreet AddrZip
## Length:29603 Length:29603 Length:29603 Min. :21201
## Class :character Class :character Class :character 1st Qu.:21205
## Mode :character Mode :character Mode :character Median :21217
## Mean :21216
## 3rd Qu.:21224
## Max. :21251
##
## ESRI_OID
## Min. : 1
## 1st Qu.: 3701
## Median : 7401
## Mean : 8675
## 3rd Qu.:13702
## Max. :21103
##
# Summarize missing values for each column
colSums(is.na(LL))
## LLKey CorpKey LicenseClass SubClass
## 0 0 0 0
## LicenseNumber LicenseDate LicenseEndDate LicenseYear
## 1 0 0 0
## LicenseFee CertificateNumber LicenseStatus LicenseeFirstName
## 0 0 0 0
## LicenseeLastName TradeName CorpName EstablishmentDesc
## 0 0 0 0
## DayPerWeek Description AddrStreet AddrZip
## 0 0 0 0
## ESRI_OID
## 0
# Show only columns with missing values
missing_values <- colSums(is.na(LL))
missing_values[missing_values > 0]
## LicenseNumber
## 1
# Replace all NA values with 0
LL[is.na(LL)] <- 0
colSums(is.na(LL))
## LLKey CorpKey LicenseClass SubClass
## 0 0 0 0
## LicenseNumber LicenseDate LicenseEndDate LicenseYear
## 0 0 0 0
## LicenseFee CertificateNumber LicenseStatus LicenseeFirstName
## 0 0 0 0
## LicenseeLastName TradeName CorpName EstablishmentDesc
## 0 0 0 0
## DayPerWeek Description AddrStreet AddrZip
## 0 0 0 0
## ESRI_OID
## 0
EXPLORATORY DATA ANALYSIS
ggplot(LL, aes(x = LicenseClass)) +
geom_bar() +
theme_minimal() +
labs(title = "Distribution of Liquor License Types", x = "License class", y = "Count")
# Scatter plot for License Year vs License Fee
ggplot(LL, aes(x = LicenseYear, y = LicenseFee)) +
geom_point(alpha = 0.5, color = 'maroon') +
theme_minimal() +
labs(title = "Scatter Plot of License Year vs License Fee",
x = "License Year", y = "License Fee")
ggplot(LL, aes(x = EstablishmentDesc)) +
geom_bar() +
theme_minimal() +
labs(title = "Distribution of Establishment of Liquor License ", x = "EstablishmentDesc", y = "Count")
Linear Regression Model
# Linear regression: predicting License_Fee using License_Year
linear_model <- lm(LicenseFee ~ LicenseYear, data = LL)
# View model summary
summary(linear_model)
##
## Call:
## lm(formula = LicenseFee ~ LicenseYear, data = LL)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1547.4 -612.3 -183.2 234.7 20694.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -76366.464 2562.291 -29.80 <2e-16 ***
## LicenseYear 38.514 1.272 30.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1361 on 29601 degrees of freedom
## Multiple R-squared: 0.03006, Adjusted R-squared: 0.03003
## F-statistic: 917.4 on 1 and 29601 DF, p-value: < 2.2e-16
The linear regression model shows a low R-squared value of 0.03006, indicating that only about 3% of the variance in the license fee is explained by the license year, suggesting a weak relationship between these variables. The p-value (< 2.2e-16) is highly significant, meaning that the relationship, although weak, is statistically significant. The Residual Standard Error (1361) shows the typical distance that the observed license fees fall from the predicted line, suggesting that predictions will vary widely around the actual values. Overall, the model is statistically significant but not strong.
CLUSTERING
library(cluster)
## Warning: package 'cluster' was built under R version 4.3.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
LL <- LL%>% select_if(is.numeric)
LL_scaled <- scale(LL)
set.seed(123)
km.res<-kmeans(LL,4,nstart=25)
fviz_cluster(km.res,data=LL_scaled)
set.seed(123) # For reproducibility
sample_indices <- sample(1:nrow(LL_scaled), 500)
sample_LL <- LL_scaled[sample_indices, ]
dist_matrix <- dist(sample_LL)
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hc, main = "Hierarchical Clustering Dendrogram", xlab = "Sample Index", ylab = "Height")
set.seed(123) # For reproducibility
sample_indices <- sample(1:nrow(LL_scaled), 100)
sample_LL <- LL_scaled[sample_indices, ]
dist_matrix <- dist(sample_LL)
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hc, main = "Hierarchical Clustering Dendrogram", xlab = "Sample Index", ylab = "Height")
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.3.3
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
res.dist2<-dist(sample_LL, method = "euclidean")
res.hcr<-hclust(d = res.dist2, method = "ward.D2")
fviz_dend(res.hcr, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
res.hcr1<- hclust(res.dist2, method= "average")
cor(res.dist2, cophenetic(res.hcr1))
## [1] 0.8453741
fviz_dend(res.hcr, cex = 0.5, k_colors=c('brown','yellow','green','blue'), rect = TRUE)
res.dsitr<-dist(sample_LL, method = "euclidean")
hcr<-hclust(res.dsitr, method = "average")
hcr1<-hclust(res.dsitr, method = "ward.D2")
denr<- as.dendrogram(hcr)
denr2<- as.dendrogram(hcr1)
dend_listr<- dendlist(denr, denr2)
tanglegram(denr, denr2)
dim(LL)
## [1] 29603 7
sample_indices <- sample(1:nrow(LL_scaled), 50)
sample_data <- LL_scaled[sample_indices, ]
dist_matrix <- dist(sample_data)
LL1.scaled<- scale(sample_indices)
res.dsits<- dist(sample_indices, method = "euclidean")
hcs<- hclust(res.dsits, method = "average")
hcss<- hclust(res.dsits, method = "ward.D2")
den34<- as.dendrogram(hcs)
den35<- as.dendrogram(hcss)
dend_lists<- dendlist(den34, den35)
dend_lists
## [[1]]
## 'dendrogram' with 2 branches and 50 members total, at height 13877.08
##
## [[2]]
## 'dendrogram' with 2 branches and 50 members total, at height 66609.97
##
## attr(,"class")
## [1] "dendlist"
tanglegram(den34, den35)