Name of the student: "Mary Chaitra"
Reg No: "2023MDTS07ALA033"
Assignment submitted to : "K A Venkatesh" 
Program Name & Semester and University Name: "MSc DataScience, 3rd semester, Alliance University"
Date of submission: "2024-09-29"

The "Liquor Licenses" dataset provides information on the locations and owners of liquor license holders within Baltimore City. It includes details such as license types, and locations, and is intended for public access. 
 It is categorized as geospatial data and is maintained by the City of Baltimore.
library(tidyverse) 
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ neuralnet::compute() masks dplyr::compute()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the dataset
LL<- read.csv(file.choose())
head(LL)
##    LLKey CorpKey LicenseClass SubClass LicenseNumber            LicenseDate
## 1 100005  201005         LBD7      BWL           171 2003/05/01 04:00:00+00
## 2 100005  201005         LBD7      BWL           171 2003/05/01 04:00:00+00
## 3 100026  201026         LBD7      BWL           403 2003/05/01 04:00:00+00
## 4 100034  201034           WB       BW             6 2003/05/01 04:00:00+00
## 5 100034  201034           WB       BW             6 2003/05/01 04:00:00+00
## 6 100035  201035         LBD7      BWL           159 2003/05/01 04:00:00+00
##           LicenseEndDate LicenseYear LicenseFee CertificateNumber LicenseStatus
## 1 2004/04/30 04:00:00+00        2003       1320              1272       Renewed
## 2 2004/04/30 04:00:00+00        2003       1320              1272       Renewed
## 3 2004/04/30 04:00:00+00        2003       1320              1506       Renewed
## 4 2004/04/30 04:00:00+00        2003        165              0066       Renewed
## 5 2004/04/30 04:00:00+00        2003        165              0066       Renewed
## 6 2004/04/30 04:00:00+00        2003       1320              1260       Renewed
##   LicenseeFirstName LicenseeLastName              TradeName
## 1         JOSEPH O.            MYNAR            RED ROOSTER
## 2           STEPHEN            MYNAR            RED ROOSTER
## 3               JOY  MARTIN (IND&PR)    CLUB CHARLES-ZODIAC
## 4           BOK RYE               OH CHARLIE & DEE CARRYOUT
## 5          YOUNG H.               OH CHARLIE & DEE CARRYOUT
## 6            EDWARD         BAZEMORE      GLASS DOOR LOUNGE
##                  CorpName  EstablishmentDesc DayPerWeek          Description
## 1       RED ROOSTER, INC.             Tavern     7 days Beer, Wine, & Liquor
## 2       RED ROOSTER, INC.             Tavern     7 days Beer, Wine, & Liquor
## 3    MARTIN & KAHOE, INC.             Tavern     7 days Beer, Wine, & Liquor
## 4                     N/A Restaurant License     7 days    Beer & Light Wine
## 5                     N/A Restaurant License     7 days    Beer & Light Wine
## 6 GLASS DOOR LOUNGE, INC.             Tavern     7 days Beer, Wine, & Liquor
##                     AddrStreet AddrZip ESRI_OID
## 1     6412-16 FRANKFORD AVENUE   21206        3
## 2     6412-16 FRANKFORD AVENUE   21206        1
## 3 1724-26 CHARLES STREET NORTH   21201        2
## 4             1539 BUSH STREET   21230        4
## 5             1539 BUSH STREET   21230        5
## 6     1263 FAYETTE STREET EAST   21202        6
View(LL)

str(LL)
## 'data.frame':    29603 obs. of  21 variables:
##  $ LLKey            : int  100005 100005 100026 100034 100034 100035 100035 100038 100038 100039 ...
##  $ CorpKey          : int  201005 201005 201026 201034 201034 201035 201035 201038 201038 201039 ...
##  $ LicenseClass     : chr  "LBD7" "LBD7" "LBD7" "WB" ...
##  $ SubClass         : chr  "BWL" "BWL" "BWL" "BW" ...
##  $ LicenseNumber    : int  171 171 403 6 6 159 159 391 391 530 ...
##  $ LicenseDate      : chr  "2003/05/01 04:00:00+00" "2003/05/01 04:00:00+00" "2003/05/01 04:00:00+00" "2003/05/01 04:00:00+00" ...
##  $ LicenseEndDate   : chr  "2004/04/30 04:00:00+00" "2004/04/30 04:00:00+00" "2004/04/30 04:00:00+00" "2004/04/30 04:00:00+00" ...
##  $ LicenseYear      : int  2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
##  $ LicenseFee       : num  1320 1320 1320 165 165 1320 1320 1320 1320 1320 ...
##  $ CertificateNumber: chr  "1272" "1272" "1506" "0066" ...
##  $ LicenseStatus    : chr  "Renewed" "Renewed" "Renewed" "Renewed" ...
##  $ LicenseeFirstName: chr  "JOSEPH O." "STEPHEN" "JOY" "BOK RYE" ...
##  $ LicenseeLastName : chr  "MYNAR" "MYNAR" "MARTIN (IND&PR)" "OH" ...
##  $ TradeName        : chr  "RED ROOSTER" "RED ROOSTER" "CLUB CHARLES-ZODIAC" "CHARLIE & DEE CARRYOUT" ...
##  $ CorpName         : chr  "RED ROOSTER, INC." "RED ROOSTER, INC." "MARTIN & KAHOE, INC." "N/A" ...
##  $ EstablishmentDesc: chr  "Tavern" "Tavern" "Tavern" "Restaurant License" ...
##  $ DayPerWeek       : chr  "7 days" "7 days" "7 days" "7 days" ...
##  $ Description      : chr  "Beer, Wine, & Liquor" "Beer, Wine, & Liquor" "Beer, Wine, & Liquor" "Beer & Light Wine" ...
##  $ AddrStreet       : chr  "6412-16 FRANKFORD AVENUE" "6412-16 FRANKFORD AVENUE" "1724-26 CHARLES STREET NORTH" "1539 BUSH STREET" ...
##  $ AddrZip          : int  21206 21206 21201 21230 21230 21202 21202 21231 21231 21201 ...
##  $ ESRI_OID         : int  3 1 2 4 5 6 7 8 9 10 ...
# Check for NA values
sum(is.na(LL))
## [1] 1
glimpse(LL)
## Rows: 29,603
## Columns: 21
## $ LLKey             <int> 100005, 100005, 100026, 100034, 100034, 100035, 1000…
## $ CorpKey           <int> 201005, 201005, 201026, 201034, 201034, 201035, 2010…
## $ LicenseClass      <chr> "LBD7", "LBD7", "LBD7", "WB", "WB", "LBD7", "LBD7", …
## $ SubClass          <chr> "BWL", "BWL", "BWL", "BW", "BW", "BWL", "BWL", "BWL"…
## $ LicenseNumber     <int> 171, 171, 403, 6, 6, 159, 159, 391, 391, 530, 530, 4…
## $ LicenseDate       <chr> "2003/05/01 04:00:00+00", "2003/05/01 04:00:00+00", …
## $ LicenseEndDate    <chr> "2004/04/30 04:00:00+00", "2004/04/30 04:00:00+00", …
## $ LicenseYear       <int> 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003…
## $ LicenseFee        <dbl> 1320.0, 1320.0, 1320.0, 165.0, 165.0, 1320.0, 1320.0…
## $ CertificateNumber <chr> "1272", "1272", "1506", "0066", "0066", "1260", "126…
## $ LicenseStatus     <chr> "Renewed", "Renewed", "Renewed", "Renewed", "Renewed…
## $ LicenseeFirstName <chr> "JOSEPH O.", "STEPHEN", "JOY", "BOK RYE", "YOUNG H."…
## $ LicenseeLastName  <chr> "MYNAR", "MYNAR", "MARTIN (IND&PR)", "OH", "OH", "BA…
## $ TradeName         <chr> "RED ROOSTER", "RED ROOSTER", "CLUB CHARLES-ZODIAC",…
## $ CorpName          <chr> "RED ROOSTER, INC.", "RED ROOSTER, INC.", "MARTIN & …
## $ EstablishmentDesc <chr> "Tavern", "Tavern", "Tavern", "Restaurant License", …
## $ DayPerWeek        <chr> "7 days", "7 days", "7 days", "7 days", "7 days", "7…
## $ Description       <chr> "Beer, Wine, & Liquor", "Beer, Wine, & Liquor", "Bee…
## $ AddrStreet        <chr> "6412-16 FRANKFORD AVENUE", "6412-16 FRANKFORD AVENU…
## $ AddrZip           <int> 21206, 21206, 21201, 21230, 21230, 21202, 21202, 212…
## $ ESRI_OID          <int> 3, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
summary(LL)
##      LLKey           CorpKey       LicenseClass         SubClass        
##  Min.   :100005   Min.   :201000   Length:29603       Length:29603      
##  1st Qu.:310363   1st Qu.:201469   Class :character   Class :character  
##  Median :319017   Median :201942   Mode  :character   Mode  :character  
##  Mean   :312444   Mean   :201966                                        
##  3rd Qu.:325450   3rd Qu.:202423                                        
##  Max.   :330520   Max.   :203173                                        
##                                                                         
##  LicenseNumber   LicenseDate        LicenseEndDate      LicenseYear  
##  Min.   :  1.0   Length:29603       Length:29603       Min.   :2002  
##  1st Qu.: 25.0   Class :character   Class :character   1st Qu.:2010  
##  Median : 91.0   Mode  :character   Mode  :character   Median :2016  
##  Mean   :129.3                                         Mean   :2015  
##  3rd Qu.:210.0                                         3rd Qu.:2021  
##  Max.   :538.0                                         Max.   :2024  
##  NA's   :1                                                           
##    LicenseFee    CertificateNumber  LicenseStatus      LicenseeFirstName 
##  Min.   :    0   Length:29603       Length:29603       Length:29603      
##  1st Qu.:  550   Class :character   Class :character   Class :character  
##  Median : 1100   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 1242                                                           
##  3rd Qu.: 1320                                                           
##  Max.   :22280                                                           
##                                                                          
##  LicenseeLastName    TradeName           CorpName         EstablishmentDesc 
##  Length:29603       Length:29603       Length:29603       Length:29603      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   DayPerWeek        Description         AddrStreet           AddrZip     
##  Length:29603       Length:29603       Length:29603       Min.   :21201  
##  Class :character   Class :character   Class :character   1st Qu.:21205  
##  Mode  :character   Mode  :character   Mode  :character   Median :21217  
##                                                           Mean   :21216  
##                                                           3rd Qu.:21224  
##                                                           Max.   :21251  
##                                                                          
##     ESRI_OID    
##  Min.   :    1  
##  1st Qu.: 3701  
##  Median : 7401  
##  Mean   : 8675  
##  3rd Qu.:13702  
##  Max.   :21103  
## 
# Summarize missing values for each column
colSums(is.na(LL))
##             LLKey           CorpKey      LicenseClass          SubClass 
##                 0                 0                 0                 0 
##     LicenseNumber       LicenseDate    LicenseEndDate       LicenseYear 
##                 1                 0                 0                 0 
##        LicenseFee CertificateNumber     LicenseStatus LicenseeFirstName 
##                 0                 0                 0                 0 
##  LicenseeLastName         TradeName          CorpName EstablishmentDesc 
##                 0                 0                 0                 0 
##        DayPerWeek       Description        AddrStreet           AddrZip 
##                 0                 0                 0                 0 
##          ESRI_OID 
##                 0
# Show only columns with missing values
missing_values <- colSums(is.na(LL))
missing_values[missing_values > 0]
## LicenseNumber 
##             1
# Replace all NA values with 0
LL[is.na(LL)] <- 0

colSums(is.na(LL))
##             LLKey           CorpKey      LicenseClass          SubClass 
##                 0                 0                 0                 0 
##     LicenseNumber       LicenseDate    LicenseEndDate       LicenseYear 
##                 0                 0                 0                 0 
##        LicenseFee CertificateNumber     LicenseStatus LicenseeFirstName 
##                 0                 0                 0                 0 
##  LicenseeLastName         TradeName          CorpName EstablishmentDesc 
##                 0                 0                 0                 0 
##        DayPerWeek       Description        AddrStreet           AddrZip 
##                 0                 0                 0                 0 
##          ESRI_OID 
##                 0

EXPLORATORY DATA ANALYSIS

ggplot(LL, aes(x = LicenseClass)) + 
  geom_bar() + 
  theme_minimal() + 
  labs(title = "Distribution of Liquor License Types", x = "License class", y = "Count")

# Scatter plot for License Year vs License Fee
ggplot(LL, aes(x = LicenseYear, y = LicenseFee)) +
  geom_point(alpha = 0.5, color = 'maroon') +
  theme_minimal() +
  labs(title = "Scatter Plot of License Year vs License Fee",
       x = "License Year", y = "License Fee")

ggplot(LL, aes(x = EstablishmentDesc)) + 
  geom_bar() + 
  theme_minimal() + 
  labs(title = "Distribution of  Establishment of Liquor License ", x = "EstablishmentDesc", y = "Count")

Linear Regression Model

# Linear regression: predicting License_Fee using License_Year
linear_model <- lm(LicenseFee ~ LicenseYear, data = LL)

# View model summary
summary(linear_model)
## 
## Call:
## lm(formula = LicenseFee ~ LicenseYear, data = LL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1547.4  -612.3  -183.2   234.7 20694.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -76366.464   2562.291  -29.80   <2e-16 ***
## LicenseYear     38.514      1.272   30.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1361 on 29601 degrees of freedom
## Multiple R-squared:  0.03006,    Adjusted R-squared:  0.03003 
## F-statistic: 917.4 on 1 and 29601 DF,  p-value: < 2.2e-16

The linear regression model shows a low R-squared value of 0.03006, indicating that only about 3% of the variance in the license fee is explained by the license year, suggesting a weak relationship between these variables. The p-value (< 2.2e-16) is highly significant, meaning that the relationship, although weak, is statistically significant. The Residual Standard Error (1361) shows the typical distance that the observed license fees fall from the predicted line, suggesting that predictions will vary widely around the actual values. Overall, the model is statistically significant but not strong.

CLUSTERING

library(cluster)
## Warning: package 'cluster' was built under R version 4.3.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
LL <- LL%>% select_if(is.numeric)
LL_scaled <- scale(LL)

set.seed(123)
km.res<-kmeans(LL,4,nstart=25)

fviz_cluster(km.res,data=LL_scaled)

set.seed(123)  # For reproducibility
sample_indices <- sample(1:nrow(LL_scaled), 500)
sample_LL <- LL_scaled[sample_indices, ]

dist_matrix <- dist(sample_LL)

hc <- hclust(dist_matrix, method = "ward.D2")

# Plot the dendrogram
plot(hc, main = "Hierarchical Clustering Dendrogram", xlab = "Sample Index", ylab = "Height")

set.seed(123)  # For reproducibility
sample_indices <- sample(1:nrow(LL_scaled), 100)
sample_LL <- LL_scaled[sample_indices, ]

dist_matrix <- dist(sample_LL)

hc <- hclust(dist_matrix, method = "ward.D2")

# Plot the dendrogram
plot(hc, main = "Hierarchical Clustering Dendrogram", xlab = "Sample Index", ylab = "Height")

library(dendextend)
## Warning: package 'dendextend' was built under R version 4.3.3
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
res.dist2<-dist(sample_LL, method = "euclidean")
res.hcr<-hclust(d = res.dist2, method = "ward.D2")
fviz_dend(res.hcr, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

res.hcr1<- hclust(res.dist2, method= "average")
cor(res.dist2, cophenetic(res.hcr1))
## [1] 0.8453741
fviz_dend(res.hcr, cex = 0.5, k_colors=c('brown','yellow','green','blue'), rect = TRUE)

res.dsitr<-dist(sample_LL, method = "euclidean")
hcr<-hclust(res.dsitr, method = "average")
hcr1<-hclust(res.dsitr, method = "ward.D2")



denr<- as.dendrogram(hcr)
denr2<- as.dendrogram(hcr1)


dend_listr<- dendlist(denr, denr2)



tanglegram(denr, denr2)

dim(LL)
## [1] 29603     7
sample_indices <- sample(1:nrow(LL_scaled), 50)
sample_data <- LL_scaled[sample_indices, ]

dist_matrix <- dist(sample_data)

LL1.scaled<- scale(sample_indices)
res.dsits<- dist(sample_indices, method = "euclidean")
hcs<- hclust(res.dsits, method = "average")
hcss<- hclust(res.dsits, method = "ward.D2")

den34<- as.dendrogram(hcs)
den35<- as.dendrogram(hcss)

dend_lists<- dendlist(den34, den35)
dend_lists
## [[1]]
## 'dendrogram' with 2 branches and 50 members total, at height 13877.08 
## 
## [[2]]
## 'dendrogram' with 2 branches and 50 members total, at height 66609.97 
## 
## attr(,"class")
## [1] "dendlist"
tanglegram(den34, den35)