library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

working directory

First, set up your working directory where you have your data file and working R file.

# Point R at the folder that holds the data file.
# NOTE(review): this absolute path looks specific to the author's machine; replace it with your own folder.
setwd("/Users/se776257/OneDrive - University of Central Florida/Desktop/Prof. An/02 Teaching/2024 Spring/PAD 7754 Quantitative Methods/Class Materials/R data/")

To check what your current working directory is:

# Print the path of the current working directory.
getwd()
## [1] "C:/Users/se776257/OneDrive - University of Central Florida/Desktop/Prof. An/02 Teaching/2024 Spring/PAD 7754 Quantitative Methods/R script"

Import data from your working folder. Make sure that you have the dataset in your working directory. I will import 2022 Housing Inventory Count (HIC) data.
* You may need to install packages to import specific types of files. For instance, to import an xlsx file, you need to install and load the "readxl" package and use read_excel("filename.xlsx").

# Read the 2022 HIC file from the working directory into a data frame.
# Character columns stay as character vectors (not factors). Spell out FALSE:
# `F` is an ordinary variable that can be reassigned, FALSE is a reserved word.
db <- read.csv("2022_HIC.csv", stringsAsFactors = FALSE)

To describe a dataset and get information on the number of cases and variables

# Compactly display the structure of db: the number of observations and
# variables, plus each column's type and first few values.
str(db)
## 'data.frame':    29496 obs. of  103 variables:
##  $ Row..                                 : int  539061 539070 539085 539119 539111 567738 545466 545469 539592 539593 ...
##  $ CocState                              : chr  "OH" "OH" "OH" "OH" ...
##  $ CoC                                   : chr  "Akron, Barberton/Summit County CoC" "Akron, Barberton/Summit County CoC" "Akron, Barberton/Summit County CoC" "Akron, Barberton/Summit County CoC" ...
##  $ Coc.ID                                : int  1350 1350 1350 1350 1350 1350 1380 1380 1380 1380 ...
##  $ HudNum                                : chr  "OH-506" "OH-506" "OH-506" "OH-506" ...
##  $ Status                                : chr  "Submitted" "Submitted" "Submitted" "Submitted" ...
##  $ year                                  : int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ Organization.ID                       : int  495 495 495 45277 6182 6182 45629 45629 45477 45477 ...
##  $ Organization.Name                     : chr  "Battered Women's Shelter" "Battered Women's Shelter" "Battered Women's Shelter" "CoC Direct Services" ...
##  $ HMIS.Org.ID                           : chr  "194" "194" "194" "222" ...
##  $ useHmisDb                             : chr  "Yes" "Yes" "Yes" "" ...
##  $ Project.ID                            : int  3313 11648 26680 150406 138239 157898 153524 153526 151415 151416 ...
##  $ Project.Name                          : chr  "Crisis Center" "Step II" "Step III" "CoC ESG COVID Hotel Assistance" ...
##  $ HMIS.Project.ID                       : chr  "195" "196" "197" "223" ...
##  $ HIC.Date                              : chr  "1/25/2022 12:00:00 AM" "1/25/2022 12:00:00 AM" "1/25/2022 12:00:00 AM" "1/25/2022 12:00:00 AM" ...
##  $ Project.Type                          : chr  "ES" "TH" "RRH" "ES" ...
##  $ Bed.Type                              : chr  "F" "" "" "V" ...
##  $ Geo.Code                              : int  390042 390042 390042 390042 399153 399153 19109 19109 19105 19105 ...
##  $ HMIS.Participating                    : int  0 0 0 1 1 1 1 1 1 1 ...
##  $ Inventory.Type                        : chr  "C" "C" "C" "C" ...
##  $ beginsOperationsWithinYear            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Target.Population                     : chr  "DV" "DV" "DV" NA ...
##  $ mcKinneyVentoEsg                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mcKinneyVentoEsgEs                    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ mcKinneyVentoEsgRrh                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ mcKinneyVentoEsgCov                   : int  1 0 0 1 0 1 1 1 1 1 ...
##  $ mcKinneyVentoEsgEsCov                 : int  1 NA NA 1 NA 0 1 0 1 0 ...
##  $ mcKinneyVentoEsgRrhCov                : int  0 NA NA 0 NA 1 0 1 0 1 ...
##  $ mcKinneyVentoCoc                      : int  0 0 1 0 1 0 0 0 0 0 ...
##  $ mcKinneyVentoCocSh                    : int  NA NA 0 NA 0 NA NA NA NA NA ...
##  $ mcKinneyVentoCocTh                    : int  NA NA 0 NA 0 NA NA NA NA NA ...
##  $ mcKinneyVentoCocPsh                   : int  NA NA 0 NA 0 NA NA NA NA NA ...
##  $ mcKinneyVentoCocRrh                   : int  NA NA 1 NA 1 NA NA NA NA NA ...
##  $ mcKinneyVentoCocSro                   : int  NA NA 0 NA 0 NA NA NA NA NA ...
##  $ mcKinneyVentoCocThRrh                 : int  NA NA 0 NA 0 NA NA NA NA NA ...
##  $ mcKinneyVentoSpC                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mcKinneyVentoS8                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mcKinneyVentoShp                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mcKinneyVentoYhdp                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mcKinneyVentoYhdpRenewals             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingVash                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingSsvf                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingGpd                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingGpdBh                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingGpdLd                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingGpdHh                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingGpdCt                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingGpdSith                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingGpdTp                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingHchv                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingHchvCrs                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingHchvSh                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingBcp                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingTlp                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingMgh                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingRhyDp                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingHopwa                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingHopwaHmv                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingHopwaPh                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingHopwaStsf               : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingHopwaTh                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingHopwaCovid              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ federalFundingPih                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingHome                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingHomeArp                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingIndianEhv               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingOther                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ federalFundingOtherSpecify            : chr  "" "" "" "" ...
##  $ housingType                           : chr  "Site-based – single site" "Site-based – single site" "Tenant-based – scattered site" "Tenant-based – scattered site" ...
##  $ Victim.Service.Provider               : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ address1                              : chr  "" "" "" "" ...
##  $ address2                              : chr  "" "" "" "" ...
##  $ city                                  : chr  "" "" "" "" ...
##  $ state                                 : chr  "" "" "" "" ...
##  $ zip                                   : int  NA NA NA 44311 44278 44278 36079 36079 36786 36786 ...
##  $ Beds.HH.w..Children                   : int  45 40 59 NA 3 6 19 8 11 0 ...
##  $ Units.HH.w..Children                  : int  12 11 19 NA 1 2 5 2 3 0 ...
##  $ Veteran.Beds.HH.w..Children           : int  0 0 0 NA 0 0 0 0 0 NA ...
##  $ Youth.Beds.HH.w..Children             : int  0 0 0 NA 3 6 0 0 0 NA ...
##  $ CH.Beds.HH.w..Children                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Beds.HH.w.o.Children                  : int  32 28 NA 6 6 4 33 10 2 1 ...
##  $ Veteran.Beds.HH.w.o.Children          : int  0 0 NA 0 0 0 0 0 0 0 ...
##  $ Youth.Beds.HH.w.o.Children            : int  0 0 NA 0 6 4 0 0 0 0 ...
##  $ CH.Beds.HH.w.o.Children               : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Beds.HH.w..only.Children              : int  NA 0 NA NA NA NA 0 0 0 0 ...
##  $ CH.Beds.HH.w.only.Children            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Year.Round.Beds                       : int  77 68 59 6 9 10 52 18 13 1 ...
##  $ DV.Beds                               : int  77 68 59 NA NA NA NA NA NA NA ...
##  $ Total.Seasonal.Beds                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Availability.Start.Date               : chr  "" "" "" "" ...
##  $ Availability.End.Date                 : chr  "" "" "" "" ...
##  $ O.V.Beds                              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PIT.Count                             : int  24 20 59 6 9 10 52 18 13 1 ...
##  $ Total.Beds                            : int  77 68 59 6 9 10 52 18 13 1 ...
##  $ Updated.On                            : chr  "8/16/2022 7:56:52 PM" "8/16/2022 7:56:30 PM" "4/19/2022 8:49:55 PM" "4/26/2022 3:56:00 PM" ...
##  $ mergedDefunctYear                     : logi  NA NA NA NA NA NA ...
##  $ questionUsesDescriptorElements        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ questionDesiresToUseDescriptorElements: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ notes                                 : chr  "BWS has ES, TH and RRH projects." "BWS has ES, TH and RRH projects. Due to staff turnover and eligibility requirements, the TH project was not uti"| __truncated__ "" "" ...
##   [list output truncated]

How many housing projects are operated in Florida?

# Count the rows (projects) recorded under each state code.
table(db[["CocState"]])
## 
##   AK   AL   AR   AZ   CA   CO   CT   DC   DE   FL   GA   GU   HI   IA   ID   IL 
##  156  203  100  409 4666  476  525  360  109 1257  646   29  169  251  115  869 
##   IN   KS   KY   LA   MA   MD   ME   MI   MN   MO   MS   MT   NC   ND   NE   NH 
##  388  161  336  307 1004  454  252  954 1213  566  101  106  731   91  211  186 
##   NJ   NM   NV   NY   OH   OK   OR   PA   PR   RI   SC   SD   TN   TX   UT   VA 
##  798  200  206 2374  907  205  819 1226  138  141  230   87  387 1015  199  630 
##   VI   VT   WA   WI   WV   WY 
##    6  227 1559  497  168   76

If you want to only select cases in Florida

# Keep only the rows whose state code is Florida.
df <- filter(db, CocState == "FL")

Making graphs

categorical variable

Let’s focus on the type of shelter/housing services

# Frequency of each project type among the Florida rows.
table(df[["Project.Type"]])
## 
##  ES OPH PSH RRH  SH  TH 
## 339  68 298 329   7 216

Simple bar chart of frequencies:

# One bar per project type; bar height is the number of rows.
ggplot(df, aes(x = Project.Type)) +
  geom_bar()

If instead we want to have proportions:

# Same bars, but on a proportion scale. `group = 1` makes the proportions
# be computed over all rows rather than within each bar.
ggplot(df, aes(x = Project.Type, y = after_stat(prop), group = 1)) +
  geom_bar()

If we want to change the order of the bars in the bar chart:

# Desired left-to-right order of the project types on the x axis.
MyVar_order <- c("ES", "SH", "TH", "PSH", "OPH", "RRH")

# Passing the vector as the discrete scale's limits fixes the bar order.
ggplot(df, aes(x = Project.Type)) +
  geom_bar() +
  scale_x_discrete(limits = MyVar_order)

continuous variable

Now let’s go with number of beds

# Five-number summary plus mean of the total bed count per project.
summary(df[["Total.Beds"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   10.00   20.00   39.43   44.00 1010.00

Let’s plot a histogram

# Histogram of total beds, using bins that are 5 beds wide.
ggplot(df, aes(x = Total.Beds)) +
  geom_histogram(binwidth = 5)

`binwidth` controls the width of each bin. We can limit the displayed range of the number of beds to 500 to see the pattern without extreme outliers.

# Histogram with 15-bed bins and blue bar outlines. coord_cartesian() zooms
# the view to 0-500 on both axes without dropping any observations.
ggplot(df, aes(x = Total.Beds)) +
  geom_histogram(colour = "blue", binwidth = 15) +
  coord_cartesian(xlim = c(0, 500), ylim = c(0, 500))

# Histogram on the density scale with a kernel density curve drawn on top.
# - after_stat(density) replaces the deprecated `..density..` notation.
# - geom_histogram() has no xlim/ylim arguments (they were silently ignored),
#   so the zoom is applied with coord_cartesian() instead.
# - Only the x axis is limited: with binwidth = 1 the histogram density cannot
#   exceed 1, so a 0-500 y range would flatten the whole plot.
ggplot(df, aes(x = Total.Beds)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 1,
                 colour = "grey", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  coord_cartesian(xlim = c(0, 500))