small-scale census analysis

Analysis

I chose the dataset named Small_Areas_(Census_2011)_FCC, available at https://data.smartdublin.ie/dataset/small-areas-census-2011-fcc

Administrative Boundaries as per the Small Areas (Census 2011) in the Fingal County Council Area. Small Areas are areas of population comprising between 50 and 200 dwellings, created by the National Institute of Regional and Spatial Analysis (NIRSA) on behalf of the Ordnance Survey Ireland (OSi) in consultation with the CSO. Small Areas were designed as the lowest level of geography for the compilation of statistics in line with data protection, and generally comprise either complete or part of townlands or neighbourhoods. There is a constraint on Small Areas that they must nest within Electoral Division boundaries. The small area boundaries have been amended in line with population data from Census 2011.

# Silence all warnings for the rest of the session.
options(warn = -1)

# Packages used by the analysis below.
library(ggplot2)
library(forcats)
library(dplyr)
library(tidytext)
library(tidyr)
library(wesanderson)

# Read the raw export, discard the 1st and 26th columns (OBJECTID and GlobalID
# according to the variable list given later in this document), then keep one
# copy of each distinct row.
data <- read.csv("C:/Users/13587/Desktop/Small_Areas_(Census_2011)_FCC.csv")
data <- data[, -c(1, 26)]
df <- unique(data)

# Number of rows per EDNAME value.
table(df[["EDNAME"]])

## 
##                    Airport           Balbriggan Rural 
##                         13                         48 
##           Balbriggan Urban                   Baldoyle 
##                         28                         27 
##                 Balgriffin                Ballyboghil 
##                          6                          3 
##                 Balscadden  Blanchardstown-Abbotstown 
##                          2                         17 
##  Blanchardstown-Blakestown    Blanchardstown-Coolmine 
##                        120                         36 
##     Blanchardstown-Corduff     Blanchardstown-Delwood 
##                         12                         18 
##  Blanchardstown-Mulhuddart    Blanchardstown-Roselawn 
##                         13                          6 
## Blanchardstown-Tyrrelstown    Castleknock-Knockmaroon 
##                          6                         64 
##           Castleknock-Park                 Clonmethan 
##                         18                          3 
##                   Donabate                     Dubber 
##                         30                         21 
##                 Garristown                  Hollywood 
##                          4                          4 
##                Holmpatrick                      Howth 
##                         10                         32 
##               Kilsallaghan                   Kinsaley 
##                          6                         33 
##                Lucan North                       Lusk 
##                          4                         27 
##              Malahide East              Malahide West 
##                         27                         23 
##          Portmarnock North          Portmarnock South 
##                         14                         12 
##                       Rush                   Skerries 
##                         30                         31 
##                     Sutton             Swords-Forrest 
##                         22                         45 
##            Swords-Glasmore          Swords-Lissenhall 
##                         25                         33 
##             Swords-Seatown             Swords Village 
##                         23                         11 
##                   The Ward                   Turnapin 
##                         25                          6

# Number of rows per CSOED code.
table(df[["CSOED"]])

## 
## 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 
##   13   48   28   27    6    3    2   17  120   36   12   18   13    6    6   64 
## 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 
##   18    3   30   21    4    4   10   32    6   33    4   27   27   23   14   12 
## 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 
##   30   31   22   45   25   33   23   11   25    6

# Keep the three identifier columns and the six measures analysed below.
df_new <- df[c(
  "CSOED", "OSIED", "SMALL_AREA",
  "Male2011", "Female2011", "Total2011",
  "Occupancy_Rate", "PopulationpersqKm", "Labour_Market"
)]
summary(df_new)

##      CSOED          OSIED         SMALL_AREA           Male2011    
##  Min.   :4001   Min.   :267001   Length:938         Min.   : 39.0  
##  1st Qu.:4009   1st Qu.:267028   Class :character   1st Qu.:117.0  
##  Median :4019   Median :267065   Mode  :character   Median :140.0  
##  Mean   :4021   Mean   :267071                      Mean   :143.4  
##  3rd Qu.:4033   3rd Qu.:267118                      3rd Qu.:166.0  
##  Max.   :4042   Max.   :267160                      Max.   :580.0  
##    Female2011      Total2011      Occupancy_Rate  PopulationpersqKm 
##  Min.   : 42.0   Min.   :  82.0   Min.   :1.323   Min.   :   22.39  
##  1st Qu.:122.0   1st Qu.: 240.0   1st Qu.:2.581   1st Qu.: 2689.78  
##  Median :145.0   Median : 287.0   Median :2.928   Median : 5768.52  
##  Mean   :148.7   Mean   : 292.1   Mean   :2.942   Mean   : 5856.30  
##  3rd Qu.:171.0   3rd Qu.: 338.8   3rd Qu.:3.265   3rd Qu.: 8477.57  
##  Max.   :469.0   Max.   :1049.0   Max.   :6.112   Max.   :25040.36  
##  Labour_Market  
##  Min.   : 58.0  
##  1st Qu.:160.0  
##  Median :191.0  
##  Mean   :196.8  
##  3rd Qu.:228.0  
##  Max.   :738.0

# Box plots of three measures, one box per CSOED code. The fill legend is
# hidden because it would only repeat the x axis. geom_boxplot() already uses
# stat = "boxplot" and position = "dodge2", so they are not spelled out.
ggplot(df, aes(x = factor(CSOED), y = Total2011, fill = factor(CSOED))) +
  geom_boxplot() +
  labs(title = "Total2011 Boxplot", x = "CSOED") +
  theme(legend.position = "none")

ggplot(df, aes(x = factor(CSOED), y = Labour_Market, fill = factor(CSOED))) +
  geom_boxplot() +
  labs(title = "Labour_Market Boxplot", x = "CSOED") +
  theme(legend.position = "none")

ggplot(df, aes(x = factor(CSOED), y = PopulationpersqKm, fill = factor(CSOED))) +
  geom_boxplot() +
  labs(title = "PopulationpersqKm Boxplot", x = "CSOED") +
  theme(legend.position = "none")

# Treat the three area identifiers as categorical variables.
df_new$CSOED <- as.factor(df_new$CSOED)
df_new$OSIED <- as.factor(df_new$OSIED)
df_new$SMALL_AREA <- as.factor(df_new$SMALL_AREA)

# Columns are always reached through their data frame (df_new$..., dplyr
# verbs, aes()), so df_new is not attached to the search path.

# df1: one row per CSOED, every remaining column summed over its small areas.
# NOTE(review): Occupancy_Rate and PopulationpersqKm are summed as well, so in
# df1 they are totals of per-small-area ratios rather than division-level
# rates -- confirm this is what the later plots are meant to show.
df1 <- df_new %>%
  select(-c(OSIED, SMALL_AREA)) %>%
  group_by(CSOED) %>%
  summarise(across(everything(), sum), .groups = "drop") %>%
  as.data.frame()

# df3: the same aggregation keyed by SMALL_AREA instead of CSOED.
df3 <- df_new %>%
  select(-c(CSOED, OSIED)) %>%
  group_by(SMALL_AREA) %>%
  summarise(across(everything(), sum), .groups = "drop") %>%
  as.data.frame()

The original data has 1876 obs. of 28 variables: “OBJECTID” “NUTS1” “NUTS1NAME” “NUTS2” “NUTS2NAME” “NUTS3”
“NUTS3NAME” “COUNTY” “COUNTYNAME” “CSOED” “OSIED” “EDNAME”
“SMALL_AREA” “Male2011” “Female2011” “Total2011” “PPOcc2011” “Unocc2011”
“HS2011” “Vacant2011” “PCVac2011” “CREATEDATE” “Occupancy_Rate” “PopulationpersqKm” “Labour_Market” “GlobalID” “Shape__Area” “Shape__Length”
All rows belong to the same area: Dublin, Southern and Eastern, county 4 (Fingal). The data have three categorical variables (factors): OSIED, CSOED and SMALL_AREA; CSOED runs from 4001 to 4042. Each row describes a different small area (the raw file contains duplicated rows). CSOED and OSIED correspond to each other one-to-one but use different level codes. Leaving aside the other categorical variables, I chose “CSOED”, “OSIED”, “SMALL_AREA”, “Male2011”, “Female2011”, “Total2011”, “Occupancy_Rate”, “PopulationpersqKm” and “Labour_Market” as the variables of interest for the analysis. I then aggregated the data by CSOED.

# Summary of df1, which holds one row per CSOED with summed columns.
summary(df1)

##      CSOED       Male2011       Female2011      Total2011     Occupancy_Rate   
##  4001   : 1   Min.   :  346   Min.   :  321   Min.   :  667   Min.   :  6.704  
##  4002   : 1   1st Qu.: 1121   1st Qu.: 1178   1st Qu.: 2299   1st Qu.: 23.349  
##  4003   : 1   Median : 2616   Median : 2751   Median : 5366   Median : 54.173  
##  4004   : 1   Mean   : 3202   Mean   : 3322   Mean   : 6524   Mean   : 65.710  
##  4005   : 1   3rd Qu.: 4076   3rd Qu.: 4301   3rd Qu.: 8314   3rd Qu.: 85.288  
##  4006   : 1   Max.   :17730   Max.   :18327   Max.   :36057   Max.   :377.939  
##  (Other):36                                                                    
##  PopulationpersqKm  Labour_Market  
##  Min.   :    96.8   Min.   :  437  
##  1st Qu.: 28154.0   1st Qu.: 1511  
##  Median :104736.1   Median : 3707  
##  Mean   :130790.6   Mean   : 4395  
##  3rd Qu.:163739.8   3rd Qu.: 5355  
##  Max.   :878129.9   Max.   :24293  
##

# Draw the four df1 histograms in a 2 x 2 grid, then put the previous
# graphics parameters back so later base plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))
hist(df1$Total2011)
hist(df1$Occupancy_Rate)
hist(df1$PopulationpersqKm)
hist(df1$Labour_Market)
par(old_par)

# CSOED and OSIED are category codes stored as numbers in df. Count rows per
# code with a bar chart: a histogram would bin neighbouring codes together
# (30 bins by default) and misreport the count for individual codes.
# Axis labels are rotated so the 42 codes stay legible.
ggplot(df, aes(x = factor(CSOED))) +
  geom_bar(fill = "cornflowerblue", color = "white") +
  labs(title = "count", x = "CSOED") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

ggplot(df, aes(x = factor(OSIED))) +
  geom_bar(fill = "cornflowerblue", color = "white") +
  labs(title = "count", x = "OSIED") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Histograms of the two categorical variables CSOED and OSIED. We can see that CSOED ranges from 4001 to 4042 and OSIED ranges from 267001 to 267160. Each has 42 different levels. The histograms show that the variables are clustered in ranges with small values; they are not normally distributed and are skewed to the right.

# Occupation rate & Total population ----
# df1$CSOED is a factor here, so as.numeric() on its own would return the
# level positions (1, 2, ...) instead of the codes. Going through character
# recovers the real CSOED values for the continuous x axis.
df1$CSOED <- as.numeric(as.character(df1$CSOED))

g <- ggplot(df1, aes(CSOED, fill = CSOED)) +
  geom_bar(aes(y = Total2011), stat = "identity") +
  # Occupancy_Rate is multiplied by 100 to share the population axis; the
  # secondary axis divides by 100 again to show it on its own scale.
  geom_line(
    aes(y = 100 * Occupancy_Rate),
    stat = "identity", color = "red", size = 0.7
  ) +
  # No size aesthetic is mapped, so labs() carries only title and axis names.
  labs(
    title = "Occupation rate & Total population",
    x = "CSOED",
    y = "Total population"
  ) +
  scale_y_continuous(sec.axis = sec_axis(~ . / 100, name = "Percentage"))

# Shared text sizes; the axis-specific settings take precedence over
# axis.title for the x and y titles.
My_Theme <- theme(
  axis.title = element_text(size = 20),
  axis.title.x = element_text(size = 14),
  axis.text.x = element_text(size = 10),
  axis.title.y = element_text(size = 14)
)
g + My_Theme

# Back to a factor so CSOED is treated as discrete in the bar plots below.
df1$CSOED <- as.factor(df1$CSOED)
# median & Labour_Market ----
# Bars are ordered by Labour_Market and coloured by CSOED. fill is mapped
# inside aes() (an argument to ggplot() outside aes() is ignored), and a
# single geom_col() layer draws the bars once.
g1 <- df1 %>%
  mutate(CSOED = fct_reorder(CSOED, Labour_Market, .fun = "median")) %>%
  ggplot(aes(CSOED, Labour_Market, fill = CSOED)) +
  geom_col() +
  labs(title = "Labour Market Bar Plot", x = "CSOED", y = "Labour Market") +
  theme(legend.position = "none")
g1

# population ----
# Order CSOED once, by the plotted variable, so the bar order and the fill
# colours follow the same ranking (as in the employment plot).
df1 %>%
  mutate(CSOED = fct_reorder(CSOED, Total2011)) %>%
  ggplot(aes(x = CSOED, y = Total2011, fill = CSOED, group = 1)) +
  geom_col() +
  theme(legend.position = "none") +
  labs(title = "Population Bar Plot", x = "CSOED", y = "population")

# employment ----
# Ratio of Labour_Market to Total2011 for each CSOED.
df1$employment_rate <- df1$Labour_Market / df1$Total2011

# Bars ordered by that ratio, shown as a percentage.
df1 %>%
  mutate(CSOED = fct_reorder(CSOED, employment_rate)) %>%
  ggplot(aes(x = CSOED, y = 100 * employment_rate, fill = CSOED, group = 1)) +
  geom_col() +
  labs(title = "Employment Bar Plot", x = "CSOED", y = "Employment rate %") +
  theme(legend.position = "none")

In the first plot, total population and occupation rate follow the same trend once the axes are adjusted: areas with a large number of residents have larger occupancy rates. In the second and third plots we can see big differences between areas in the number of people in the labour market and in population. 4009 has the largest population and the largest number of people in the labour market. The final plot shows that almost all areas have an employment rate between 60% and 80%, and CSOED=4001 has the greatest rate, at over 80%.

library(broom)
library(sf)
library(gpclib)
library(rgdal)
library(leaflet)
library(viridis)
library(tmap)
# Read the shapefile layer from the local export folder.
my_spdf <- readOGR(
  dsn = "C:/Users/13587/Desktop/Small_Areas_(Census_2011)_FCC",
  layer = "Small_Areas_(Census_2011)_FCC"
)

## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\13587\Desktop\Small_Areas_(Census_2011)_FCC", layer: "Small_Areas_(Census_2011)_FCC"
## with 1876 features
## It has 28 fields

#if (!require(gpclib)) install.packages("gpclib", type="source")
#gpclibPermit()
#ogrInfo(dsn="Small_Areas_(Census_2011)_FCC")
#spdf_fortified <- tidy(my_spdf, region = "SMALL_AREA")

# Put tmap into its "view" mode for the maps drawn below.
tmap_mode("view")
# Switch off s2 in sf.
sf::sf_use_s2(FALSE)
# Print a summary of the spatial object, including its attribute columns.
summary(my_spdf)

## Object of class SpatialPolygonsDataFrame
## Coordinates:
##         min       max
## x -6.474983 -5.996291
## y 53.353568 53.634745
## Is projected: FALSE 
## proj4string : [+proj=longlat +datum=WGS84 +no_defs]
## Data attributes:
##     OBJECTID         NUTS1            NUTS1NAME            NUTS2          
##  Min.   :   1.0   Length:1876        Length:1876        Length:1876       
##  1st Qu.: 469.8   Class :character   Class :character   Class :character  
##  Median : 938.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 938.5                                                           
##  3rd Qu.:1407.2                                                           
##  Max.   :1876.0                                                           
##   NUTS2NAME            NUTS3            NUTS3NAME            COUNTY         
##  Length:1876        Length:1876        Length:1876        Length:1876       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   COUNTYNAME           CSOED              OSIED              EDNAME         
##  Length:1876        Length:1876        Length:1876        Length:1876       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   SMALL_AREA           Male2011       Female2011      Total2011     
##  Length:1876        Min.   : 39.0   Min.   : 42.0   Min.   :  82.0  
##  Class :character   1st Qu.:117.0   1st Qu.:122.0   1st Qu.: 240.0  
##  Mode  :character   Median :140.0   Median :145.0   Median : 287.0  
##                     Mean   :143.4   Mean   :148.7   Mean   : 292.1  
##                     3rd Qu.:166.0   3rd Qu.:171.0   3rd Qu.: 339.0  
##                     Max.   :580.0   Max.   :469.0   Max.   :1049.0  
##    PPOcc2011        Unocc2011          HS2011        Vacant2011    
##  Min.   : 34.00   Min.   :  0.00   Min.   : 37.0   Min.   :  0.00  
##  1st Qu.: 83.00   1st Qu.:  4.00   1st Qu.: 91.0   1st Qu.:  2.00  
##  Median : 97.00   Median :  7.00   Median :105.0   Median :  4.00  
##  Mean   : 99.31   Mean   : 10.28   Mean   :109.6   Mean   :  7.68  
##  3rd Qu.:113.00   3rd Qu.: 11.00   3rd Qu.:125.0   3rd Qu.:  8.00  
##  Max.   :194.00   Max.   :361.00   Max.   :555.0   Max.   :361.00  
##    PCVac2011       CREATEDATE          Occupancy_      Population      
##  Min.   : 0.000   Length:1876        Min.   :1.323   Min.   :   22.39  
##  1st Qu.: 2.300   Class :character   1st Qu.:2.581   1st Qu.: 2688.85  
##  Median : 4.100   Mode  :character   Median :2.928   Median : 5768.52  
##  Mean   : 6.419                      Mean   :2.942   Mean   : 5856.30  
##  3rd Qu.: 7.500                      3rd Qu.:3.265   3rd Qu.: 8477.71  
##  Max.   :65.000                      Max.   :6.112   Max.   :25040.36  
##    Labour_Mar      GlobalID           Shape__Are         Shape__Len     
##  Min.   : 58.0   Length:1876        Min.   :    6810   Min.   :  340.5  
##  1st Qu.:160.0   Class :character   1st Qu.:   33978   1st Qu.:  860.4  
##  Median :191.0   Mode  :character   Median :   50150   Median : 1101.4  
##  Mean   :196.8                      Mean   :  488323   Mean   : 2256.9  
##  3rd Qu.:228.0                      3rd Qu.:   98589   3rd Qu.: 1667.3  
##  Max.   :738.0                      Max.   :17453613   Max.   :30949.4

# One map per measure, coloured by the corresponding attribute column of
# my_spdf. Labour_Mar and Population are the column names as they appear in
# the summary of my_spdf.
tm_shape(my_spdf) +
  tm_polygons(
    col = "Total2011",
    palette = "-magma",
    title = "Total2011"
  )

tm_shape(my_spdf) +
  tm_polygons(
    col = "Labour_Mar",
    palette = "-viridis",
    title = "Labour_Market"
  )

tm_shape(my_spdf) +
  tm_polygons(
    col = "Population",
    palette = "-magma",
    title = "PopulationpersqKm"
  )

In these maps we can see how population, Labour_Mar and PopulationpersqKm differ across the area.

library(tidyverse)
library(ggtext)
#library(extrafont)

# Reshape the per-CSOED male and female totals to long format: one row per
# CSOED and gender, with the male-minus-female difference kept on every row.
df1_gender <- df1 %>%
  select(CSOED, Male2011, Female2011) %>%
  mutate(diff = Male2011 - Female2011) %>%
  pivot_longer(cols = c(Male2011, Female2011)) %>%
  rename(Gender = name, Total2011 = value)
#head(df1_gender)

# One table per gender, used for the two ends of each segment.
Males <- filter(df1_gender, Gender == "Male2011")
Females <- filter(df1_gender, Gender == "Female2011")
#head(Females)

# Dumbbell plot: a grey segment from the male to the female total of each
# CSOED, with a coloured point at both ends.
p <- ggplot(df1_gender) +
  geom_segment(
    data = Males,
    # The segment ends are read straight from Females, which relies on Males
    # and Females listing the CSOED values in the same row order.
    aes(
      x = Total2011, y = CSOED,
      xend = Females$Total2011, yend = Females$CSOED
    ),
    color = "#aeb6bf",
    size = 4.5, # sized so the segment matches the points
    alpha = .5
  ) +
  geom_point(
    aes(x = Total2011, y = CSOED, color = Gender),
    size = 4, show.legend = TRUE
  ) +
  ggtitle("Gap between Male and Female in different CSOED")

# Mean and spread of the per-CSOED totals for each gender.
# The SE column holds sd(), i.e. a standard deviation; the name is kept as is.
stats <- df1_gender %>%
  group_by(Gender) %>%
  summarise(
    mean = mean(Total2011),
    SE = sd(Total2011)
  ) %>%
  mutate(
    meanpos = mean + 1 * SE,
    meanneg = mean - 1 * SE
  )
stats_males <- stats %>%
  filter(Gender == "Male2011")
stats_females <- stats %>%
  filter(Gender == "Female2011")
#head(stats)

# Label positions: one row per CSOED (taken from the male rows).
# diff is Male2011 - Female2011, so the midpoint between the two points is
# the male total minus half the difference.
diff <- df1_gender %>%
  filter(Gender == "Male2011") %>%
  mutate(x_pos = Total2011 - diff / 2)
#head(diff)

p +
  #add mean and standard deviation for both groups
  # CSOED is on a discrete y axis, so the bands span the panel with
  # -Inf/Inf rather than code values. Each band gets its own one-row data so
  # it is drawn exactly once instead of once per row of the plot data; alpha
  # is set for a single, non-overplotted rectangle.
  geom_rect(
    data = stats_males, inherit.aes = FALSE,
    xmin = stats_males$meanneg, xmax = stats_males$meanpos,
    ymin = -Inf, ymax = Inf, fill = "#aeb6bf", alpha = .2
  ) +
  geom_vline(
    xintercept = stats_males$mean,
    linetype = "solid", size = .5, alpha = .8, color = "#762a83"
  ) +
  geom_rect(
    data = stats_females, inherit.aes = FALSE,
    xmin = stats_females$meanneg, xmax = stats_females$meanpos,
    ymin = -Inf, ymax = Inf, fill = "#009688", alpha = .2
  ) +
  geom_vline(
    xintercept = stats_females$mean,
    color = "#009688", linetype = "solid", size = .5, alpha = .8
  ) +
  #add point range
  # The segments and points already in p are drawn again so they sit on top
  # of the bands.
  geom_segment(
    data = Males,
    aes(
      x = Total2011, y = CSOED,
      xend = Females$Total2011, yend = Females$CSOED
    ),
    color = "#aeb6bf", size = 4.5, alpha = .5
  ) +
  #add points
  geom_point(
    aes(x = Total2011, y = CSOED, color = Gender),
    size = 4, show.legend = FALSE
  ) +
  #add point-range labels
  geom_text(
    data = diff,
    aes(label = paste("D: ", diff), x = x_pos, y = CSOED),
    color = "#4a4e4d", size = 2.5
  ) +
  #add title
  ggtitle("Gender gap in different areas")

Comparing the number of females and males in each area, I found that for CSOED = 4002, 4009, 4016, 4036 and 4024 the gap between males and females is more than 300. There is little difference in the other regions.

small-scale census analysis

YeLiu

2022-12-19

Analysis