Unified 2001 Census Data - Key Statistics

Using the CASWEB service, the majority of the key statistics tables from the 2001 Census were downloaded for both England and Wales. These were saved into a directory and given a naming convention as follows: KS001_E.csv or KS001_W.csv. The number following KS relates to the table numbers.

# Set working directory
setwd("/Users/alex/Dropbox/Projects/2011_Census")

The first step is to get a list of all the 2001 Census files, then import these CSV into R. They have been named using the same convention as the CSV.

census.files <- sub(".csv", "", list.files("2001 Census/"))
for (x in 1:length(census.files)) {
    input <- census.files[x]
    assign(input, read.csv(paste("2001 Census/", input, ".csv", sep = "")))
}

The next stage is to merge each of the Key statistics tables, which at present are separated into England and Wales. These can be combined with a simple rbind as they have the same structure. However, we can add an extra step which checks that the column names match before completing the merge.

# Create a list of the key statistics tables
census.tables <- unique(sub("_E|_W", "", census.files))
for (x in 1:length(census.tables)) {
    input <- census.tables[x]

    # First loop checks that the column names are the same for the E and W
    # objects
    if (identical(colnames(get(paste(input, "_E", sep = ""))), colnames(get(paste(input, 
        "_W", sep = ""))))) {
        assign(input, rbind(get((paste(input, "_E", sep = ""))), get((paste(input, 
            "_W", sep = "")))))
    } else {
        print(input)
    }
}

## [1] "KS006"

The output from the above identifies that the table KS006_E and KS006_W have different column names, and were not merged. The table KS006 relates to ethnicity, and there is an additional attribute for the Welsh data that indicates those people identifying themselves as Welsh. Also, the Welsh data have an a appended to the variable names. For the purposes of this analysis the Welsh variable will be dropped, leaving matching data.

colnames(KS006_E)

##  [1] "Zone.Code" "ks0060001" "ks0060002" "ks0060003" "ks0060004"
##  [6] "ks0060005" "ks0060006" "ks0060007" "ks0060008" "ks0060009"
## [11] "ks0060010" "ks0060011" "ks0060012" "ks0060013" "ks0060014"
## [16] "ks0060015" "ks0060016" "ks0060017"

colnames(KS006_W)

##  [1] "Zone.Code" "ks06a0001" "ks06a0002" "ks06a0003" "ks06a0004"
##  [6] "ks06a0005" "ks06a0006" "ks06a0007" "ks06a0008" "ks06a0009"
## [11] "ks06a0010" "ks06a0011" "ks06a0012" "ks06a0013" "ks06a0014"
## [16] "ks06a0015" "ks06a0016" "ks06a0017" "ks06a0018"

The following code drops the Welsh column, changes the column names and creates the unified KS006 table.

KS006_W$ks06a0018 <- NULL
colnames(KS006_W) <- gsub("a", "", colnames(KS006_W))
colnames(KS006_W) <- gsub("ks0", "ks00", colnames(KS006_W))
KS006 <- rbind(KS006_E, KS006_W)

The next stage is to create a single unified object containing all the Key Statistics data.

Census_2001_Key_Statistics <- as.data.frame(KS001$Zone.Code)
colnames(Census_2001_Key_Statistics) <- "Zone.Code"

for (x in 1:length(census.tables)) {
    input <- census.tables[x]
    Census_2001_Key_Statistics <- data.frame(Census_2001_Key_Statistics, get(input)[match(Census_2001_Key_Statistics[, 
        "Zone.Code"], get(input)[, "Zone.Code"]), ])
    Census_2001_Key_Statistics$Zone.Code.1 <- NULL
}

Process the 2001 output area data into the 2011 boundaries

The first stage is to import the lookup table, and split this into the constituent parts.

# Get lookup
orig_lookup <- read.csv("OA01_OA11_EW_LU.csv")
# Unchanged
unchanged <- subset(orig_lookup, CHGIND == "U")
# Merged
merged <- subset(orig_lookup, CHGIND == "M")
# Split
split <- subset(orig_lookup, CHGIND == "S")
# Other
other <- subset(orig_lookup, CHGIND == "X")

Unchanged output areas are those where the output area is the same in 2011 as 2001. Merged are where the output areas from 2001 have been combined into a new output area for 2011. Split output areas are those from 2001 which were split into multiple output areas in 2011. Other is the most complicated category, and relate to all other possibilities, e.g., combinations of splits and merges etc.

Split Output Area

To manage split output areas, the frequency of postcodes in the 2011 output areas are used to apportion the 2001 data proportionally by the frequency of small user postcodes. This could be supplemented for other metrics, such as delivery points, however, this would require the postcode address file (PAF), which isn't open data.

The National Statistics Postcode Look-up (NSPL) was used for these analysis which contained the new 2011 Output Areas. This was trimmed to remove non England and Wales postcodes, non small user and terminated postcodes.

# Get NSPL data
NSPL <- read.csv("NSPL_NOV_2012_CSV/Data/NSPL_NOV_2012.csv", header = FALSE)
# Remove non current, non E & W, non small user
NSPL_Keep <- subset(NSPL, V6 == 0 & is.na(V5) & V10 != "" & !is.na(V10), select = c("V1", 
    "V5", "V6", "V10"))
# Create postcode frequency for each 2011 OA
Postcodes_2011_OA <- as.data.frame(table(as.character(NSPL_Keep$V10)))

A split means there are multiple 2011 Output Areas for each 2001 Output Area. As such, the frequency of postcodes in the 2011 Output Areas were examined and used to create proportions that could later be applied to the containing 2001 Output Area.

# Create proportions for split postcodes
split <- data.frame(split, Postcodes_2011_OA[match(split[, "OA11CD"], Postcodes_2011_OA[, 
    "Var1"]), ])
library(plyr)
postcode_sums <- ddply(split, .(OA01CDO), summarise, sum = sum(Freq))

split <- data.frame(split, postcode_sums[match(split[, "OA01CDO"], postcode_sums[, 
    "OA01CDO"]), ])
split$proportion <- split$Freq/split$sum
split$Var1 <- NULL
split$OA01CDO.1 <- NULL

One issue with this method is however that not all of the 2011 Output Areas appear to have postcodes within them, or at least, postcodes which appear in the NSPL. The net effect of this is that on the basis of the above code, this leaves a series of the 2001 Output Areas without a proportion score. The magnitude is however very low.

nrow(split[is.na(split$proportion), ])

## [1] 116

As such, a correction was applied to create proportion scores for these Output Areas based on the area of overlap between 2001 and 2011. This code is presented in a little more detail later in relation to the other category; however, is used as is here to create the missing scores. These are appended back onto the split object.

# load spatial packages
library("rgdal")

## Loading required package: sp

## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.9.0, released 2011/12/29 Path to GDAL shared
## files: /Users/alex/Library/R/2.15/library/rgdal/gdal Loaded PROJ.4
## runtime: Rel. 4.8.0, 6 March 2012, [PJ_VERSION: 480] Path to PROJ.4 shared
## files: /Users/alex/Library/R/2.15/library/rgdal/proj

library("rgeos")

## Loading required package: stringr

## rgeos: (SVN revision 348) GEOS runtime version: 3.3.3-CAPI-1.7.4 Polygon
## checking: TRUE

# read in 2001 boundaries (England)
OA_2001 <- readOGR("OA Boundaries/England_oa_2001.shp", "England_oa_2001")

## OGR data source with driver: ESRI Shapefile 
## Source: "OA Boundaries/England_oa_2001.shp", layer: "England_oa_2001"
## with 165731 features and 14 fields
## Feature type: wkbPolygon with 2 dimensions

# read in 2011 boundaries (England)
OA_2011 <- readOGR("OA Boundaries/England_oa_2011.shp", "England_oa_2011")

## OGR data source with driver: ESRI Shapefile 
## Source: "OA Boundaries/England_oa_2011.shp", layer: "England_oa_2011"
## with 171372 features and 1 fields
## Feature type: wkbPolygon with 2 dimensions

# read in 2001 boundaries (Wales)
OA_2001_W <- readOGR("OA Boundaries/Wales_oa_2001_area.shp", "Wales_oa_2001_area")

## OGR data source with driver: ESRI Shapefile 
## Source: "OA Boundaries/Wales_oa_2001_area.shp", layer: "Wales_oa_2001_area"
## with 9808 features and 11 fields
## Feature type: wkbPolygon with 2 dimensions

# read in 2011 boundaries (Wales)
OA_2011_W <- readOGR("OA Boundaries/Wales_oa_2011.shp", "Wales_oa_2011")

## OGR data source with driver: ESRI Shapefile 
## Source: "OA Boundaries/Wales_oa_2011.shp", layer: "Wales_oa_2011"
## with 10036 features and 1 fields
## Feature type: wkbPolygon with 2 dimensions

# Subset split

split_NA <- split[is.na(split$proportion), ]
split_no_NA <- split[!is.na(split$proportion), ]
out <- NULL

# Run correction
for (x in 1:nrow(split_NA)) {

    old <- as.character(split_NA[x, "OA01CDO"])
    new <- as.character(split_NA[x, "OA11CD"])

    tmp_01 <- OA_2001[OA_2001@data$ONS_LABEL == old, ]
    tmp_11 <- OA_2011[OA_2011@data$LABEL == new, ]
    tmp_intersect <- gIntersection(tmp_01, tmp_11)
    tmp_overlap <- gArea(tmp_intersect)

    tmp_area_01 <- gArea(tmp_01)
    tmp_01_overlap_pct <- tmp_overlap/tmp_area_01
    out <- c(out, tmp_01_overlap_pct)

    remove(tmp_01, tmp_11, tmp_overlap, tmp_area_01, tmp_01_overlap_pct)


}

# Create a unified split object

split_NA <- subset(split_NA, select = c("OA01CDO", "OA11CD"))
split_NA <- cbind(split_NA, out)
colnames(split_NA) <- c("OA01CDO", "OA11CD", "proportion")
split_no_NA <- subset(split_no_NA, select = c("OA01CDO", "OA11CD", "proportion"))
split <- rbind(split_no_NA, split_NA)

Other Output Areas

There is no metadata available on why the Output Areas are fit within the other category, however, the complexity of the cases can be examined if the frequency of the old and new Output Areas are examined. A cross tabulation shows that 171 are 1 - 1 matches, that is they are a direct match. However, there must be some other explanation to why these are being handled by the other category rather than the unchanged; for example, a change in size.

# List of old OA freq
old_freq <- as.data.frame(table(as.character(other$OA01CDO)))
colnames(old_freq) <- c("OA01CDO", "old_N")
# List of new OA freq
new_freq <- as.data.frame(table(as.character(other$OA11CD)))
colnames(new_freq) <- c("OA11CD", "new_N")
# Merge onto other
other <- data.frame(other, old_freq[match(other[, "OA01CDO"], old_freq[, "OA01CDO"]), 
    ])
other$OA01CDO.1 <- NULL
other <- data.frame(other, new_freq[match(other[, "OA11CD"], new_freq[, "OA11CD"]), 
    ])
other$OA11CD.1 <- NULL
print(xtabs(~old_N + new_N, data = other), type = "html", include.rownames = F)

##      new_N
## old_N   1   2   3
##    1  171  10   0
##    2   12  21   7
##    3   10  12   2
##    4    3   1   0
##    5    1   4   0
##    6    8  10   0
##    12   8   4   0

Given the lack of metadata, and the necessity to process these data automatically; the simplest method of apportionment was to use the area of the Output Areas. For each 2001 Output Area, the area of the 2011 Output Area overlapping was calculated; and like the split proportions, will be used later to apportion the 2001 census data. The shapefiles were read in earlier.

Code will be presented to loop through all the lines in the other file, however, to illustrate this simply, an example is presented for - 00EUNM0006 (2001) and E00165705 (2011)

old <- "00EUNM0006"
new <- "E00165705"

tmp_01 <- OA_2001[OA_2001@data$ONS_LABEL == old, ]
tmp_11 <- OA_2011[OA_2011@data$LABEL == new, ]
plot(tmp_01, col = "#6E7B8B", border = "#CAE1FF")
plot(tmp_11, border = "red", add = TRUE)

plot of chunk unnamed-chunk-15

The overlap can be extracted as follows and calculated as a percentage of 2001 Output Area - the example reveals around a 7% overlap.

tmp_intersect <- gIntersection(tmp_01, tmp_11)
tmp_overlap <- sum(sapply(slot(tmp_intersect, "polygons"), slot, "area"))
tmp_area_01 <- sum(sapply(slot(tmp_01, "polygons"), slot, "area"))
tmp_01_overlap_pct <- tmp_overlap/tmp_area_01
tmp_01_overlap_pct

## [1] 0.0745

This analysis can be extended, by looping through each of the other polygon pairs. An extra step is required to use either the England or Welsh Output Areas. Also, and additional loop is added to account for tmp_intersect being null which can happen when the 2001 polygon does not overlap the 2011 polygon - where this happens the area overlap is set to zero. The pairs where this occurs are mapped.


out <- NULL
for (x in 1:nrow(other)) {

    old <- as.character(other[x, "OA01CDO"])
    new <- as.character(other[x, "OA11CD"])

    if (substring(new, 1, 1) == "W") {

        # Intersection Wales
        tmp_01 <- OA_2001_W[OA_2001_W@data$LABEL == old, ]
        tmp_11 <- OA_2011_W[OA_2011_W@data$LABEL == new, ]
        tmp_intersect <- gIntersection(tmp_01, tmp_11)

        if (is.null(tmp_intersect)) {
            #checks if tmp_intersect is null - if so, it assigns it the area of the 01 OA
            tmp_intersect <- gArea(tmp_01)
            tmp_overlap <- 0
            plot(tmp_01, col = "#6E7B8B", border = "#CAE1FF")
            plot(tmp_11, border = "red", add = TRUE)
            title(c(old, new))
        } else {
            tmp_overlap <- gArea(tmp_intersect)
        }


    } else {

        # Intersection England
        tmp_01 <- OA_2001[OA_2001@data$ONS_LABEL == old, ]
        tmp_11 <- OA_2011[OA_2011@data$LABEL == new, ]
        tmp_intersect <- gIntersection(tmp_01, tmp_11)

        if (is.null(tmp_intersect)) {
            #checks if tmp_intersect is null - if so, it assigns it the area of the 01 OA
            tmp_intersect <- gArea(tmp_01)
            tmp_overlap <- 0
            plot(tmp_01, col = "#6E7B8B", border = "#CAE1FF")
            plot(tmp_11, border = "red", add = TRUE)
            title(c(old, new))
        } else {
            tmp_overlap <- gArea(tmp_intersect)
        }
    }

    # Get sum of overlap polygons
    tmp_area_01 <- gArea(tmp_01)
    tmp_01_overlap_pct <- tmp_overlap/tmp_area_01
    out <- c(out, tmp_01_overlap_pct)

    remove(tmp_01, tmp_11, tmp_overlap, tmp_area_01, tmp_01_overlap_pct)

}

plot of chunk unnamed-chunk-17

The final stage is to join the proportion scores back onto the other data frame object.

other <- cbind(other, out)

Creating smaller objects

The next stage is to reduce the dimensions of the lookup objects which will be appended to the census data and various aggregations used to manipulate the 2001 data into the 2011 boundaries.

unchanged <- subset(unchanged, select = c("OA01CDO", "OA11CD"))
merged <- subset(merged, select = c("OA01CDO", "OA11CD"))
# split <- subset(split, select=c('OA01CDO','OA11CD','proportion'))
# ####Split was trimmed ealier!
other <- subset(other, select = c("OA01CDO", "OA11CD", "out"))
colnames(other) <- c("OA01CDO", "OA11CD", "proportion")

Appending the 2001 Census data and creating aggregation into the 2011 boundaries

The first stage is to merge the census data onto all of the lookup objects.

unchanged <- merge(unchanged, Census_2001_Key_Statistics, by.x = "OA01CDO", 
    by.y = "Zone.Code", in.x = TRUE)
merged <- merge(merged, Census_2001_Key_Statistics, by.x = "OA01CDO", by.y = "Zone.Code", 
    in.x = TRUE)
split <- merge(split, Census_2001_Key_Statistics, by.x = "OA01CDO", by.y = "Zone.Code", 
    in.x = TRUE)
other <- merge(other, Census_2001_Key_Statistics, by.x = "OA01CDO", by.y = "Zone.Code", 
    in.x = TRUE)

Unchanged output areas are direct matches, so no further manipulation is necessary, beyond removing the 2001 code.

unchanged$OA01CDO <- NULL

However, for merged, the aggregate of the 2001 data is required. This is achieved using the aggregate function, with a list variable generated from the column names.

library(sqldf)

## Loading required package: DBI

## Loading required package: gsubfn

## Loading required package: proto

## Loading required namespace: tcltk

## Loading Tcl/Tk interface ...

## Could not load tcltk.  Will use slower R code instead.

## Loading required package: chron

## Loading required package: RSQLite

## Loading required package: RSQLite.extfuns


n <- colnames(other)
n <- n[4:length(n)]

merged_out <- as.data.frame(unique(merged$OA11CD))
colnames(merged_out) <- "OA11CD"

# Loop to merge all variables
for (x in 1:length(n)) {
    Cen_Var <- n[x]
    temp <- sqldf(paste("select OA11CD,sum(", Cen_Var, ") as ", Cen_Var, " from merged group by OA11CD", 
        sep = ""))
    merged_out <- merge(merged_out, temp, by = "OA11CD", in.x = TRUE)
    remove(temp)
}

The split and the other lookup objects are both handled by multiplying the 2001 attributes by the proportion scores; and then in the case of other, aggregated (summed) by the new 2011 Output Areas.

# Split
n <- colnames(split)
n <- n[4:length(n)]

split_out <- as.data.frame(split$OA11CD)
colnames(split_out) <- "OA11CD"

for (x in 1:length(n)) {
    Cen_Var <- n[x]
    temp <- sqldf(paste("select OA11CD,", Cen_Var, " * proportion as ", Cen_Var, 
        " from split", sep = ""))
    split_out <- merge(split_out, temp, by = "OA11CD", in.x = TRUE)
}


# Other - create proportional splits
n <- colnames(other)
n <- n[4:length(n)]

other <- cbind(other, 1:nrow(other))

other_out <- as.data.frame(other$OA11CD)
colnames(other_out) <- c("OA11CD")

for (x in 1:length(n)) {
    Cen_Var <- n[x]
    temp <- as.data.frame(round(get("other")[, Cen_Var] * get("other")[, "proportion"], 
        3))
    colnames(temp) <- paste(Cen_Var)
    other_out <- cbind(other_out, temp)
    remove(temp)
}

# Other - create summed values for 2011 output areas
n <- colnames(other_out)
n <- n[2:length(n)]

other_out_merged <- as.data.frame(unique(other_out$OA11CD))
colnames(other_out_merged) <- "OA11CD"

# Loop to merge all variables
for (x in 1:length(n)) {
    Cen_Var <- n[x]
    temp <- sqldf(paste("select OA11CD,sum(", Cen_Var, ") as ", Cen_Var, " from other_out group by OA11CD", 
        sep = ""))
    other_out_merged <- data.frame(other_out_merged, temp[match(other_out_merged[, 
        "OA11CD"], temp[, "OA11CD"]), ])
    other_out_merged$OA11CD.1 <- NULL
    remove(temp)
}

To test that we now have a series of objects containing the correct number of 2011 Output Areas for England and Wales (181,408) we can sum up the rows.

nrow(unchanged) + nrow(merged_out) + nrow(split_out) + nrow(other_out_merged)

## [1] 181408

Finally, we can now create a single unified 2001 Census Table for the 2011 boundaries.

ordering <- sort(colnames(unchanged))
ordering <- c("OA11CD", ordering[1:243])

unchanged <- unchanged[, ordering]
merged <- merged_out[, ordering]
split <- split_out[, ordering]
other <- other_out_merged[, ordering]

Census_2001_2011OA <- rbind(unchanged, merged, split, other)
Census_2001_2011OA <- Census_2001_2011OA[order(Census_2001_2011OA$OA11CD), ]

write.csv(Census_2001_2011OA, "Census_2001_2011OA.csv")