This is the report of the In this exercise to analyse UK house price data (year: 2018) from the Land Registry.
The source of the data file is “http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2018.txt”
This data is available under the Open Government Licence (UK). More at: http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
Packages needed (download them first).
library("knitr")
library("plyr")
library("dplyr")
library("ggplot2")
library("plotly")
Reading and loading the data file that is going to be used to do the exercise. This file includes standard and additional price paid data transactions received at HM Land Registry in the period from 1 January to 31 December (year: 2018).
#Setting working directory.
setwd("C:/Future_Cities")
#Loading the dataset file (untidy)
HM_Land_Registry_Data_2018<-read.csv("HM_Land_Registry_Data_2018.csv", header=TRUE)
Now is necessary to clean the data and get a tidy data set to work with.
#Let's start by checking the data frame's names of the columns.
colnames(HM_Land_Registry_Data_2018)
## [1] "X.7E86B6FB.B772.458C.E053.6B04A8C0C84C."
## [2] "X350000"
## [3] "X2018.11.30.00.00"
## [4] "S80.1JQ"
## [5] "O"
## [6] "N"
## [7] "F"
## [8] "X28"
## [9] "X"
## [10] "BRIDGE.STREET"
## [11] "X.1"
## [12] "WORKSOP"
## [13] "BASSETLAW"
## [14] "NOTTINGHAMSHIRE"
## [15] "B"
## [16] "A"
#Let’s change the data frame's names of the columns.
colnames(HM_Land_Registry_Data_2018)<-c("Transaction_unique_identifier","Price","Date_of_Transfer","Postcode","Property_Type","Old_or_New","Duration","PAON","SAON","Street","Locality","Town_or_City","District","County","PPD_Category_Type","Record_Status")
colnames(HM_Land_Registry_Data_2018)
## [1] "Transaction_unique_identifier" "Price"
## [3] "Date_of_Transfer" "Postcode"
## [5] "Property_Type" "Old_or_New"
## [7] "Duration" "PAON"
## [9] "SAON" "Street"
## [11] "Locality" "Town_or_City"
## [13] "District" "County"
## [15] "PPD_Category_Type" "Record_Status"
#data frames with mean average price and diversity by postcode.
means_by_postcode <-aggregate(HM_Land_Registry_Data_2018$Price,list(HM_Land_Registry_Data_2018$Postcode),mean,na.rm=TRUE) #mean
diversity_by_postcode <-aggregate(HM_Land_Registry_Data_2018$Price,list(HM_Land_Registry_Data_2018$Postcode),sd,na.rm=TRUE) #standard deviation
#changing the column names of the data frames.
colnames(means_by_postcode)<-c("Postcode", "Average_price")
colnames(diversity_by_postcode)<-c("Postcode", "Standard_Deviation")
#data frames with mean average price and diversity sorted (decreasing) by postcode.
sorted_means_by_postcode<-means_by_postcode[order(means_by_postcode$Average_price, decreasing=TRUE),]
sorted_diversity_by_postcode<-diversity_by_postcode[order(diversity_by_postcode$Standard_Deviation, decreasing=TRUE),]
1.Highest average price (5 postcodes).
sorted_means_by_postcode[1:5,]
## Postcode Average_price
## 473712 W1J 7BT 569200000
## 384998 SE1 9JZ 448500000
## 277413 NE6 1AS 448300979
## 494626 WS1 1RY 415000000
## 435091 SW1A 2EU 325169089
2.Lowest average price (5 postcodes).
sorted_means_by_postcode[(nrow(means_by_postcode)-4):nrow(means_by_postcode),]
## Postcode Average_price
## 423875 SS7 2BN 100
## 448071 TN13 2DE 100
## 473448 W14 8PU 100
## 473556 W1D 5EP 100
## 502941 YO17 7LL 100
3.Highest price diversity (5 postcodes).
sorted_diversity_by_postcode[1:5,]
## Postcode Standard_Deviation
## 472895 W11 4UL 184264956
## 146062 E1W 1AP 179921199
## 86539 CM22 7TA 87348901
## 367917 S2 4FB 86400441
## 435402 SW1X 7LJ 83845041
4.Lowest price diversity (5 postcodes).
sorted_diversity_by_postcode_no_NA<-sorted_diversity_by_postcode[complete.cases(sorted_diversity_by_postcode), ]
sorted_diversity_by_postcode_no_NA[(nrow(sorted_diversity_by_postcode_no_NA)-4):nrow(sorted_diversity_by_postcode_no_NA),]
## Postcode Standard_Deviation
## 505690 YO41 1LP 0
## 505853 YO42 2GU 0
## 506591 YO7 1QG 0
## 506699 YO7 3SU 0
## 506732 YO7 4QG 0
#Subsetting the data frame to only have the information of the Greater London county.
HM_Land_Registry_Data_2018_GL<-HM_Land_Registry_Data_2018[HM_Land_Registry_Data_2018$County=="GREATER LONDON",]
#Dropping orinal factors levels in the subsetted data frame to onlyahve the 33 Boroughs of London.
HM_Land_Registry_Data_2018_GL$District <- factor(HM_Land_Registry_Data_2018_GL$District)
#Checking out the numbers of Boroughs in the data frame.
unique(HM_Land_Registry_Data_2018_GL$District)
## [1] CROYDON BROMLEY KINGSTON UPON THAMES
## [4] LEWISHAM WANDSWORTH GREENWICH
## [7] LAMBETH SOUTHWARK RICHMOND UPON THAMES
## [10] BRENT CITY OF WESTMINSTER NEWHAM
## [13] EALING ISLINGTON CAMDEN
## [16] CITY OF LONDON HARROW BEXLEY
## [19] SUTTON MERTON BARNET
## [22] TOWER HAMLETS BARKING AND DAGENHAM HARINGEY
## [25] HOUNSLOW ENFIELD HILLINGDON
## [28] REDBRIDGE HAVERING WALTHAM FOREST
## [31] KENSINGTON AND CHELSEA HAMMERSMITH AND FULHAM HACKNEY
## 33 Levels: BARKING AND DAGENHAM BARNET BEXLEY BRENT BROMLEY ... WANDSWORTH
ggplot(HM_Land_Registry_Data_2018_GL, aes(x=District, y=Price, fill=District)) +
geom_boxplot() + ggtitle("London Boroughs and Price") + ylim(c(0, 100000)) +
theme(text = element_text(size=8), plot.title = element_text(hjust = 0.5, size="14"),
axis.title.x = element_text(size="12"),axis.title.y = element_text(size="12"),
axis.text.x = element_text(face="bold", color="#000000", angle=90),
axis.text.y = element_text(face="bold", color="#000000", angle=90),
panel.background = element_blank(), legend.position="none")
No information about floor level. The the most similar information is “Property Type” (D = Detached, S = Semi-Detached, T = Terraced, F = Flats/Maisonettes, O = Other). Therefore, it could be possible to try to analize the statistical relationship (correlation) between Price and Property Type.