# load data
# NOTE: the workspace is deliberately not cleared with rm(list = ls());
# a script should not wipe objects the caller may still need.
library(rvest)
library(psych)
library(ggplot2)

# read the raw text file from the website, one element per line
bos_url <- readLines("http://lib.stat.cmu.edu/datasets/boston_corrected.txt")
# remove narrative opening paragraph
bos_url <- bos_url[-c(1:9)]
# transform to data frame with all data in one column
bos_vec <- as.data.frame(bos_url, stringsAsFactors = FALSE)
str(bos_vec)

# extract first row as names (tab-separated header)
bos_names <- unlist(strsplit(bos_vec[1, 1], "\t", fixed = TRUE))

# split every remaining line on tabs in one vectorised call
bos_lines <- bos_vec[-1, 1]
bos_rows <- strsplit(bos_lines, "\t", fixed = TRUE)

# keep only lines that supply exactly one field per column name; blank lines
# are dropped silently, anything else that does not fit is reported
is_complete <- lengths(bos_rows) == length(bos_names)
is_blank <- !nzchar(trimws(bos_lines))
if (any(!is_complete & !is_blank)) {
  warning(
    sum(!is_complete & !is_blank),
    " malformed line(s) were skipped while parsing the data",
    call. = FALSE
  )
}
bos_rows <- bos_rows[is_complete]
stopifnot(length(bos_rows) > 0)

# bind all rows at once so that every data line, including the last one, ends
# up in the final data frame and no all-NA placeholder row is left behind;
# the number of columns follows the header instead of being hard-coded
bos_df <- as.data.frame(do.call(rbind, bos_rows), stringsAsFactors = FALSE)
colnames(bos_df) <- bos_names
str(bos_df)

# all columns are character after splitting; convert the distance variable
bos_df$DIS <- as.numeric(bos_df$DIS)
bos_df$MEDV <- as.numeric(bos_df$MEDV)

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Does proximity to the Charles River (categorical variable) and/or distance from a working center (numerical variable) correlate with the median value of homes?
What are the cases, and how many are there?
The cases are housing and price data points from various towns outside of Boston in 1978. There are 506 cases.
Describe the method of data collection.
The data were collected by the original researchers from Boston SMSA data (to be researched further).
What type of study is this (observational/experiment)?
This study is observational.
If you collected the data, state self-collected. If not, provide a citation/link.
The data were imported as shown above from http://lib.stat.cmu.edu/datasets/boston_corrected.txt; the dataset is also described at http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/boston.html.
What is the response variable, and what type is it (numerical/categorical)?
The response variable is median value of homes (MEDV), and it is numerical.
What is the explanatory variable, and what type is it (numerical/categorical)?
The explanatory variables are proximity to the Charles River (categorical) and distance from a working center (numerical).
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Summary statistics for the research question. Lines starting with "##" are
# the console output recorded when this document was rendered.

# numeric summary of DIS
describe(bos_df[["DIS"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 506 3.8 2.11 3.21 3.54 1.91 1.13 12.13 11 1.01 0.46
## se
## X1 0.09

# numeric summary of MEDV (the response variable)
describe(bos_df[["MEDV"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 506 22.53 9.2 21.2 21.56 5.93 5 50 45 1.1 1.45 0.41

# counts per CHAS level, listing missing values when present
# NOTE(review): the recorded <NA> count of 1 suggests bos_df held one unfilled
# row when this output was produced -- confirm
table(bos_df[["CHAS"]], useNA = "ifany")
##
## 0 1 <NA>
## 471 35 1

# the same breakdown expressed as percentages
prop.table(table(bos_df[["CHAS"]], useNA = "ifany")) * 100
##
## 0 1 <NA>
## 92.8994083 6.9033531 0.1972387

# MEDV summarised separately for each CHAS level, returned as one matrix
describeBy(bos_df[["MEDV"]], group = bos_df[["CHAS"]], mat = TRUE)
## item group1 vars n mean sd median trimmed mad min
## X11 1 0 1 471 22.09384 8.831362 20.9 21.23793 6.07866 5.0
## X12 2 1 1 35 28.44000 11.816643 23.3 27.62414 8.15430 13.4
## max range skew kurtosis se
## X11 50 45.0 1.0501858 1.4932025 0.4069277
## X12 50 36.6 0.8637217 -0.6822376 1.9973773
# Histograms of the two numerical variables. Each plot is its own statement so
# that R can parse and print them separately.
# bins = 30 is the ggplot2 default, stated explicitly to avoid the stat_bin
# message about an unspecified bin count.
ggplot(bos_df, aes(x = DIS)) +
  geom_histogram(bins = 30)
ggplot(bos_df, aes(x = MEDV)) +
  geom_histogram(bins = 30)