#5.1 Use R to create the following data frame and assign it to the name my_data.
#library(tidyverse)
word <- c("one", "two", NA, "four", "five")
number <- c(1, NA, 3, 4, 5)
letter <- c(letters[1:5])
my_data <- data.frame(word, number, letter)
my_data
## word number letter
## 1 one 1 a
## 2 two NA b
## 3 <NA> 3 c
## 4 four 4 d
## 5 five 5 e
Write R code to set working directory to desktop and write my_data into a .csv file named “my_data_no_name.csv” without column names.
#setwd("~/Desktop/21fall/2383 R/hw")
library(readr)
write_csv(my_data, "my_data_no_name.csv", col_names = FALSE)
Write R code to write my_data into a delimited file called “my_data_na.csv” with # as the delimiter and use 999 as the indicator for missing values.
write_delim(my_data, "my_data_na.csv", delim = "#" ,na = "999")
###1) For the “my_data_na.csv” file you created in Exercise 2 in Section 5.1, write R code to read the file into an object with name my_data.
library(readr)
my_data <- read.csv("my_data_na.csv")
my_data
## word.number.letter
## 1 one#1#a
## 2 two#999#b
## 3 999#3#c
## 4 four#4#d
## 5 five#5#e
Which of the following are the column names of the d1?
d1 <- read_csv("x,y,z
1,3,5
2,4,6", col_names = FALSE)
## Rows: 3 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): X1, X2, X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d1
## # A tibble: 3 × 3
## X1 X2 X3
## <chr> <chr> <chr>
## 1 x y z
## 2 1 3 5
## 3 2 4 6
We can see that the column names of the d1 is X1, X2, and X3.
What will be the column name(s) of the d1?
d1 <- read_csv("The first line
The second line
The third line
x,y,z
1,3,5")
## Rows: 4 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): The first line
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d1
## Warning: One or more parsing issues, see `problems()` for details
## # A tibble: 4 × 1
## `The first line`
## <chr>
## 1 The second line
## 2 The third line
## 3 x,y,z
## 4 1,3,5
The column name of the d1 is “The first line”.
Use R to create the following data frame and assign it to the name df1.
word1 <- c("one", "two", NA)
number1 <- c(1,NA,3)
df1 <- data.frame(word1, number1)
df1
## word1 number1
## 1 one 1
## 2 two NA
## 3 <NA> 3
word2 <- c("three", NA, "five")
number2 <- c(3,4,5)
df2 <- data.frame(word2, number2)
df2
## word2 number2
## 1 three 3
## 2 <NA> 4
## 3 five 5
Then create a list named my_list with df1 as the first element and df2 as the second element.
library(r02pro)
my_list <- list(df1=df1, df2=df2)
my_list
## $df1
## word1 number1
## 1 one 1
## 2 two NA
## 3 <NA> 3
##
## $df2
## word2 number2
## 1 three 3
## 2 <NA> 4
## 3 five 5
###1) Write R code to set working directory to the desktop, then export my_list into an excel file named list.xlsx. How many sheets are there in the excel file? What’s in each sheet?
setwd("~/Desktop/21fall/2383 R/hw")
library(writexl)
write_xlsx(my_list, "list.xlsx")
From the result, we can see that there are 2 sheets in the excel, and df1 data is in the sheet1, df2 data is in the shee2.
###2) Write R code to import the first two rows and the first column of the second sheet from the excel file you just created.
library(readxl)
my_df2 <- read_excel("list.xlsx",sheet = "df2", range = "A1:A3")
my_df2
## # A tibble: 2 × 1
## word2
## <chr>
## 1 three
## 2 <NA>
#6.1 ###1) Using ahp dataset, Create a new tibble named some_apr that contains all houses that are built before year 2000 (not including 2000), sold on or after year 2009, and with 2 or 3 bedrooms.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(r02pro)
ahp
## # A tibble: 2,048 × 56
## dt_sold yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2010-03-25 2010 3 1976 2005 60 1Fam
## 2 2009-04-10 2009 4 1996 1997 60 1Fam
## 3 2010-01-15 2010 1 1953 2007 20 1Fam
## 4 2010-04-19 2010 4 2006 2007 60 1Fam
## 5 2010-03-22 2010 3 1900 1993 50 1Fam
## 6 2010-06-06 2010 6 1966 2002 20 1Fam
## 7 2006-06-14 2006 6 2005 2006 180 TwnhsE
## 8 2010-05-08 2010 5 1959 1959 20 1Fam
## 9 2007-06-14 2007 6 1952 1952 20 1Fam
## 10 2007-09-01 2007 9 1969 1969 60 1Fam
## # … with 2,038 more rows, and 49 more variables: house_style <chr>,
## # zoning <chr>, neighborhd <chr>, oa_cond <dbl>, oa_qual <dbl>, func <chr>,
## # liv_area <dbl>, 1fl_area <dbl>, 2fl_area <dbl>, tot_rms <dbl>,
## # bedroom <dbl>, bathroom <dbl>, kit <dbl>, kit_qual <chr>,
## # central_air <chr>, elect <chr>, bsmt_area <dbl>, bsmt_cond <chr>,
## # bsmt_exp <chr>, bsmt_fin_qual <chr>, bsmt_ht <chr>, ext_cond <chr>,
## # ext_cover <chr>, ext_qual <chr>, fdn <chr>, fence <chr>, fp <dbl>, …
some_apr <- tibble(filter(ahp, yr_built < 2000, yr_sold >= 2009, bedroom == c(2,3)))
some_apr
## # A tibble: 201 × 56
## dt_sold yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2010-05-08 2010 5 1959 1959 20 1Fam
## 2 2009-06-28 2009 6 1963 1963 20 1Fam
## 3 2010-03-02 2010 3 1992 2007 20 1Fam
## 4 2009-04-24 2009 4 1961 1961 20 1Fam
## 5 2010-01-16 2010 1 1977 1977 20 1Fam
## 6 2010-03-14 2010 3 1970 1970 160 Twnhs
## 7 2009-07-20 2009 7 1971 2006 20 1Fam
## 8 2010-02-19 2010 2 1962 1962 20 1Fam
## 9 2009-06-05 2009 6 1947 1950 20 1Fam
## 10 2009-05-19 2009 5 1978 1998 20 1Fam
## # … with 191 more rows, and 49 more variables: house_style <chr>, zoning <chr>,
## # neighborhd <chr>, oa_cond <dbl>, oa_qual <dbl>, func <chr>, liv_area <dbl>,
## # 1fl_area <dbl>, 2fl_area <dbl>, tot_rms <dbl>, bedroom <dbl>,
## # bathroom <dbl>, kit <dbl>, kit_qual <chr>, central_air <chr>, elect <chr>,
## # bsmt_area <dbl>, bsmt_cond <chr>, bsmt_exp <chr>, bsmt_fin_qual <chr>,
## # bsmt_ht <chr>, ext_cond <chr>, ext_cover <chr>, ext_qual <chr>, fdn <chr>,
## # fence <chr>, fp <dbl>, fp_qual <chr>, gar_area <dbl>, gar_car <dbl>, …
#6.2
###1) Using the ahp dataset, Find all houses built in 2008 with house style as 2Story, then arrange the the observations in the ascending order of remodel year, and break the ties in the descending order of sale price.
library(dplyr)
df2008 <- filter(ahp, yr_built == 2008 , house_style == "2Story")
df2008
## # A tibble: 8 × 56
## dt_sold yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2009-04-13 2009 4 2008 2008 60 1Fam
## 2 2008-11-16 2008 11 2008 2008 60 1Fam
## 3 2009-06-19 2009 6 2008 2008 60 1Fam
## 4 2009-06-17 2009 6 2008 2008 60 1Fam
## 5 2008-01-27 2008 1 2008 2008 60 1Fam
## 6 2009-07-11 2009 7 2008 2008 60 1Fam
## 7 2009-07-16 2009 7 2008 2009 60 1Fam
## 8 2009-01-07 2009 1 2008 2009 60 1Fam
## # … with 49 more variables: house_style <chr>, zoning <chr>, neighborhd <chr>,
## # oa_cond <dbl>, oa_qual <dbl>, func <chr>, liv_area <dbl>, 1fl_area <dbl>,
## # 2fl_area <dbl>, tot_rms <dbl>, bedroom <dbl>, bathroom <dbl>, kit <dbl>,
## # kit_qual <chr>, central_air <chr>, elect <chr>, bsmt_area <dbl>,
## # bsmt_cond <chr>, bsmt_exp <chr>, bsmt_fin_qual <chr>, bsmt_ht <chr>,
## # ext_cond <chr>, ext_cover <chr>, ext_qual <chr>, fdn <chr>, fence <chr>,
## # fp <dbl>, fp_qual <chr>, gar_area <dbl>, gar_car <dbl>, gar_cond <chr>, …
arrange(df2008, yr_remodel, desc(sale_price))
## # A tibble: 8 × 56
## dt_sold yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2009-06-19 2009 6 2008 2008 60 1Fam
## 2 2008-11-16 2008 11 2008 2008 60 1Fam
## 3 2009-07-11 2009 7 2008 2008 60 1Fam
## 4 2009-04-13 2009 4 2008 2008 60 1Fam
## 5 2009-06-17 2009 6 2008 2008 60 1Fam
## 6 2008-01-27 2008 1 2008 2008 60 1Fam
## 7 2009-01-07 2009 1 2008 2009 60 1Fam
## 8 2009-07-16 2009 7 2008 2009 60 1Fam
## # … with 49 more variables: house_style <chr>, zoning <chr>, neighborhd <chr>,
## # oa_cond <dbl>, oa_qual <dbl>, func <chr>, liv_area <dbl>, 1fl_area <dbl>,
## # 2fl_area <dbl>, tot_rms <dbl>, bedroom <dbl>, bathroom <dbl>, kit <dbl>,
## # kit_qual <chr>, central_air <chr>, elect <chr>, bsmt_area <dbl>,
## # bsmt_cond <chr>, bsmt_exp <chr>, bsmt_fin_qual <chr>, bsmt_ht <chr>,
## # ext_cond <chr>, ext_cover <chr>, ext_qual <chr>, fdn <chr>, fence <chr>,
## # fp <dbl>, fp_qual <chr>, gar_area <dbl>, gar_car <dbl>, gar_cond <chr>, …
#6.3
###1) Find all houses built in 2006 and sold in January or July, then arrange the data in the ascending order of sold year, and finally create a bar chart to represent the distribution of kitchen quality. Combine all operations by using the pipe operator.
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.5 ✓ stringr 1.4.0
## ✓ tidyr 1.1.4 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(r02pro)
ahp %>%
filter(yr_built == 2006, mo_sold == c(1,7)) %>%
arrange(yr_sold) %>%
ggplot(mapping = aes(x=kit_qual)) + geom_bar()
#6.4 ###1) Select all variables with names ends with qual or area, then give the first three variables the new names a, b and c respectively. Then assign the new dataset to a name Qual.
library(tidyverse)
Qual <- ahp %>%
select(ends_with(c("qual","area"))) %>%
rename(a=oa_qual, b=kit_qual, c=bsmt_fin_qual)
Qual
## # A tibble: 2,048 × 16
## a b c ext_qual fp_qual gar_qual heat_qual liv_area `1fl_area`
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 6 Good GLQ Good <NA> Average Excellent 1479 725
## 2 7 Good GLQ Good Average Average Excellent 2122 913
## 3 5 Good GLQ Average <NA> Average Average 1057 1057
## 4 5 Average Unf Average <NA> Average Good 1444 744
## 5 6 Average Unf Average <NA> Average Average 1445 831
## 6 6 Good ALQ Good Good Average Excellent 1888 1888
## 7 6 Average GLQ Average <NA> Average Good 1072 1072
## 8 5 Average Rec Average <NA> Average Fair 1188 1188
## 9 5 Average Rec Average <NA> Average Average 924 924
## 10 5 Fair Rec Average Average Average Average 2080 1040
## # … with 2,038 more rows, and 7 more variables: 2fl_area <dbl>,
## # bsmt_area <dbl>, gar_area <dbl>, lot_area <dbl>, op_area <dbl>,
## # ep_area <dbl>, wd_area <dbl>
###2) In Qual, move liv_area to be the first column and heat_qual to be the second column while keeping the same order for the remaining columns.
Qual %>% select(liv_area, heat_qual, everything())
## # A tibble: 2,048 × 16
## liv_area heat_qual a b c ext_qual fp_qual gar_qual `1fl_area`
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 1479 Excellent 6 Good GLQ Good <NA> Average 725
## 2 2122 Excellent 7 Good GLQ Good Average Average 913
## 3 1057 Average 5 Good GLQ Average <NA> Average 1057
## 4 1444 Good 5 Average Unf Average <NA> Average 744
## 5 1445 Average 6 Average Unf Average <NA> Average 831
## 6 1888 Excellent 6 Good ALQ Good Good Average 1888
## 7 1072 Good 6 Average GLQ Average <NA> Average 1072
## 8 1188 Fair 5 Average Rec Average <NA> Average 1188
## 9 924 Average 5 Average Rec Average <NA> Average 924
## 10 2080 Average 5 Fair Rec Average Average Average 1040
## # … with 2,038 more rows, and 7 more variables: 2fl_area <dbl>,
## # bsmt_area <dbl>, gar_area <dbl>, lot_area <dbl>, op_area <dbl>,
## # ep_area <dbl>, wd_area <dbl>
#6.5
Create a new variables named age being the age of the house when it was sold in terms of years (the number of years from when the house was built to when the house was sold). Then, select the variables age, sale_price, and kit_qual. Finally, generate a scatterplot between age (x-axis) and sale_price (y-axis), with different colors representing different kit_qual. Explain the findings from the figure.
ahp %>%
mutate(age = yr_sold - yr_built) %>%
ggplot() +
geom_point(mapping = aes(x = age, y = sale_price, color = kit_qual ))
## Warning: Removed 5 rows containing missing values (geom_point).
#6.6 ###1) For each month when the house was sold, summarize the 1st and 3rd quantile of the sale price. Then, create a scatterplot between the month (x-axis) and the quantile of the sale price with different colors for 1st and 3rd quantile. Explain the findings from the figure.
ahp %>%
group_by(mo_sold) %>%
summarize( prob = c(0.25, 0.75), q_price = quantile(sale_price, c(0.25, 0.75), na.rm = TRUE)) %>%
ggplot(mapping = aes(x = mo_sold, y = q_price, color = factor(prob))) +
geom_point()
## `summarise()` has grouped output by 'mo_sold'. You can override using the `.groups` argument.
###2) Someone has a conjecture that that the houses that are less than 30 years old have a higher sale price than the houses that are more than 30 years old. Show whether this is true in terms of maximum price, median price, and the minimum price for the houses in each group.
ahp %>%
group_by(yr_built) %>%
summarize(min_price=min(sale_price), max_price=max(sale_price), median_price=median(sale_price)) %>%
ggplot() + geom_point(mapping=aes(x=yr_built, y=min_price, color = yr_built > 30))
## Warning: Removed 5 rows containing missing values (geom_point).