#5.1 Use R to create the following data frame and assign it to the name my_data.

#library(tidyverse)
word <- c("one", "two", NA, "four", "five")
number <- c(1, NA, 3, 4, 5)
letter <- c(letters[1:5])
my_data <- data.frame(word, number, letter)
my_data
##   word number letter
## 1  one      1      a
## 2  two     NA      b
## 3 <NA>      3      c
## 4 four      4      d
## 5 five      5      e

Q1

Write R code to set working directory to desktop and write my_data into a .csv file named “my_data_no_name.csv” without column names.

#setwd("~/Desktop/21fall/2383 R/hw")
library(readr)
write_csv(my_data, "my_data_no_name.csv", col_names = FALSE)

Q2

Write R code to write my_data into a delimited file called “my_data_na.csv” with # as the delimiter and use 999 as the indicator for missing values.

write_delim(my_data, "my_data_na.csv", delim = "#" ,na = "999")

5.2

###1) For the “my_data_na.csv” file you created in Exercise 2 in Section 5.1, write R code to read the file into an object with name my_data.

library(readr)
my_data <- read.csv("my_data_na.csv")
my_data
##   word.number.letter
## 1            one#1#a
## 2          two#999#b
## 3            999#3#c
## 4           four#4#d
## 5           five#5#e

2)

Which of the following are the column names of the d1?

d1 <- read_csv("x,y,z
          1,3,5
          2,4,6", col_names = FALSE)
## Rows: 3 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): X1, X2, X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d1
## # A tibble: 3 × 3
##   X1    X2    X3   
##   <chr> <chr> <chr>
## 1 x     y     z    
## 2 1     3     5    
## 3 2     4     6

We can see that the column names of the d1 is X1, X2, and X3.

3)

What will be the column name(s) of the d1?

d1 <- read_csv("The first line 
          The second line
          The third line
          x,y,z
          1,3,5")
## Rows: 4 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): The first line
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d1
## Warning: One or more parsing issues, see `problems()` for details
## # A tibble: 4 × 1
##   `The first line`
##   <chr>           
## 1 The second line 
## 2 The third line  
## 3 x,y,z           
## 4 1,3,5

The column name of the d1 is “The first line”.

5.3

Use R to create the following data frame and assign it to the name df1.

word1 <- c("one", "two", NA)
number1 <- c(1,NA,3)
df1 <- data.frame(word1, number1)
df1
##   word1 number1
## 1   one       1
## 2   two      NA
## 3  <NA>       3
word2 <- c("three", NA, "five")
number2 <- c(3,4,5)
df2 <- data.frame(word2, number2)
df2
##   word2 number2
## 1 three       3
## 2  <NA>       4
## 3  five       5

Then create a list named my_list with df1 as the first element and df2 as the second element.

library(r02pro)
my_list <- list(df1=df1, df2=df2)
my_list
## $df1
##   word1 number1
## 1   one       1
## 2   two      NA
## 3  <NA>       3
## 
## $df2
##   word2 number2
## 1 three       3
## 2  <NA>       4
## 3  five       5

###1) Write R code to set working directory to the desktop, then export my_list into an excel file named list.xlsx. How many sheets are there in the excel file? What’s in each sheet?

setwd("~/Desktop/21fall/2383 R/hw")
library(writexl)
write_xlsx(my_list, "list.xlsx")

From the result, we can see that there are 2 sheets in the excel, and df1 data is in the sheet1, df2 data is in the shee2.

###2) Write R code to import the first two rows and the first column of the second sheet from the excel file you just created.

library(readxl)
my_df2 <- read_excel("list.xlsx",sheet = "df2", range = "A1:A3")
my_df2
## # A tibble: 2 × 1
##   word2
##   <chr>
## 1 three
## 2 <NA>

#6.1 ###1) Using ahp dataset, Create a new tibble named some_apr that contains all houses that are built before year 2000 (not including 2000), sold on or after year 2009, and with 2 or 3 bedrooms.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(r02pro)
ahp
## # A tibble: 2,048 × 56
##    dt_sold    yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
##    <date>       <dbl>   <dbl>    <dbl>      <dbl>      <dbl> <chr>    
##  1 2010-03-25    2010       3     1976       2005         60 1Fam     
##  2 2009-04-10    2009       4     1996       1997         60 1Fam     
##  3 2010-01-15    2010       1     1953       2007         20 1Fam     
##  4 2010-04-19    2010       4     2006       2007         60 1Fam     
##  5 2010-03-22    2010       3     1900       1993         50 1Fam     
##  6 2010-06-06    2010       6     1966       2002         20 1Fam     
##  7 2006-06-14    2006       6     2005       2006        180 TwnhsE   
##  8 2010-05-08    2010       5     1959       1959         20 1Fam     
##  9 2007-06-14    2007       6     1952       1952         20 1Fam     
## 10 2007-09-01    2007       9     1969       1969         60 1Fam     
## # … with 2,038 more rows, and 49 more variables: house_style <chr>,
## #   zoning <chr>, neighborhd <chr>, oa_cond <dbl>, oa_qual <dbl>, func <chr>,
## #   liv_area <dbl>, 1fl_area <dbl>, 2fl_area <dbl>, tot_rms <dbl>,
## #   bedroom <dbl>, bathroom <dbl>, kit <dbl>, kit_qual <chr>,
## #   central_air <chr>, elect <chr>, bsmt_area <dbl>, bsmt_cond <chr>,
## #   bsmt_exp <chr>, bsmt_fin_qual <chr>, bsmt_ht <chr>, ext_cond <chr>,
## #   ext_cover <chr>, ext_qual <chr>, fdn <chr>, fence <chr>, fp <dbl>, …
some_apr <- tibble(filter(ahp, yr_built < 2000, yr_sold >= 2009, bedroom == c(2,3)))
some_apr
## # A tibble: 201 × 56
##    dt_sold    yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
##    <date>       <dbl>   <dbl>    <dbl>      <dbl>      <dbl> <chr>    
##  1 2010-05-08    2010       5     1959       1959         20 1Fam     
##  2 2009-06-28    2009       6     1963       1963         20 1Fam     
##  3 2010-03-02    2010       3     1992       2007         20 1Fam     
##  4 2009-04-24    2009       4     1961       1961         20 1Fam     
##  5 2010-01-16    2010       1     1977       1977         20 1Fam     
##  6 2010-03-14    2010       3     1970       1970        160 Twnhs    
##  7 2009-07-20    2009       7     1971       2006         20 1Fam     
##  8 2010-02-19    2010       2     1962       1962         20 1Fam     
##  9 2009-06-05    2009       6     1947       1950         20 1Fam     
## 10 2009-05-19    2009       5     1978       1998         20 1Fam     
## # … with 191 more rows, and 49 more variables: house_style <chr>, zoning <chr>,
## #   neighborhd <chr>, oa_cond <dbl>, oa_qual <dbl>, func <chr>, liv_area <dbl>,
## #   1fl_area <dbl>, 2fl_area <dbl>, tot_rms <dbl>, bedroom <dbl>,
## #   bathroom <dbl>, kit <dbl>, kit_qual <chr>, central_air <chr>, elect <chr>,
## #   bsmt_area <dbl>, bsmt_cond <chr>, bsmt_exp <chr>, bsmt_fin_qual <chr>,
## #   bsmt_ht <chr>, ext_cond <chr>, ext_cover <chr>, ext_qual <chr>, fdn <chr>,
## #   fence <chr>, fp <dbl>, fp_qual <chr>, gar_area <dbl>, gar_car <dbl>, …

#6.2

###1) Using the ahp dataset, Find all houses built in 2008 with house style as 2Story, then arrange the the observations in the ascending order of remodel year, and break the ties in the descending order of sale price.

library(dplyr)
df2008 <- filter(ahp, yr_built == 2008 , house_style == "2Story")
df2008
## # A tibble: 8 × 56
##   dt_sold    yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
##   <date>       <dbl>   <dbl>    <dbl>      <dbl>      <dbl> <chr>    
## 1 2009-04-13    2009       4     2008       2008         60 1Fam     
## 2 2008-11-16    2008      11     2008       2008         60 1Fam     
## 3 2009-06-19    2009       6     2008       2008         60 1Fam     
## 4 2009-06-17    2009       6     2008       2008         60 1Fam     
## 5 2008-01-27    2008       1     2008       2008         60 1Fam     
## 6 2009-07-11    2009       7     2008       2008         60 1Fam     
## 7 2009-07-16    2009       7     2008       2009         60 1Fam     
## 8 2009-01-07    2009       1     2008       2009         60 1Fam     
## # … with 49 more variables: house_style <chr>, zoning <chr>, neighborhd <chr>,
## #   oa_cond <dbl>, oa_qual <dbl>, func <chr>, liv_area <dbl>, 1fl_area <dbl>,
## #   2fl_area <dbl>, tot_rms <dbl>, bedroom <dbl>, bathroom <dbl>, kit <dbl>,
## #   kit_qual <chr>, central_air <chr>, elect <chr>, bsmt_area <dbl>,
## #   bsmt_cond <chr>, bsmt_exp <chr>, bsmt_fin_qual <chr>, bsmt_ht <chr>,
## #   ext_cond <chr>, ext_cover <chr>, ext_qual <chr>, fdn <chr>, fence <chr>,
## #   fp <dbl>, fp_qual <chr>, gar_area <dbl>, gar_car <dbl>, gar_cond <chr>, …
arrange(df2008, yr_remodel, desc(sale_price))
## # A tibble: 8 × 56
##   dt_sold    yr_sold mo_sold yr_built yr_remodel bldg_class bldg_type
##   <date>       <dbl>   <dbl>    <dbl>      <dbl>      <dbl> <chr>    
## 1 2009-06-19    2009       6     2008       2008         60 1Fam     
## 2 2008-11-16    2008      11     2008       2008         60 1Fam     
## 3 2009-07-11    2009       7     2008       2008         60 1Fam     
## 4 2009-04-13    2009       4     2008       2008         60 1Fam     
## 5 2009-06-17    2009       6     2008       2008         60 1Fam     
## 6 2008-01-27    2008       1     2008       2008         60 1Fam     
## 7 2009-01-07    2009       1     2008       2009         60 1Fam     
## 8 2009-07-16    2009       7     2008       2009         60 1Fam     
## # … with 49 more variables: house_style <chr>, zoning <chr>, neighborhd <chr>,
## #   oa_cond <dbl>, oa_qual <dbl>, func <chr>, liv_area <dbl>, 1fl_area <dbl>,
## #   2fl_area <dbl>, tot_rms <dbl>, bedroom <dbl>, bathroom <dbl>, kit <dbl>,
## #   kit_qual <chr>, central_air <chr>, elect <chr>, bsmt_area <dbl>,
## #   bsmt_cond <chr>, bsmt_exp <chr>, bsmt_fin_qual <chr>, bsmt_ht <chr>,
## #   ext_cond <chr>, ext_cover <chr>, ext_qual <chr>, fdn <chr>, fence <chr>,
## #   fp <dbl>, fp_qual <chr>, gar_area <dbl>, gar_car <dbl>, gar_cond <chr>, …

#6.3

###1) Find all houses built in 2006 and sold in January or July, then arrange the data in the ascending order of sold year, and finally create a bar chart to represent the distribution of kitchen quality. Combine all operations by using the pipe operator.

library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.5     ✓ stringr 1.4.0
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(r02pro)
ahp %>%
  filter(yr_built == 2006, mo_sold == c(1,7)) %>%
  arrange(yr_sold) %>%
  ggplot(mapping = aes(x=kit_qual)) + geom_bar()

#6.4 ###1) Select all variables with names ends with qual or area, then give the first three variables the new names a, b and c respectively. Then assign the new dataset to a name Qual.

library(tidyverse)
Qual <- ahp %>% 
          select(ends_with(c("qual","area"))) %>% 
          rename(a=oa_qual, b=kit_qual, c=bsmt_fin_qual)
Qual
## # A tibble: 2,048 × 16
##        a b       c     ext_qual fp_qual gar_qual heat_qual liv_area `1fl_area`
##    <dbl> <chr>   <chr> <chr>    <chr>   <chr>    <chr>        <dbl>      <dbl>
##  1     6 Good    GLQ   Good     <NA>    Average  Excellent     1479        725
##  2     7 Good    GLQ   Good     Average Average  Excellent     2122        913
##  3     5 Good    GLQ   Average  <NA>    Average  Average       1057       1057
##  4     5 Average Unf   Average  <NA>    Average  Good          1444        744
##  5     6 Average Unf   Average  <NA>    Average  Average       1445        831
##  6     6 Good    ALQ   Good     Good    Average  Excellent     1888       1888
##  7     6 Average GLQ   Average  <NA>    Average  Good          1072       1072
##  8     5 Average Rec   Average  <NA>    Average  Fair          1188       1188
##  9     5 Average Rec   Average  <NA>    Average  Average        924        924
## 10     5 Fair    Rec   Average  Average Average  Average       2080       1040
## # … with 2,038 more rows, and 7 more variables: 2fl_area <dbl>,
## #   bsmt_area <dbl>, gar_area <dbl>, lot_area <dbl>, op_area <dbl>,
## #   ep_area <dbl>, wd_area <dbl>

###2) In Qual, move liv_area to be the first column and heat_qual to be the second column while keeping the same order for the remaining columns.

Qual %>% select(liv_area, heat_qual, everything())
## # A tibble: 2,048 × 16
##    liv_area heat_qual     a b       c     ext_qual fp_qual gar_qual `1fl_area`
##       <dbl> <chr>     <dbl> <chr>   <chr> <chr>    <chr>   <chr>         <dbl>
##  1     1479 Excellent     6 Good    GLQ   Good     <NA>    Average         725
##  2     2122 Excellent     7 Good    GLQ   Good     Average Average         913
##  3     1057 Average       5 Good    GLQ   Average  <NA>    Average        1057
##  4     1444 Good          5 Average Unf   Average  <NA>    Average         744
##  5     1445 Average       6 Average Unf   Average  <NA>    Average         831
##  6     1888 Excellent     6 Good    ALQ   Good     Good    Average        1888
##  7     1072 Good          6 Average GLQ   Average  <NA>    Average        1072
##  8     1188 Fair          5 Average Rec   Average  <NA>    Average        1188
##  9      924 Average       5 Average Rec   Average  <NA>    Average         924
## 10     2080 Average       5 Fair    Rec   Average  Average Average        1040
## # … with 2,038 more rows, and 7 more variables: 2fl_area <dbl>,
## #   bsmt_area <dbl>, gar_area <dbl>, lot_area <dbl>, op_area <dbl>,
## #   ep_area <dbl>, wd_area <dbl>

#6.5

Create a new variables named age being the age of the house when it was sold in terms of years (the number of years from when the house was built to when the house was sold). Then, select the variables age, sale_price, and kit_qual. Finally, generate a scatterplot between age (x-axis) and sale_price (y-axis), with different colors representing different kit_qual. Explain the findings from the figure.

ahp %>%
  mutate(age = yr_sold - yr_built) %>%
  ggplot() +
  geom_point(mapping = aes(x = age, y = sale_price, color = kit_qual ))
## Warning: Removed 5 rows containing missing values (geom_point).

#6.6 ###1) For each month when the house was sold, summarize the 1st and 3rd quantile of the sale price. Then, create a scatterplot between the month (x-axis) and the quantile of the sale price with different colors for 1st and 3rd quantile. Explain the findings from the figure.

ahp %>% 
    group_by(mo_sold) %>%
  summarize( prob = c(0.25, 0.75), q_price = quantile(sale_price, c(0.25, 0.75), na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = mo_sold, y = q_price, color = factor(prob))) +
  geom_point() 
## `summarise()` has grouped output by 'mo_sold'. You can override using the `.groups` argument.

###2) Someone has a conjecture that that the houses that are less than 30 years old have a higher sale price than the houses that are more than 30 years old. Show whether this is true in terms of maximum price, median price, and the minimum price for the houses in each group.

ahp %>% 
  group_by(yr_built) %>%
  summarize(min_price=min(sale_price), max_price=max(sale_price), median_price=median(sale_price)) %>%
  ggplot() + geom_point(mapping=aes(x=yr_built, y=min_price, color = yr_built > 30))
## Warning: Removed 5 rows containing missing values (geom_point).