Discussion week2

Author

Nuobing Fan

Data Sets

Note: packages only need to be installed once; after that, loading them with library() in each session is enough.

#library(datasets)
#?datasets()
#?AER
#install.packages("AER")
#install.packages("MASS")
#install.packages("https://cran.r-project.org/src/contrib/Archive/MASS/MASS_7.3-51.6.tar.gz", repos = NULL, type="source")

library(AER)
Warning: package 'AER' was built under R version 4.3.3
Loading required package: car
Warning: package 'car' was built under R version 4.3.3
Loading required package: carData
Warning: package 'carData' was built under R version 4.3.3
Loading required package: lmtest
Warning: package 'lmtest' was built under R version 4.3.3
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
Loading required package: sandwich
Warning: package 'sandwich' was built under R version 4.3.3
Loading required package: survival
library(MASS)
# Search the installed help pages for entries matching each package name.
help.search("AER")
help.search("MASS")

Set1(d1) and Set2(d2)

# Set1: California schools data, provided by the attached AER package
data(list = "CASchools")
# Set2: Boston housing data, provided by the attached MASS package
data(list = "Boston")

Preview the first rows of each data set

# Show the first six rows of the California schools data.
head(CASchools, n = 6L)
  district                          school  county grades students teachers
1    75119              Sunol Glen Unified Alameda  KK-08      195    10.90
2    61499            Manzanita Elementary   Butte  KK-08      240    11.15
3    61549     Thermalito Union Elementary   Butte  KK-08     1550    82.90
4    61457 Golden Feather Union Elementary   Butte  KK-08      243    14.00
5    61523        Palermo Union Elementary   Butte  KK-08     1335    71.50
6    62042         Burrel Union Elementary  Fresno  KK-08      137     6.40
  calworks   lunch computer expenditure    income   english  read  math
1   0.5102  2.0408       67    6384.911 22.690001  0.000000 691.6 690.0
2  15.4167 47.9167      101    5099.381  9.824000  4.583333 660.5 661.9
3  55.0323 76.3226      169    5501.955  8.978000 30.000002 636.3 650.9
4  36.4754 77.0492       85    7101.831  8.978000  0.000000 651.9 643.5
5  33.1086 78.4270      171    5235.988  9.080333 13.857677 641.8 639.9
6  12.3188 86.9565       25    5580.147 10.415000 12.408759 605.7 605.4
# Show the first six rows of the Boston housing data.
head(Boston, n = 6L)
     crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
  medv
1 24.0
2 21.6
3 34.7
4 33.4
5 36.2
6 28.7

Describe Data

Set1: CASchools (from the AER package) contains cross-sectional data on 420 California school districts, with variables such as student and teacher counts (from which the student-teacher ratio can be computed), average reading and math test scores, and per-student expenditure. These variables allow people to explore how resource allocation impacts student outcomes.

Set2: Boston (from the MASS package) contains data on housing values in suburbs of Boston, with key variables like the crime rate, average number of rooms, and the proportion of the population considered lower status. This data helps study the factors affecting housing prices.

Type of Data

CASchools: This dataset contains observations on 420 different California school districts at a single point in time, making it cross-sectional. Each row represents data from a different district, and there is no tracking of these districts over time.

# Set1: Scatterplot for CASchools (expenditure per student vs. test score)
# CASchools has no `testscr` column; it provides separate `read` and `math`
# scores. `CASchools$testscr` is therefore NULL, and plot(x, NULL) silently
# draws x against the row index instead of failing. The average test score is
# computed here as the mean of the reading and math scores.
with(
  CASchools,
  plot(expenditure, (read + math) / 2,
       main = "Expenditure per Student vs. Test Scores",
       xlab = "Expenditure per Student",
       ylab = "Average Test Score",
       pch = 19, col = "blue")
)
# Overlay reference grid lines on the finished scatterplot.
grid()

Boston: Similarly, the Boston dataset provides information about housing in different suburbs of Boston at a particular point in time. Since it includes data from various suburbs without tracking them across multiple periods, it is also considered cross-sectional data.

# Set2: Scatterplot for Boston (number of rooms vs. median home value)
# Columns are resolved inside the data frame via with(); x and y are passed
# by name so the axis mapping is explicit.
with(
  Boston,
  plot(
    x = rm,
    y = medv,
    main = "Number of Rooms vs. Median Home Value",
    xlab = "Number of Rooms",
    ylab = "Median Home Value ($1000s)",
    pch = 19,
    col = "red"
  )
)
# Overlay reference grid lines on the finished scatterplot.
grid()

Summary for both sets

# Per-column summary statistics for the California schools data.
summary(object = CASchools)
   district            school                  county      grades   
 Length:420         Length:420         Sonoma     : 29   KK-06: 61  
 Class :character   Class :character   Kern       : 27   KK-08:359  
 Mode  :character   Mode  :character   Los Angeles: 27              
                                       Tulare     : 24              
                                       San Diego  : 21              
                                       Santa Clara: 20              
                                       (Other)    :272              
    students          teachers          calworks          lunch       
 Min.   :   81.0   Min.   :   4.85   Min.   : 0.000   Min.   :  0.00  
 1st Qu.:  379.0   1st Qu.:  19.66   1st Qu.: 4.395   1st Qu.: 23.28  
 Median :  950.5   Median :  48.56   Median :10.520   Median : 41.75  
 Mean   : 2628.8   Mean   : 129.07   Mean   :13.246   Mean   : 44.71  
 3rd Qu.: 3008.0   3rd Qu.: 146.35   3rd Qu.:18.981   3rd Qu.: 66.86  
 Max.   :27176.0   Max.   :1429.00   Max.   :78.994   Max.   :100.00  
                                                                      
    computer       expenditure       income          english      
 Min.   :   0.0   Min.   :3926   Min.   : 5.335   Min.   : 0.000  
 1st Qu.:  46.0   1st Qu.:4906   1st Qu.:10.639   1st Qu.: 1.941  
 Median : 117.5   Median :5215   Median :13.728   Median : 8.778  
 Mean   : 303.4   Mean   :5312   Mean   :15.317   Mean   :15.768  
 3rd Qu.: 375.2   3rd Qu.:5601   3rd Qu.:17.629   3rd Qu.:22.970  
 Max.   :3324.0   Max.   :7712   Max.   :55.328   Max.   :85.540  
                                                                  
      read            math      
 Min.   :604.5   Min.   :605.4  
 1st Qu.:640.4   1st Qu.:639.4  
 Median :655.8   Median :652.5  
 Mean   :655.0   Mean   :653.3  
 3rd Qu.:668.7   3rd Qu.:665.9  
 Max.   :704.0   Max.   :709.5  
                                
# Per-column summary statistics for the Boston housing data.
summary(object = Boston)
      crim                zn             indus            chas        
 Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
 1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
 Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
 Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
 3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
 Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
      nox               rm             age              dis        
 Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
 1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
 Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
 Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
 3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
 Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
      rad              tax           ptratio          black       
 Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
 1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
 Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
 Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
 3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
 Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
     lstat            medv      
 Min.   : 1.73   Min.   : 5.00  
 1st Qu.: 6.95   1st Qu.:17.02  
 Median :11.36   Median :21.20  
 Mean   :12.65   Mean   :22.53  
 3rd Qu.:16.95   3rd Qu.:25.00  
 Max.   :37.97   Max.   :50.00