1 Cross Section Data

1.1 Infidelity data

Variables Description

# install.packages("AER")
library("AER")
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
data("Affairs")

str(Affairs)
## 'data.frame':    601 obs. of  9 variables:
##  $ affairs      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gender       : Factor w/ 2 levels "female","male": 2 1 1 2 2 1 1 2 1 2 ...
##  $ age          : num  37 27 32 57 22 32 22 57 32 22 ...
##  $ yearsmarried : num  10 4 15 15 0.75 1.5 0.75 15 15 1.5 ...
##  $ children     : Factor w/ 2 levels "no","yes": 1 1 2 2 1 1 1 2 2 1 ...
##  $ religiousness: int  3 4 1 5 2 2 2 2 4 4 ...
##  $ education    : num  18 14 12 18 17 17 12 14 16 14 ...
##  $ occupation   : int  7 6 1 6 6 5 1 4 1 4 ...
##  $ rating       : int  4 4 4 5 3 5 3 4 2 5 ...
head(Affairs)
##    affairs gender age yearsmarried children religiousness education occupation
## 4        0   male  37        10.00       no             3        18          7
## 5        0 female  27         4.00       no             4        14          6
## 11       0 female  32        15.00      yes             1        12          1
## 16       0   male  57        15.00      yes             5        18          6
## 23       0   male  22         0.75       no             2        17          6
## 29       0 female  32         1.50       no             2        17          5
##    rating
## 4       4
## 5       4
## 11      4
## 16      5
## 23      3
## 29      5
tail(Affairs[,-c(5:9)])
##      affairs gender age yearsmarried
## 1935       7   male  47         15.0
## 1938       1   male  22          1.5
## 1941       7 female  32         10.0
## 1954       2   male  32         10.0
## 1959       2   male  22          7.0
## 9010       1 female  32         15.0
## Greene (2003)
## Tab. 22.3 and 22.4

1.1.1 Scatterplot

This scatterplot will show each data point as a dot, with age on the x-axis and the number of affairs on the y-axis.

# Create a scatterplot
plot(Affairs$age, Affairs$affairs, 
     xlab = "Age", 
     ylab = "Number of Affairs",
     main = "Scatterplot of Age vs. Number of Affairs"
     )

1.1.2 Barplot

This bar plot will show the frequency of affairs for each gender category.

# Create a bar plot of the frequency of affairs by gender
barplot(table(Affairs$gender),
        xlab = "Gender", 
        ylab = "Frequency",
        main = "Bar Plot of Frequency of Affairs by Gender")

# install.packages("texreg")
library(texreg)
## Version:  1.39.4
## Date:     2024-07-23
## Author:   Philip Leifeld (University of Manchester)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
# Simple Linear Regression models
age           <- lm(affairs ~ age          , data = Affairs)
children      <- lm(affairs ~ children     , data = Affairs)
religiousness <- lm(affairs ~ religiousness, data = Affairs)
yearsmarried  <- lm(affairs ~ yearsmarried , data = Affairs)
rating        <- lm(affairs ~ rating       , data = Affairs)

# Multiple Linear Regression
m1 <- lm(affairs ~ age + children + religiousness + yearsmarried + rating, data = Affairs)

# Creating a table of all models using screenreg
screenreg(list(age, children, religiousness, yearsmarried, rating, m1))
## 
## ===================================================================================
##                Model 1   Model 2     Model 3     Model 4     Model 5     Model 6   
## -----------------------------------------------------------------------------------
## (Intercept)      0.36      0.91 ***    2.73 ***    0.55 *      4.74 ***    5.99 ***
##                 (0.49)    (0.25)      (0.38)      (0.24)      (0.48)      (0.79)   
## age              0.03 *                                                   -0.04 *  
##                 (0.01)                                                    (0.02)   
## childrenyes                0.76 *                                         -0.20    
##                           (0.30)                                          (0.34)   
## religiousness                         -0.41 ***                           -0.49 ***
##                                       (0.11)                              (0.11)   
## yearsmarried                                       0.11 ***                0.17 ***
##                                                   (0.02)                  (0.04)   
## rating                                                        -0.84 ***   -0.71 ***
##                                                               (0.12)      (0.12)   
## -----------------------------------------------------------------------------------
## R^2              0.01      0.01        0.02        0.03        0.08        0.13    
## Adj. R^2         0.01      0.01        0.02        0.03        0.08        0.12    
## Num. obs.      601       601         601         601         601         601       
## ===================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

2 Time Series

2.1 Souvenir Sales Data

Monthly sales for a souvenir shop at a beach resort town in Queensland, Australia, for January 1987-December 1993.

Source

souvenir <- scan("http://robjhyndman.com/tsdldata/data/fancy.dat")

?plot
## Help on topic 'plot' was found in the following packages:
## 
##   Package               Library
##   base                  /Library/Frameworks/R.framework/Resources/library
##   graphics              /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
## 
## 
## Using the first match ...
plot(souvenir, 
     xlab = "Time", 
     ylab = "Souvenir Sales", 
     type = "b"
     )

We can do better and plat the year values instead of data index values.

Let me now declare the data as time series.

souvenirtimeseries <- ts(souvenir, 
                         frequency = 12, 
                         start     = c(1987,1)
                         )
souvenirtimeseries
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 1987   1664.81   2397.53   2840.71   3547.29   3752.96   3714.74   4349.61
## 1988   2499.81   5198.24   7225.14   4806.03   5900.88   4951.34   6179.12
## 1989   4717.02   5702.63   9957.58   5304.78   6492.43   6630.80   7349.62
## 1990   5921.10   5814.58  12421.25   6369.77   7609.12   7224.75   8121.22
## 1991   4826.64   6470.23   9638.77   8821.17   8722.37  10209.48  11276.55
## 1992   7615.03   9849.69  14558.40  11587.33   9332.56  13082.09  16732.78
## 1993  10243.24  11266.88  21826.84  17357.33  15997.79  18601.53  26155.15
##            Aug       Sep       Oct       Nov       Dec
## 1987   3566.34   5021.82   6423.48   7600.60  19756.21
## 1988   4752.15   5496.43   5835.10  12600.08  28541.72
## 1989   8176.62   8573.17   9690.50  15151.84  34061.01
## 1990   7979.25   8093.06   8476.70  17914.66  30114.41
## 1991  12552.22  11637.39  13606.89  21822.11  45060.69
## 1992  19888.61  23933.38  25391.35  36024.80  80721.71
## 1993  28586.52  30505.41  30821.33  46634.38 104660.67
options(scipen = 200)
options(digits = 2)

Lets plot the data with ?plot.ts

?plot.ts
plot.ts(souvenirtimeseries,
        ylab = "Souvenir Sales",
        xlab = " Time (monthly)"
        )  

3 Pooled Cross Section

Pooled cross-sectional data typically involves data from multiple cross-sectional surveys conducted at different points in time, with the goal of combining them into a single dataset for analysis.

3.1 CPS78_85

Pooled Cross Section data for two years

  • N=1084 (550 and 534)

  • t=2 (1978 and 1985)

Note you have different people in CPS in year 1978 and 1985.

Source

# install.packages("wooldridge")
library("wooldridge")
data("cps78_85")
# summary(cps78_85)

str(cps78_85)
## 'data.frame':    1084 obs. of  15 variables:
##  $ educ    : int  12 12 6 12 12 8 11 15 16 15 ...
##  $ south   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nonwhite: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ female  : int  0 1 0 0 0 0 0 1 1 0 ...
##  $ married : int  0 1 1 1 1 1 0 0 0 1 ...
##  $ exper   : int  8 30 38 19 11 43 2 9 17 23 ...
##  $ expersq : int  64 900 1444 361 121 1849 4 81 289 529 ...
##  $ union   : int  0 1 1 1 0 0 0 0 0 1 ...
##  $ lwage   : num  1.22 1.61 2.14 2.07 1.65 ...
##  $ age     : int  25 47 49 36 28 56 18 29 38 43 ...
##  $ year    : int  78 78 78 78 78 78 78 78 78 78 ...
##  $ y85     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y85fem  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y85educ : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y85union: int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
help(cps78_85)

?table
table(cps78_85$y85)
## 
##   0   1 
## 550 534

You need to inspect the structure and characteristics of the dataset itself. Will have to read up on CPS website to confirm this is pooled cross section data.

4 Panel Data

4.1 Countries

7 countries (A-G) tracked from 1990 to 2000 (time=10 years) each.

The two way table shows the panel data is balanced.

coplot seems to be useful here as well and gives us 7 charts - one for each country for the 10 years.

Source

# install.packages("foriegn")

library(foreign)    
Panel <- read.dta(file = "http://dss.princeton.edu/training/Panel101.dta")
help(Panel)
## No documentation for 'Panel' in specified packages and libraries:
## you could try '??Panel'
 head(x = Panel, n = 11L)
##    country year           y y_bin     x1    x2     x3   opinion op
## 1        A 1990  1342787840     1  0.278 -1.11  0.283 Str agree  1
## 2        A 1991 -1899660544     0  0.321 -0.95  0.493     Disag  0
## 3        A 1992   -11234363     0  0.363 -0.79  0.703     Disag  0
## 4        A 1993  2645775360     1  0.246 -0.89 -0.094     Disag  0
## 5        A 1994  3008334848     1  0.425 -0.73  0.946     Disag  0
## 6        A 1995  3229574144     1  0.477 -0.72  1.030 Str agree  1
## 7        A 1996  2756754176     1  0.500 -0.78  1.092     Disag  0
## 8        A 1997  2771810560     1  0.052 -0.70  1.416 Str agree  1
## 9        A 1998  3397338880     1  0.366 -0.70  1.549     Disag  0
## 10       A 1999    39770336     1  0.396 -0.64  1.794 Str disag  0
## 11       B 1990 -5934699520     0 -0.082  1.43  0.023     Agree  1
 tail(x = Panel, n = 11L)
##    country year           y y_bin   x1    x2  x3   opinion op
## 60       F 1999  6349319168     1 0.28 -0.46 1.2     Disag  0
## 61       G 1990  1342787840     1 0.94 -1.52 1.5 Str disag  0
## 62       G 1991 -1518985728     0 1.10 -1.46 1.4     Agree  1
## 63       G 1992  1912769920     1 1.25 -1.41 1.4 Str agree  1
## 64       G 1993  1345690240     1 0.76 -1.35 1.9 Str disag  0
## 65       G 1994  2793515008     1 1.21 -1.33 2.2 Str disag  0
## 66       G 1995  1323696384     1 1.09 -1.41 2.8 Str disag  0
## 67       G 1996   254524176     1 0.78 -1.33 4.3 Str agree  1
## 68       G 1997  3297033216     1 1.26 -1.58 4.6     Disag  0
## 69       G 1998  3011820800     1 1.24 -1.60 6.1     Disag  0
## 70       G 1999  3296283392     1 1.23 -1.62 7.2     Disag  0
 # simplest way to 
table(Panel$year, Panel$country) 
##       
##        A B C D E F G
##   1990 1 1 1 1 1 1 1
##   1991 1 1 1 1 1 1 1
##   1992 1 1 1 1 1 1 1
##   1993 1 1 1 1 1 1 1
##   1994 1 1 1 1 1 1 1
##   1995 1 1 1 1 1 1 1
##   1996 1 1 1 1 1 1 1
##   1997 1 1 1 1 1 1 1
##   1998 1 1 1 1 1 1 1
##   1999 1 1 1 1 1 1 1
?coplot  # conditioning plots
coplot(data     =  Panel,
       formula  = y ~ year | country, 
       type     = "b"  # Points and lines
       ) 

scatterplot(y ~ year | country, 
            boxplots = FALSE, 
            smooth   = TRUE, 
            reg.line = FALSE, 
            data     = Panel
            )
## Warning in plot.window(...): "reg.line" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "reg.line" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "reg.line" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "reg.line" is not
## a graphical parameter
## Warning in box(...): "reg.line" is not a graphical parameter
## Warning in title(...): "reg.line" is not a graphical parameter

# fixed.dum <-lm(formula = y ~ x1 + factor(country) - 1, # no intercept but all dummies
#                data    = Panel
#                ) 
# 
# summary(fixed.dum)

5 Session Info

https://www.rostrum.blog/2018/10/13/sessioninfo/

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] foreign_0.8-87   wooldridge_1.4-3 texreg_1.39.4    AER_1.2-12      
##  [5] survival_3.7-0   sandwich_3.1-0   lmtest_0.9-40    zoo_1.8-12      
##  [9] car_3.1-2        carData_3.0-5   
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.7        cli_3.6.3         knitr_1.48        rlang_1.1.4      
##  [5] xfun_0.46         Formula_1.2-5     highr_0.11        jsonlite_1.8.8   
##  [9] htmltools_0.5.8.1 sass_0.4.9        rmarkdown_2.27    grid_4.4.1       
## [13] evaluate_0.24.0   jquerylib_0.1.4   abind_1.4-5       fastmap_1.2.0    
## [17] yaml_2.3.10       lifecycle_1.0.4   compiler_4.4.1    rstudioapi_0.16.0
## [21] lattice_0.22-6    digest_0.6.36     R6_2.5.1          splines_4.4.1    
## [25] bslib_0.8.0       Matrix_1.7-0      tools_4.4.1       cachem_1.1.0
# devtools::session_info()