DATA606 - Project Proposal

# Read the original PUMS person-level file from the local working directory.
# read.csv() already treats the first row as the header and "," as the separator.

old_data <- read.csv("csv_pny/ss09pny.csv")
head(old_data, n = 2)

##   RT SERIALNO SPORDER PUMA ST ADJINC PWGTP AGEP CIT CITWP COW DDRS DEAR
## 1  P        4       1  401 36 999480     9   79   4  1954  NA    2    2
## 2  P        4       2  401 36 999480    10   75   3    NA  NA    2    2
##   DEYE DOUT DPHY DRAT DRATX DREM ENG FER GCL GCM GCR HINS1 HINS2 HINS3
## 1    2    2    2   NA    NA    2   2  NA   2  NA  NA     2     1     1
## 2    2    2    2   NA    NA    2   1  NA   2  NA  NA     2     1     1
##   HINS4 HINS5 HINS6 HINS7 INTP JWMNP JWRIP JWTR LANX MAR MARHD MARHM MARHT
## 1     2     2     2     2    0    NA    NA   NA    1   1     2     2     1
## 2     2     2     2     2    0    NA    NA   NA    1   1     2     2     1
##   MARHW MARHYP MIG MIL MLPA MLPB MLPC MLPD MLPE MLPF MLPG MLPH MLPI MLPJ
## 1     2   1953   1   5   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 2     2   1953   1   5   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
##   MLPK NWAB NWAV NWLA NWLK NWRE OIP PAP REL RETP SCH SCHG SCHL SEMP SEX
## 1   NA    2    5    2    2    3   0   0   0    0   1   NA   15    0   1
## 2   NA    2    5    2    2    3   0   0   1    0   1   NA    1    0   2
##   SSIP  SSP WAGP WKHP WKL WKW WRK YOEP ANC ANC1P ANC2P DECADE DIS DRIVESP
## 1    0 9300    0   NA   3  NA   2 1947   1    21   999      1   2      NA
## 2    0 3600    0   NA   3  NA   2 1950   1    21   999      2   2      NA
##   ESP ESR FOD1P FOD2P HICOV HISP INDP JWAP JWDP LANP MIGPUMA MIGSP MSP
## 1  NA   6    NA    NA     1    1   NA   NA   NA  610      NA    NA   1
## 2  NA   6    NA    NA     1    1   NA   NA   NA  610      NA    NA   1
##   NAICSP NATIVITY NOP OC OCCP PAOC PERNP PINCP POBP POVPIP POWPUMA POWSP
## 1               2  NA  0   NA   NA     0  9300  126     99      NA    NA
## 2               1  NA  0   NA    4     0  3600  126     99      NA    NA
##   PRIVCOV PUBCOV QTRBIR RAC1P RAC2P RAC3P RACAIAN RACASN RACBLK RACNHPI
## 1       1      1      2     1     1    69       0      0      0       0
## 2       1      1      2     1     1    69       0      0      0       0
##   RACNUM RACSOR RACWHT RC SCIENGP SCIENGRLP SFN SFR SOCP VPS WAOB FAGEP
## 1      1      0      1  0      NA        NA  NA  NA       NA    5     0
## 2      1      0      1  0      NA        NA  NA  NA       NA    5     0
##   FANCP FCITP FCITWP FCOWP FDDRSP FDEARP FDEYEP FDOUTP FDPHYP FDRATP
## 1     0     0      0     0      0      0      0      0      0      0
## 2     0     0      0     0      0      0      0      0      0      0
##   FDRATXP FDREMP FENGP FESRP FFERP FFODP FGCLP FGCMP FGCRP FHINS1P FHINS2P
## 1       0      0     0     0     0     0     0     0     0       0       0
## 2       0      0     0     0     0     0     0     0     0       0       0
##   FHINS3C FHINS3P FHINS4C FHINS4P FHINS5C FHINS5P FHINS6P FHINS7P FHISP
## 1       0       0      NA       0      NA       0       0       0     0
## 2       0       0      NA       0      NA       0       0       0     0
##   FINDP FINTP FJWDP FJWMNP FJWRIP FJWTRP FLANP FLANXP FMARHDP FMARHMP
## 1     0     0     0      0      0      0     0      0       0       0
## 2     0     0     0      0      0      0     0      0       0       0
##   FMARHTP FMARHWP FMARHYP FMARP FMIGP FMIGSP FMILPP FMILSP FOCCP FOIP FPAP
## 1       0       0       0     0     0      0      0      0     0    0    0
## 2       0       0       0     0     0      0      0      0     0    0    0
##   FPOBP FPOWSP FRACP FRELP FRETP FSCHGP FSCHLP FSCHP FSEMP FSEXP FSSIP
## 1     0      0     0     0     0      0      0     0     0     0     0
## 2     0      0     0     0     0      0      0     0     0     0     0
##   FSSP FWAGP FWKHP FWKLP FWKWP FWRKP FYOEP pwgtp1 pwgtp2 pwgtp3 pwgtp4
## 1    0     0     0     0     0    NA     0     20      0     11      3
## 2    0     0     0     0     0    NA     0     16      1      8      3
##   pwgtp5 pwgtp6 pwgtp7 pwgtp8 pwgtp9 pwgtp10 pwgtp11 pwgtp12 pwgtp13
## 1      3     16     14      5      1      15      14      16      13
## 2      3     16     16      4      1      13      17      17      15
##   pwgtp14 pwgtp15 pwgtp16 pwgtp17 pwgtp18 pwgtp19 pwgtp20 pwgtp21 pwgtp22
## 1       9      15       8       7       2       2       4       2      23
## 2      11      17      10       9       2       2       5       4      27
##   pwgtp23 pwgtp24 pwgtp25 pwgtp26 pwgtp27 pwgtp28 pwgtp29 pwgtp30 pwgtp31
## 1       9      16      12       3       3       3      25       3       2
## 2       8      14      14       2       3       4      25       3       2
##   pwgtp32 pwgtp33 pwgtp34 pwgtp35 pwgtp36 pwgtp37 pwgtp38 pwgtp39 pwgtp40
## 1       4       2       8       4      11       8      12      17       4
## 2       3       3      10       3      10       8      14      18       3
##   pwgtp41 pwgtp42 pwgtp43 pwgtp44 pwgtp45 pwgtp46 pwgtp47 pwgtp48 pwgtp49
## 1      14       2       8       3       2      16      13       4       0
## 2      17       1       6       2       3      19      16       5       0
##   pwgtp50 pwgtp51 pwgtp52 pwgtp53 pwgtp54 pwgtp55 pwgtp56 pwgtp57 pwgtp58
## 1      16      16      18      16       8      19      10       9       3
## 2      18      14      17      19      10      18      10      10       3
##   pwgtp59 pwgtp60 pwgtp61 pwgtp62 pwgtp63 pwgtp64 pwgtp65 pwgtp66 pwgtp67
## 1       3       5       3      28       8      16      17       2       2
## 2       2       4       2      30       7      19      17       2       3
##   pwgtp68 pwgtp69 pwgtp70 pwgtp71 pwgtp72 pwgtp73 pwgtp74 pwgtp75 pwgtp76
## 1       4      27       3       1       1       3      10       3       9
## 2       2      31       2       2       3       3      10       2       9
##   pwgtp77 pwgtp78 pwgtp79 pwgtp80
## 1       9      15      15       3
## 2       8      19      17       4

# Number of rows (cases) and columns (variables) in the original data
dim(old_data)

## [1] 188767    279

# Keep only the eight variables used in this project and replace the PUMS
# codes with descriptive column names (same order as the selection).

new_data <- setNames(
  old_data[c("AGEP", "CIT", "COW", "SCHG", "SCHL", "SEX", "PERNP", "PINCP")],
  c(
    "Age", "Citizenship_Status", "Worker_Class", "School_Attending",
    "Educational_Attainment", "SEX", "Total_Personal_Earnings",
    "Total_Personal_Income"
  )
)

# Write the subset to a .csv file in the local GitHub folder.
# row.names = FALSE stops write.csv() from emitting the row index as an extra
# unnamed first column, which read.csv() would otherwise import as a
# meaningless "X" variable.
# NOTE(review): the absolute path below only resolves on the author's machine.
write.csv(
  new_data,
  file = "C:/Users/Nabila/Documents/GitHub/Class-DATA606/Project/2009PUMS_PERSON_DATA_NY.csv",
  row.names = FALSE
)

# Load the published copy of the subset directly from GitHub and preview
# its first six rows.
PUMS_NY <- read.csv(
  "https://raw.githubusercontent.com/nabilahossain/Class-DATA606/master/Project/2009PUMS_PERSON_DATA_NY.csv"
)
head(PUMS_NY, n = 6L)

##   X Age Citizenship_Status Worker_Class School_Attending
## 1 1  79                  4           NA               NA
## 2 2  75                  3           NA               NA
## 3 3  68                  4           NA               NA
## 4 4  68                  1           NA               NA
## 5 5  69                  1           NA               NA
## 6 6  46                  4            1               NA
##   Educational_Attainment SEX Total_Personal_Earnings Total_Personal_Income
## 1                     15   1                       0                  9300
## 2                      1   2                       0                  3600
## 3                     22   2                       0                  3800
## 4                     22   2                       0                 84200
## 5                     23   1                       0                 92500
## 6                     16   2                    3800                  3800

Research question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.

Answer: I have two questions in mind: 1) Does education have a role in a person’s income/earnings? Does it change if you are female or if you are over 35? 2) Do people with higher education have higher income/earnings? Does it change by citizenship status or by which class of worker you are?

Cases

What are the cases, and how many are there? Answer: The original data contain 188,767 cases with 279 variables. Since it is too big to put on GitHub, I will keep all the cases and work with only 8-9 variables. I put the final data set that I will be working with on GitHub. Each case is a response from one person. The variables that I chose are: “Age”, “Citizenship Status”, “Worker Class”, “School Attending”, “Educational Attainment”, “SEX”, “Total Personal Earnings”, and “Total Personal Income”.

Data collection

Describe the method of data collection.

The data was collected by the American Community Survey (ACS). The data is personal information from people living in New York State in 2009. Here is a description of the data taken from the PDF that comes with the data:

“The Public Use Microdata Sample (PUMS) contains a sample of actual responses to the American Community Survey (ACS). The PUMS dataset includes variables for nearly every question on the survey… Each record in the file represents a single person… In the person-level file, individuals are organized into households, making possible the study people within the contexts of their families and other household members. The PUMS contains data on approximately one percent of the United States population.”

Type of study

What type of study is this (observational/experiment)?

Answer: This is an observational study, where people are surveyed about themselves and their lives.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.

Answer: I obtained the data from: DATA.GOV (“The home of the U.S. Government’s open data”) https://catalog.data.gov/dataset/2009-american-community-survey-1-year-pums-housing-file The data can also be obtained from United States Census Bureau at: http://www.census.gov/programs-surveys/acs/data/pums.html I obtained the details about the data and codes for variables from: https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.2009.html

Response

What is the response variable, and what type is it (numerical/categorical)? Answer: The response variables that I chose are “Total Personal Earnings” and “Total Personal Income”. I do not know which one is the best to use for this data. Both of these variables are numerical, discrete (rounded to the nearest dollar).

Explanatory

What is the explanatory variable, and what type is it (numerical/categorical)? Answer: The explanatory variables are “Age”, “Citizenship Status”, “Worker Class”, “School Attending”, “Educational Attainment”, and “SEX”. “Age” is numerical, discrete and “SEX” is categorical. The rest of the variables are categorical, although they are stored as numeric codes.

Relevant summary statistics

Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

# Min, quartiles, median, mean, max and NA count for every column.
# NOTE(review): the coded categorical columns (e.g. SEX, Citizenship_Status)
# are summarised as plain numbers here, so their means are not meaningful.
summary(PUMS_NY)

##        X               Age        Citizenship_Status  Worker_Class  
##  Min.   :     1   Min.   : 0.00   Min.   :1.000      Min.   :1.00   
##  1st Qu.: 47193   1st Qu.:20.00   1st Qu.:1.000      1st Qu.:1.00   
##  Median : 94384   Median :41.00   Median :1.000      Median :1.00   
##  Mean   : 94384   Mean   :40.04   Mean   :1.632      Mean   :2.18   
##  3rd Qu.:141576   3rd Qu.:58.00   3rd Qu.:1.000      3rd Qu.:3.00   
##  Max.   :188767   Max.   :94.00   Max.   :5.000      Max.   :9.00   
##                                                      NA's   :77675  
##  School_Attending Educational_Attainment      SEX       
##  Min.   : 1.0     Min.   : 1.00          Min.   :1.000  
##  1st Qu.: 6.0     1st Qu.:13.00          1st Qu.:1.000  
##  Median :11.0     Median :17.00          Median :2.000  
##  Mean   : 9.8     Mean   :15.88          Mean   :1.521  
##  3rd Qu.:15.0     3rd Qu.:20.00          3rd Qu.:2.000  
##  Max.   :16.0     Max.   :24.00          Max.   :2.000  
##  NA's   :141365   NA's   :6108                          
##  Total_Personal_Earnings Total_Personal_Income
##  Min.   : -7400          Min.   : -13200      
##  1st Qu.:     0          1st Qu.:   7000      
##  Median : 12000          Median :  22400      
##  Mean   : 31315          Mean   :  38837      
##  3rd Qu.: 42100          3rd Qu.:  50000      
##  Max.   :957000          Max.   :1225000      
##  NA's   :35877           NA's   :33251

library(psych)

## Warning: package 'psych' was built under R version 3.2.3

# Per-column descriptive statistics (n, mean, sd, median, skew, kurtosis, se, ...)
describe(PUMS_NY)

##                         vars      n     mean       sd median  trimmed
## X                          1 188767 94384.00 54492.48  94384 94384.00
## Age                        2 188767    40.04    23.29     41    39.52
## Citizenship_Status         3 188767     1.63     1.33      1     1.32
## Worker_Class               4 111092     2.18     1.88      1     1.77
## School_Attending           5  47402     9.80     4.86     11    10.12
## Educational_Attainment     6 182659    15.88     5.71     17    16.66
## SEX                        7 188767     1.52     0.50      2     1.53
## Total_Personal_Earnings    8 152890 31314.95 60186.50  12000 20004.14
## Total_Personal_Income      9 155516 38837.03 63795.86  22400 27510.40
##                              mad    min     max   range  skew kurtosis
## X                       69966.86      1  188767  188766  0.00    -1.20
## Age                        28.17      0      94      94  0.12    -0.93
## Citizenship_Status          0.00      1       5       4  1.73     1.23
## Worker_Class                0.00      1       9       8  1.63     1.72
## School_Attending            5.93      1      16      15 -0.37    -1.23
## Educational_Attainment      4.45      1      24      23 -1.07     0.39
## SEX                         0.00      1       2       1 -0.08    -1.99
## Total_Personal_Earnings 17791.20  -7400  957000  964400  5.93    50.84
## Total_Personal_Income   27724.62 -13200 1225000 1238200  6.04    53.62
##                             se
## X                       125.42
## Age                       0.05
## Citizenship_Status        0.00
## Worker_Class              0.01
## School_Attending          0.02
## Educational_Attainment    0.01
## SEX                       0.00
## Total_Personal_Earnings 153.93
## Total_Personal_Income   161.77

# Histograms of the two candidate response variables, then of two of the
# explanatory variables. hist() builds its default title and x-axis label
# from the expression it is given, so each plot is labelled with the
# PUMS_NY$<column> it shows.
# NOTE(review): Educational_Attainment holds coded levels (1-24); a bar plot
# of table() may suit it better than a histogram - confirm.
hist(PUMS_NY$Total_Personal_Earnings)
hist(PUMS_NY$Total_Personal_Income)
hist(PUMS_NY$Educational_Attainment)
hist(PUMS_NY$Age)