STATS-midterm.knit

library(haven)
library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(scales)
library(sur)

anes2020<-read_dta("C:\\Users\\BTP\\Downloads\\anes2020.dta")

anes2020$V201507x[anes2020$V201507x <0] <- NA
anes2020$V201600[anes2020$V201600 <0] <- NA
anes2020$V201231x[anes2020$V201231x <0] <- NA
anes2020$V202468x[anes2020$V202468x  <0] <- NA
anes2020$V202144[anes2020$V202144 <0] <- NA

anes2020 <- filter(anes2020, V201507x >= 35 & V201507x < 40)

anes2020 %>%
ggplot(mapping = aes(V202144))+
geom_histogram()+
ggtitle(label="Distribution of Simulated Salaries")+
xlab(label="Salaries")

## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 79 rows containing non-finite values (stat_bin).

anes2020 %>%
ggplot(data = anes2020,mapping = aes(V202144, stat=..density..))+geom_density()+ggtitle(label="Distribution of Salaries")+xlab(label="Salaries")

## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

## Warning: Removed 79 rows containing non-finite values (stat_density).

qqnorm(anes2020$V202144)

summary(anes2020$V202144)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00   15.00   35.04   70.00  100.00      79

anes2020 %>%
tabyl (V202144)

##  V202144   n     percent valid_percent
##        0 283 0.381916329   0.427492447
##        1   4 0.005398111   0.006042296
##        5   6 0.008097166   0.009063444
##       10   6 0.008097166   0.009063444
##       15  48 0.064777328   0.072507553
##       20   4 0.005398111   0.006042296
##       30  21 0.028340081   0.031722054
##       33   1 0.001349528   0.001510574
##       35   3 0.004048583   0.004531722
##       40  34 0.045883941   0.051359517
##       45   4 0.005398111   0.006042296
##       50  26 0.035087719   0.039274924
##       51   1 0.001349528   0.001510574
##       55   2 0.002699055   0.003021148
##       59   1 0.001349528   0.001510574
##       60  28 0.037786775   0.042296073
##       65   2 0.002699055   0.003021148
##       66   1 0.001349528   0.001510574
##       70  36 0.048582996   0.054380665
##       72   1 0.001349528   0.001510574
##       75   5 0.006747638   0.007552870
##       80   2 0.002699055   0.003021148
##       82   1 0.001349528   0.001510574
##       85  48 0.064777328   0.072507553
##       90   7 0.009446694   0.010574018
##       95   3 0.004048583   0.004531722
##       96   1 0.001349528   0.001510574
##       98   1 0.001349528   0.001510574
##       99   1 0.001349528   0.001510574
##      100  81 0.109311741   0.122356495
##       NA  79 0.106612686            NA

cumulative.table(anes2020$V202144)

##        0        1        5       10       15       20       30       33 
## 38.19163 38.73144 39.54116 40.35088 46.82861 47.36842 50.20243 50.33738 
##       35       40       45       50       51       55       59       60 
## 50.74224 55.33063 55.87045 59.37922 59.51417 59.78408 59.91903 63.69771 
##       65       66       70       72       75       80       82       85 
## 63.96761 64.10256 68.96086 69.09582 69.77058 70.04049 70.17544 76.65317 
##       90       95       96       98       99      100 
## 77.59784 78.00270 78.13765 78.27260 78.40756 89.33873

The average value for the variable V202144 is 35.0377644.

anes2020$subgroup <-paste(anes2020$V201509, anes2020$V201600, sep = "" )

summary(anes2020$subgroup)

##    Length     Class      Mode 
##       741 character character

anes2020$subgroupcat <-car::Recode(anes2020$ subgroup, recodes="'-11' = 'single men'; '-12' = 'single women'; '11' = 'Cohab men'; '12' = 'Cohab women'; '21' = 'NonCohab men'; '22' = 'NonCohab women'; else=NA", as.factor=T)


anes2020 %>% 
tabyl (subgroupcat)

##     subgroupcat   n     percent valid_percent
##       Cohab men  42 0.056680162    0.05698779
##     Cohab women  41 0.055330634    0.05563094
##    NonCohab men  88 0.118758435    0.11940299
##  NonCohab women 124 0.167341430    0.16824966
##      single men 223 0.300944669    0.30257802
##    single women 219 0.295546559    0.29715061
##            <NA>   4 0.005398111            NA

anes2020 %>%
tabyl(V201509,V201600,show_missing_levels=F,show_na = FALSE) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits=2) %>%
adorn_ns() %>%
knitr::kable()

V201509	1	2
-1	63.17% (223)	57.03% (219)
1	11.90% (42)	10.68% (41)
2	24.93% (88)	32.29% (124)

anes2020 %>%
tabyl(subgroup)

##  subgroup   n     percent
##       -11 223 0.300944669
##       -12 219 0.295546559
##      -1NA   2 0.002699055
##        11  42 0.056680162
##        12  41 0.055330634
##        21  88 0.118758435
##        22 124 0.167341430
##       2NA   2 0.002699055

anes2020 %>% 
ggplot(mapping=aes(y=V202144,x=subgroupcat))+ geom_boxplot()+
ggtitle(label="Distribution of Trump Feelings")+
xlab(label="Relationship status")+ 
ylab(label="Trump Approval Scale")

## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

## Warning: Removed 79 rows containing non-finite values (stat_boxplot).

anes2020 %>%
tabyl(V202468x)

##  V202468x  n    percent valid_percent
##         1 68 0.09176788    0.09315068
##         2 16 0.02159244    0.02191781
##         3 16 0.02159244    0.02191781
##         4 20 0.02699055    0.02739726
##         5 17 0.02294197    0.02328767
##         6 33 0.04453441    0.04520548
##         7 19 0.02564103    0.02602740
##         8 16 0.02159244    0.02191781
##         9 17 0.02294197    0.02328767
##        10 41 0.05533063    0.05616438
##        11 38 0.05128205    0.05205479
##        12 15 0.02024291    0.02054795
##        13 27 0.03643725    0.03698630
##        14 18 0.02429150    0.02465753
##        15 43 0.05802969    0.05890411
##        16 39 0.05263158    0.05342466
##        17 61 0.08232119    0.08356164
##        18 47 0.06342780    0.06438356
##        19 42 0.05668016    0.05753425
##        20 35 0.04723347    0.04794521
##        21 56 0.07557355    0.07671233
##        22 46 0.06207827    0.06301370
##        NA 11 0.01484480            NA

anes2020$houseinc <-as.numeric(anes2020$V202468x, recodes="1 ='5,000'; 2 ='12,000'; 3 ='17,000'; 4 ='22,000'; 5 ='27,000'; 6 ='32,000'; 7 ='37,000'; 8 ='42,000'; 9 ='47,000'; 10 ='52,000'; 11 ='62,000'; 12 ='67,000'; 13 ='72,000'; 14 ='77,000'; 15 ='85,000'; 16 ='95,000'; 17 ='105,000'; 18 ='115,000'; 19 ='135,000'; 20 ='165,000'; 21 ='200,000'; 22 ='250,000'; else=NA", as.factor=T)

scatter.smooth(anes2020$V202144,anes2020$houseinc)

cor(anes2020$V202144, anes2020$houseinc, use = "complete.obs")

## [1] -0.1058004

lmTrump = lm(V202144~houseinc, data = anes2020)
summary(lmTrump)

## 
## Call:
## lm(formula = V202144 ~ houseinc, data = anes2020)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.89 -32.38 -17.27  34.72  70.73 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42.4902     3.2031  13.265  < 2e-16 ***
## houseinc     -0.6009     0.2210  -2.719  0.00672 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.09 on 653 degrees of freedom
##   (86 observations deleted due to missingness)
## Multiple R-squared:  0.01119,    Adjusted R-squared:  0.009679 
## F-statistic: 7.392 on 1 and 653 DF,  p-value: 0.006725

anes2020 %>%
tabyl (V201231x)

##  V201231x   n    percent
##         1 158 0.21322537
##         2  93 0.12550607
##         3 113 0.15249663
##         4 109 0.14709852
##         5  80 0.10796221
##         6  69 0.09311741
##         7 119 0.16059379

anes2020 %>%
tabyl(V201231x,subgroupcat,show_missing_levels=F,show_na = FALSE) %>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits=2) %>%
adorn_ns() %>%
knitr::kable()

V201231x	Cohab men	Cohab women	NonCohab men	NonCohab women	single men	single women
1	21.43% (9)	24.39% (10)	27.27% (24)	27.42% (34)	15.70% (35)	21.00% (46)
2	9.52% (4)	14.63% (6)	10.23% (9)	21.77% (27)	6.28% (14)	15.07% (33)
3	16.67% (7)	19.51% (8)	19.32% (17)	12.90% (16)	16.59% (37)	12.79% (28)
4	14.29% (6)	14.63% (6)	20.45% (18)	16.94% (21)	13.90% (31)	10.50% (23)
5	21.43% (9)	4.88% (2)	4.55% (4)	6.45% (8)	15.25% (34)	10.50% (23)
6	2.38% (1)	9.76% (4)	10.23% (9)	6.45% (8)	12.56% (28)	8.68% (19)
7	14.29% (6)	12.20% (5)	7.95% (7)	8.06% (10)	19.73% (44)	21.46% (47)