library(readr)
urlfile<-"https://raw.githubusercontent.com/alnoel/CUNYSPS-Data607/main/globaldataset_20200414.csv"
global<-read_csv(url(urlfile))
## Rows: 48801 Columns: 64
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): Datasource, gender, ageBroad, majorityStatus, majorityStatusAtExpl...
## dbl (50): yearOfRegistration, meansOfControlDebtBondage, meansOfControlTakes...
## lgl (1): By using this data you agree to the Terms of Use: https://www.ctda...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
#subsetting the data to contain only the variables related to means of control. The
control<-global[grepl("meansOfControl",names(global))]
control<-control[-18:-19]
#dataframe "tally" is created to tally up incidents per means of control
tally<-control%>%
gather(x,value,meansOfControlDebtBondage:meansOfControlOther)%>%
group_by(x)%>%
tally(value==1)
#finding location in dataframe "tally" of max incidents
loc<-which(tally==max(tally[,2]),arr.ind = TRUE)
#answer to question 1: Most used means of control
control_max_tally<-tally[loc[1,1],1]
#The results below shows that Psychological Abuse is the means of control most used now.
control_max_tally
## # A tibble: 1 × 1
## x
## <chr>
## 1 meansOfControlPsychologicalAbuse
library(tidyr)
control$gender<-global$gender
control
## # A tibble: 48,801 × 18
## meansOfControlDebtBondage meansOfControlTa… meansOfControlR… meansOfControlT…
## <dbl> <dbl> <dbl> <dbl>
## 1 -99 -99 -99 -99
## 2 -99 -99 -99 -99
## 3 -99 -99 -99 -99
## 4 -99 -99 -99 -99
## 5 -99 -99 -99 -99
## 6 -99 -99 -99 -99
## 7 -99 -99 -99 -99
## 8 -99 -99 -99 -99
## 9 -99 -99 -99 -99
## 10 -99 -99 -99 -99
## # … with 48,791 more rows, and 14 more variables:
## # meansOfControlPsychologicalAbuse <dbl>, meansOfControlPhysicalAbuse <dbl>,
## # meansOfControlSexualAbuse <dbl>, meansOfControlFalsePromises <dbl>,
## # meansOfControlPsychoactiveSubstances <dbl>,
## # meansOfControlRestrictsMovement <dbl>,
## # meansOfControlRestrictsMedicalCare <dbl>,
## # meansOfControlExcessiveWorkingHours <dbl>, …
#First, checking to see how many distinct values exist in dataframe. In this data set, data of Male and Female exist.
unique(control[c("gender")])
## # A tibble: 2 × 1
## gender
## <chr>
## 1 Female
## 2 Male
#transform dataframe to long form
control_long<-control%>%
pivot_longer(meansOfControlDebtBondage:meansOfControlOther,names_to="Control Type",values_to="occurence")
#summarizes two more columns that meet the conditional statements... female or male and has an occurence of 1
tally_gender<-control_long%>%
group_by(`Control Type`)%>%
summarise(
female=sum(gender=="Female" & occurence==1),
male=sum(gender=="Male" & occurence==1)
)
tally_gender
## # A tibble: 17 × 3
## `Control Type` female male
## <chr> <int> <int>
## 1 meansOfControlDebtBondage 828 453
## 2 meansOfControlExcessiveWorkingHours 856 1301
## 3 meansOfControlFalsePromises 1353 1403
## 4 meansOfControlOther 1762 111
## 5 meansOfControlPhysicalAbuse 2982 650
## 6 meansOfControlPsychoactiveSubstances 2224 58
## 7 meansOfControlPsychologicalAbuse 3549 1296
## 8 meansOfControlRestrictsFinancialAccess 116 0
## 9 meansOfControlRestrictsMedicalCare 582 701
## 10 meansOfControlRestrictsMovement 3231 1184
## 11 meansOfControlSexualAbuse 1548 46
## 12 meansOfControlTakesEarnings 1444 1332
## 13 meansOfControlThreatOfLawEnforcement 389 390
## 14 meansOfControlThreats 2941 1031
## 15 meansOfControlUsesChildren 117 3
## 16 meansOfControlWithholdsDocuments 1024 1052
## 17 meansOfControlWithholdsNecessities 798 571
#finds the location of max occurence and returns the control type
loc_female_max<-which(tally_gender==max(tally_gender[,2]),arr.ind = TRUE)
female_max<-tally_gender[loc_female_max[1],1]
loc_male_max<-which(tally_gender==max(tally_gender[,3]),arr.ind = TRUE)
male_max<-tally_gender[loc_male_max[1],1]
#The results below shows that Psychological Abuse has the highest tally for women out of all means of control.
female_max
## # A tibble: 1 × 1
## `Control Type`
## <chr>
## 1 meansOfControlPsychologicalAbuse
#The results below shows that False Promises has the highest tally for men out of all means of control.
male_max
## # A tibble: 1 × 1
## `Control Type`
## <chr>
## 1 meansOfControlFalsePromises
https://www.pewforum.org/religious-landscape-study/income-distribution/#religious-tradition
1.Currently the data reports the proportion of people per income range identify with a Religiuos group. The data can tidyed in a different way in order to assess what proportion of people per religious group fits into a certain income category. Out of each religious group total, which income range has the greatest proportion of people?
library(readr)
library(tidyr)
library(stringr)
urlfile<-"https://raw.githubusercontent.com/catcho1632/607-project-2/main/income_religion.csv"
religion<-read_csv(url(urlfile))
## Rows: 12 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Religious tradition, Less than $30,000, $30,000-$49,999, $50,000-$9...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#converting characters to numeric values
religion$`Less than $30,000`<-as.numeric(str_replace_all(religion$`Less than $30,000`,"%",""))/100
religion$`$30,000-$49,999`<-as.numeric(str_replace_all(religion$`$30,000-$49,999`,"%",""))/100
religion$`$50,000-$99,999`<-as.numeric(str_replace_all(religion$`$50,000-$99,999`,"%",""))/100
religion$`$100,000 or more`<-as.numeric(str_replace_all(religion$`$100,000 or more`,"%",""))/100
#converting percentages to frequency
religion$`Less than $30,000`<-religion$`Less than $30,000`*religion$`Sample Size`
religion$`$30,000-$49,999`<-religion$`$30,000-$49,999`*religion$`Sample Size`
religion$`$50,000-$99,999`<-religion$`$50,000-$99,999`*religion$`Sample Size`
religion$`$100,000 or more`<-religion$`$100,000 or more`*religion$`Sample Size`
#converting to long form
religion_long<-religion%>%
pivot_longer(`Less than $30,000`:`$100,000 or more`,names_to="salary_range",values_to="frequency")
#converting to wide form to "transpose" the dataframe
religion_wide<-religion_long%>%
pivot_wider(names_from = `Religious tradition`,values_from=frequency)
religion_wide<-select(religion_wide,-1)
#transposed dataframe
religion_wide
## # A tibble: 48 × 13
## salary_range Buddhist Catholic `Evangelical Protestant` Hindu `Historically B…
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Less than $30,000 83.9 NA NA NA NA
## 2 $30,000-$49,999 41.9 NA NA NA NA
## 3 $50,000-$99,999 74.6 NA NA NA NA
## 4 $100,000 or more 30.3 NA NA NA NA
## 5 Less than $30,000 NA 2209. NA NA NA
## 6 $30,000-$49,999 NA 1166. NA NA NA
## 7 $50,000-$99,999 NA 1596. NA NA NA
## 8 $100,000 or more NA 1166. NA NA NA
## 9 Less than $30,000 NA NA 2612. NA NA
## 10 $30,000-$49,999 NA NA 1642. NA NA
## # … with 38 more rows, and 7 more variables: Jehovah's Witness <dbl>,
## # Jewish <dbl>, Mainline Protestant <dbl>, Mormon <dbl>, Muslim <dbl>,
## # Orthodox Christian <dbl>, Unaffiliated (religious "nones") <dbl>
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
religion_t<-data.table(religion_wide)[,lapply(.SD,function(x) x[order(is.na(x))])]
religion_t<-religion_t[1:4,]
religion_t
## salary_range Buddhist Catholic Evangelical Protestant Hindu
## 1: Less than $30,000 83.88 2209.32 2611.70 29.24
## 2: $30,000-$49,999 41.94 1166.03 1641.64 22.36
## 3: $50,000-$99,999 74.56 1595.62 2089.36 58.48
## 4: $100,000 or more 30.29 1166.03 1044.68 61.92
## Historically Black Protestant Jehovah's Witness Jewish Mainline Protestant
## 1: 903.12 99.84 113.28 1510.32
## 2: 374.88 52.00 106.20 1041.60
## 3: 289.68 45.76 169.92 1458.24
## 4: 136.32 8.32 311.52 1197.84
## Mormon Muslim Orthodox Christian Unaffiliated (religious "nones")
## 1: 160.38 69.70 27.90 2240.7
## 2: 118.80 34.85 26.35 1358.0
## 3: 196.02 59.45 55.80 1765.4
## 4: 118.80 41.00 44.95 1425.9
#Adding updated Sample Size column
religion_t$row_sum=rowSums(religion_t[,c(-1)])
#calculating proportion per salary_range
religion_final<-religion_t%>%
mutate(
across(c(2:13),
.fns=~./row_sum)
)
religion_final
## salary_range Buddhist Catholic Evangelical Protestant Hindu
## 1: Less than $30,000 0.008338486 0.2196278 0.2596283 0.002906740
## 2: $30,000-$49,999 0.007007929 0.1948368 0.2743084 0.003736225
## 3: $50,000-$99,999 0.009488069 0.2030493 0.2658797 0.007441823
## 4: $100,000 or more 0.005420961 0.2086828 0.1869650 0.011081740
## Historically Black Protestant Jehovah's Witness Jewish
## 1: 0.08977889 0.009925065 0.01126113
## 2: 0.06264025 0.008688896 0.01774540
## 3: 0.03686298 0.005823150 0.02162302
## 4: 0.02439701 0.001489019 0.05575232
## Mainline Protestant Mormon Muslim Orthodox Christian
## 1: 0.1501405 0.01594333 0.006928856 0.002773531
## 2: 0.1740453 0.01985078 0.005823231 0.004402931
## 3: 0.1855671 0.02494436 0.007565259 0.007100781
## 4: 0.2143758 0.02126148 0.007337716 0.008044642
## Unaffiliated (religious "nones") row_sum
## 1: 0.2227473 10059.38
## 2: 0.2269139 5984.65
## 3: 0.2246545 7858.29
## 4: 0.2551914 5587.57
#The following table can be further manipulated but now the table can be analyzed per income group rather than the income distribution within each group.The max proportion associated to each income group is listed in row_max. It appears that majority of the income groups fit into the evangelical protestant religion groups except for the highest earning group of $100,000 or more. In this case, the unaffiliated group has the majority.
library(matrixStats)
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
religion_final$row_max=rowMaxs(as.matrix(religion_final[,c(-1,-14)]))
religion_final
## salary_range Buddhist Catholic Evangelical Protestant Hindu
## 1: Less than $30,000 0.008338486 0.2196278 0.2596283 0.002906740
## 2: $30,000-$49,999 0.007007929 0.1948368 0.2743084 0.003736225
## 3: $50,000-$99,999 0.009488069 0.2030493 0.2658797 0.007441823
## 4: $100,000 or more 0.005420961 0.2086828 0.1869650 0.011081740
## Historically Black Protestant Jehovah's Witness Jewish
## 1: 0.08977889 0.009925065 0.01126113
## 2: 0.06264025 0.008688896 0.01774540
## 3: 0.03686298 0.005823150 0.02162302
## 4: 0.02439701 0.001489019 0.05575232
## Mainline Protestant Mormon Muslim Orthodox Christian
## 1: 0.1501405 0.01594333 0.006928856 0.002773531
## 2: 0.1740453 0.01985078 0.005823231 0.004402931
## 3: 0.1855671 0.02494436 0.007565259 0.007100781
## 4: 0.2143758 0.02126148 0.007337716 0.008044642
## Unaffiliated (religious "nones") row_sum row_max
## 1: 0.2227473 10059.38 0.2596283
## 2: 0.2269139 5984.65 0.2743084
## 3: 0.2246545 7858.29 0.2658797
## 4: 0.2551914 5587.57 0.2551914
The West Nile Virus dataset collected in the US by the CDC is summarized in the table below. Does more Neuroinvasive cases or Non-neuroinvasive cases lead to death?
library(stringr)
urlfile<-"https://raw.githubusercontent.com/catcho1632/607-project-2/main/West_Nile_Virus.csv"
virus<-read_csv(url(urlfile))
## New names:
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * `` -> ...7
## Rows: 24 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Neuroinvasive disease, Non-neuroinvasive disease, Total, ...4, ...5...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
virus<-virus[3:23,1:5]
colnames(virus)<-c("year","Neuroinvasive_cases","Neuroinvasive_deaths","Non-Neuroinvasive_cases","Non-Neuroinvasive_deaths")
#String extract using regex for Neuroinvasive cases
virus$Neuroinvasive_cases<-str_replace_all(virus$Neuroinvasive_cases,",","")
virus$Neuroinvasive_cases<-as.numeric(str_extract_all(virus$Neuroinvasive_cases,"\\d+",simplify=TRUE))
#String extract using regex for Neuroinvasive deaths
virus$Neuroinvasive_deaths<-str_replace_all(virus$Neuroinvasive_deaths,"\\((\\d\\d|\\d)\\)","")
virus$Neuroinvasive_deaths<-str_extract_all(virus$Neuroinvasive_deaths,"\\d+|\\d",simplify=TRUE)
virus$Neuroinvasive_deaths<-as.numeric(unlist(str_extract(virus$Neuroinvasive_deaths, "\\d+|\\d")))
#String extract using regex for Non-Neuroinvasive cases
virus$`Non-Neuroinvasive_cases`<-str_replace_all(virus$`Non-Neuroinvasive_cases`,",","")
virus$`Non-Neuroinvasive_cases`<-as.numeric(str_extract_all(virus$`Non-Neuroinvasive_cases`,"\\d+|\\d",simplify = TRUE))
#String extract using regex for Non-Neuroinvasive deaths
virus$`Non-Neuroinvasive_deaths`<-str_replace_all(virus$`Non-Neuroinvasive_deaths`,"\\((\\d\\d|\\d)\\)","")
virus$`Non-Neuroinvasive_deaths`<-str_replace_all(virus$`Non-Neuroinvasive_deaths`,"\\(\\<\\d\\)","")
virus$`Non-Neuroinvasive_deaths`<-str_extract_all(virus$`Non-Neuroinvasive_deaths`,"\\d+|\\d",simplify=TRUE)
virus$`Non-Neuroinvasive_deaths`<-as.numeric(unlist(str_extract(virus$`Non-Neuroinvasive_deaths`, "\\d+|\\d")))
#converting deaths to proportions of cases that lead to death
virus$Neuroinvasive_deaths<-virus$Neuroinvasive_deaths/virus$Neuroinvasive_cases
virus$`Non-Neuroinvasive_deaths`<-virus$`Non-Neuroinvasive_deaths`/virus$`Non-Neuroinvasive_cases`
#The proportions are weighted per cases in a year
virus$Neuro_x_weight<-virus$Neuroinvasive_deaths*virus$Neuroinvasive_cases
virus$Non_Neuro_x_weight<-virus$`Non-Neuroinvasive_deaths`*virus$`Non-Neuroinvasive_cases`
Neuro_weighted_avg<-round(sum(virus$Neuro_x_weight)/sum(virus$Neuroinvasive_cases)*100)
Non_Neuro_weighted_avg<-round(sum(virus$Non_Neuro_x_weight)/sum(virus$`Non-Neuroinvasive_cases`)*100)
Neuro_weighted_avg
## [1] 9
Non_Neuro_weighted_avg
## [1] 0
#It is apparent that more neuroinvasive cases lead to death than non-neuroinvasive cases. It appears that neuroinvasive cases recover 90% of the time and 99.5% of the time non-neuroinvasive cases.