Dalam proyek ini saya memberikan anda dataset insurance.csv, informasi lanjut mengenai data ini dapat anda baca di Kaggle.
Tugas kalian adalah sebagai berikut:
# Load the insurance data set from a comma-separated file with a header row.
# NOTE(review): the surrounding text calls the file "insurance.csv"; on a
# case-sensitive file system the capitalised name used here must match the
# file on disk -- confirm.
insurance <- read.csv("Insurance.csv", header = TRUE, sep = ",")
# Display the data with datatable(), relabelling the seven columns.
datatable(insurance, colnames = c('Age', 'Sex', 'BMI', 'Children', 'Smoker', 'Region', 'Charges'))
## [1] "age" "sex" "bmi" "children" "smoker" "region" "charges"
Dataset ini memiliki 7 variabel:
Kemudian kita harus melihat struktur dari dataset kita, apakah strukturnya sudah benar atau belum. Berikut ini adalah struktur dari dataset:
Dikarenakan struktur data yang saya miliki masih berantakan, kita harus merapikannya sesuai dengan identitas setiap variabel, apakah data tersebut numeric atau character, sehingga data bisa diolah dengan baik.
# Turn the two categorical columns into factors in one pass, then inspect the
# resulting column types.
# NOTE(review): `sex` is left as character (see the str() output below) --
# confirm whether it should be a factor as well.
insurance[c("smoker", "region")] <- lapply(
  insurance[c("smoker", "region")],
  as.factor
)
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
Dalam setiap proses pengolahan data, kita sering lalai memeriksa data yang hilang (missing values) sebelum melakukan proses analisis. Secara sederhana, pemeriksaan data NA dapat dilakukan dengan R sebagai berikut:
## age sex bmi children smoker region charges
## 0 0 0 0 0 0 0
Ternyata dari hasil pemeriksaan diatas sangat memuaskan karena tidak ada data yang hilang (missing value) pada dataset tersebut.
Untuk mengecek agar tidak adanya data yang sama maka kita lakukan perintah ini
# Compare the total row count with the number of distinct rows; a difference
# means the data contains exact duplicate rows.
data.frame(
  row_of_data = nrow(insurance),
  row_of_unique.data = nrow(distinct(insurance))
)
## row_of_data row_of_unique.data
## 1 1338 1337
Dikarenakan ada data yang sama, maka langkah selanjutnya adalah menghapus atau menghilangkan data yang sama dengan cara
Setelah itu kita cek kembali apakah datanya masih ada yang sama.
# Re-run the duplicate check: both counts should now agree.
data.frame(
  row_of_data = nrow(insurance),
  row_of_unique.data = nrow(distinct(insurance))
)
## row_of_data row_of_unique.data
## 1 1337 1337
Dengan tidak ada lagi data yang bermasalah, selanjutnya data yang sudah dibersihkan disimpan untuk proses berikutnya.
Jika ingin melihat ringkasan dari dataset ini, maka kita lakukan.
## age sex bmi children smoker
## Min. :18.00 Length:1337 Min. :15.96 Min. :0.000 no :1063
## 1st Qu.:27.00 Class :character 1st Qu.:26.29 1st Qu.:0.000 yes: 274
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.22 Mean :30.66 Mean :1.096
## 3rd Qu.:51.00 3rd Qu.:34.70 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:324 1st Qu.: 4746
## southeast:364 Median : 9386
## southwest:325 Mean :13279
## 3rd Qu.:16658
## Max. :63770
# Number of members at each individual age.
age_group <- summarise(group_by(insurance, age), total = n())

# Five-year age bands starting at 15; the lowest band is closed on both ends.
Agecut <- cut(
  age_group$age,
  breaks = c(seq(from = 15, to = 65, by = 5), Inf),
  include.lowest = TRUE
)

# Total members per age band (Agecut is looked up outside `age_group`).
agegroup <- aggregate(total ~ Agecut, data = age_group, FUN = sum)
# Age distribution of members: a frequency polygon is added first and a
# semi-transparent histogram (1-year bins) is layered on top of it.
# NOTE(review): recent ggplot2 releases reportedly deprecate `size` in
# element_rect()/element_line() (in favour of `linewidth`) and
# `guides(fill = FALSE)` (in favour of "none") -- confirm against the installed
# version before changing.
ggplot(insurance, aes(age)) +
geom_freqpoly(binwidth = 1, color = 'blue') +
geom_histogram(binwidth = 1, fill = 'red', alpha = .5) +
theme_linedraw() + # base theme
theme(panel.background = element_rect(fill = "gainsboro", colour = "white", size = 0.5, linetype = "solid"), # panel background
plot.background = element_rect(fill = "gainsboro"), # plot background
panel.grid.major = element_line(size = 0.5, linetype = 'solid', colour = "white"), # major grid lines
panel.grid.minor = element_line(size = 0.25, linetype = 'solid', colour = "white"), # minor grid lines
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'), # title settings
plot.subtitle = element_text(face = "italic"), # subtitle settings
plot.caption = element_text(size = 6, vjust = -1, face = "italic")) + # caption/credit settings
labs(x = 'Age', y = 'Frequency', title = "Member of Medical Cost Insurance", # title and axis labels
subtitle = "Medical insurance's member aggregated by age") +
# fill is a fixed colour here rather than a mapped aesthetic, so there is
# presumably no fill legend for this call to remove.
guides(fill=FALSE) + # remove color legend
scale_y_continuous(limits = c(0,80), breaks = c(0,20,40,60,80)) # set y-axis limits and breaks
# Finding: the age distribution of insurance members is roughly even, except
# for members aged 18 and 19, who are more numerous (more than those aged 60
# and over). The member age groups are also tabulated below.
## Agecut total
## 1 [15,20] 165
## 2 (20,25] 140
## 3 (25,30] 138
## 4 (30,35] 130
## 5 (35,40] 127
## 6 (40,45] 137
## 7 (45,50] 144
## 8 (50,55] 140
## 9 (55,60] 125
## 10 (60,65] 91
Hasil di atas juga memperlihatkan bahwa anggota pada kelompok usia [15,20] berjumlah 165 orang, lebih banyak daripada anggota pada kelompok usia (60,65] yang berjumlah 91 orang.
# Count members per sex and attach each group's share of the total.
sex <- insurance %>%
  group_by(sex) %>%
  summarise(total = n()) %>%
  mutate(percentage = paste0(round(100 * total / sum(total), 1), "%"))

# Pie chart of members by sex; the hover text shows the head-count per slice.
# NOTE(review): `colors` is not defined in the visible part of the file; unless
# a palette vector is assigned earlier, the name presumably resolves to the
# base function grDevices::colors -- confirm.
plot_ly(sex, labels = ~sex, values = ~total, type = 'pie',
  textposition = 'inside',
  textinfo = 'label+percent',
  insidetextfont = list(color = '#FFFFFF'),
  hoverinfo = 'text',
  text = ~paste(total, 'people'),
  marker = list(colors = colors,
    line = list(color = '#FFFFFF', width = 1)),
  showlegend = FALSE) %>%
  layout(
    # The title font is nested under `title`; the top-level `titlefont`
    # attribute is deprecated and used to raise a warning here.
    title = list(text = 'Gender of Medical Insurance Member',
      font = list(size = 18, color = 'black')),
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Jenis kelamin peserta asuransi hampir sama (laki-laki = 675 orang & perempuan = 662 orang).
# Count members per region and attach each group's share of the total.
reg <- insurance %>%
  group_by(region) %>%
  summarise(total = n()) %>%
  mutate(percentage = paste0(round(100 * total / sum(total), 1), "%"))

# Pie chart of members by region; the hover text shows the head-count per
# slice.
# NOTE(review): `colors` is not defined in the visible part of the file; unless
# a palette vector is assigned earlier, the name presumably resolves to the
# base function grDevices::colors -- confirm.
plot_ly(reg, labels = ~region, values = ~total, type = 'pie',
  textposition = 'inside',
  textinfo = 'label+percent',
  insidetextfont = list(color = '#FFFFFF'),
  hoverinfo = 'text',
  text = ~paste(total, 'people'),
  marker = list(colors = colors,
    line = list(color = '#FFFFFF', width = 1)),
  showlegend = FALSE) %>%
  layout(
    # The title font is nested under `title`; the top-level `titlefont`
    # attribute is deprecated and used to raise a warning here.
    title = list(text = 'Region of Medical Insurance Member',
      font = list(size = 18, color = 'black')),
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
Wilayah tempat tinggal anggota asuransi tersebar merata.
# Bar chart of smokers vs non-smokers, each bar labelled with its share of all
# members.
insurance %>%
  group_by(smoker) %>%
  summarise(total = n()) %>%
  mutate(
    percentage = paste0(round(100 * total / sum(total), 1), "%"),
    # Derive the label from the value itself so it cannot be mismatched if the
    # group order or the number of groups changes.
    annot = ifelse(smoker == "yes", "Smoker", "Non-Smoker")
  ) %>%
  ggplot(aes(x = annot, y = total, label = percentage, fill = annot)) +
  geom_bar(stat = "identity") +
  # percentage label above each bar
  geom_text(hjust = 0.5, vjust = -1, color = "black", fontface = "italic", size = 5) +
  theme_linedraw() +
  theme(
    panel.background = element_rect(fill = "gainsboro", colour = "white", size = 0.5, linetype = "solid"),
    plot.background = element_rect(fill = "gainsboro"),
    # the bars are already labelled on the x axis, so the fill legend is hidden
    legend.position = "none",
    legend.title = element_blank(),
    legend.background = element_rect(fill = "gainsboro", colour = "gainsboro", size = 0.5, linetype = "solid"),
    panel.grid.major = element_line(size = 0.5, linetype = 'solid', colour = "white"),
    panel.grid.minor = element_line(size = 0.25, linetype = 'solid', colour = "white"),
    plot.title = element_text(hjust = 0, face = 'bold', color = 'black'),
    plot.subtitle = element_text(face = "italic")
  ) +
  labs(x = '', y = '', title = "Member of Medical Insurance",
    subtitle = 'How many smokers registered as insurance member?') +
  # fixed y-axis range and tick marks
  scale_y_continuous(limits = c(0, 1500), breaks = c(0, 300, 600, 900, 1200, 1500))
# Finding: most members are non-smokers (79.5%, or 1063 people); the remaining
# 274 members are smokers.
# Count members per number of dependents and attach each group's share.
child <- insurance %>%
  group_by(children) %>%
  summarise(total = n()) %>%
  mutate(
    percentage = paste0(round(100 * total / sum(total), 1), "%"),
    # Build the label from the count itself; a fixed six-element vector would
    # fail (or mislabel) whenever a dependent count is absent from the data.
    annot = ifelse(
      children == 0, "Zero",
      ifelse(children == 1, "1 Child", paste(children, "Children"))
    )
  )

# Pie chart of members by number of dependents; the hover text shows the
# member count per slice.
# NOTE(review): `colors` is not defined in the visible part of the file; unless
# a palette vector is assigned earlier, the name presumably resolves to the
# base function grDevices::colors -- confirm.
plot_ly(child, labels = ~annot, values = ~total, type = 'pie',
  textposition = 'outside',
  textinfo = 'label+percent',
  insidetextfont = list(color = '#FFFFFF'),
  hoverinfo = 'text',
  text = ~paste(total, 'member'),
  marker = list(colors = colors,
    line = list(color = '#FFFFFF', width = 1)),
  showlegend = FALSE) %>%
  layout(
    # The title font is nested under `title`; the top-level `titlefont`
    # attribute is deprecated and used to raise a warning here.
    title = list(text = 'Number of Dependents From Medical Insurance Member',
      font = list(size = 18, color = 'black')),
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
# Histogram of members' body mass index, one BMI unit per bin.
# NOTE(review): recent ggplot2 releases reportedly deprecate `size` in
# element_rect()/element_line() (in favour of `linewidth`) and
# `guides(fill = FALSE)` (in favour of "none") -- confirm against the installed
# version before changing.
ggplot(insurance, aes(bmi)) +
geom_histogram(binwidth = 1, fill = 'red', alpha = .5) +
theme_linedraw() + # base theme
theme(panel.background = element_rect(fill = "gainsboro", colour = "white", size = 0.5, linetype = "solid"), # panel background
plot.background = element_rect(fill = "gainsboro"), # plot background
panel.grid.major = element_line(size = 0.5, linetype = 'solid', colour = "white"), # major grid lines
panel.grid.minor = element_line(size = 0.25, linetype = 'solid', colour = "white"), # minor grid lines
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'), # title settings
plot.subtitle = element_text(face = "italic"), # subtitle settings
plot.caption = element_text(size = 6, vjust = -1, face = "italic")) + # caption/credit settings
labs(x = 'Body Mass Index', y = 'Frequency', title = "Member of Medical Cost Insurance", # title and axis labels
subtitle = "Body mass index of medical insurance's member") +
# fill is a fixed colour here rather than a mapped aesthetic, so there is
# presumably no fill legend for this call to remove.
guides(fill=FALSE) + # remove color legend
scale_y_continuous(limits = c(0,120), breaks = c(0,20,40,60,80,100,120)) # set y-axis limits and breaks
# Finding (author's reading of the plot): members' body mass index is
# normally distributed.
# Histogram of billed medical charges in bins 2000 wide.
# NOTE(review): recent ggplot2 releases reportedly deprecate `size` in
# element_rect()/element_line() (in favour of `linewidth`) and
# `guides(fill = FALSE)` (in favour of "none") -- confirm against the installed
# version before changing.
ggplot(insurance, aes(charges)) +
geom_histogram(binwidth = 2000, fill = 'red', alpha = .5) +
theme_linedraw() + # base theme
theme(panel.background = element_rect(fill = "gainsboro", colour = "white", size = 0.5, linetype = "solid"), # panel background
plot.background = element_rect(fill = "gainsboro"), # plot background
panel.grid.major = element_line(size = 0.5, linetype = 'solid', colour = "white"), # major grid lines
panel.grid.minor = element_line(size = 0.25, linetype = 'solid', colour = "white"), # minor grid lines
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'), # title settings
plot.subtitle = element_text(face = "italic"), # subtitle settings
plot.caption = element_text(size = 6, vjust = -1, face = "italic")) + # caption/credit settings
labs(x = 'Charges', y = 'Frequency', title = "Medical Cost Insurance", # title and axis labels
subtitle = "Individual medical costs billed by health insurance") +
# fill is a fixed colour here rather than a mapped aesthetic, so there is
# presumably no fill legend for this call to remove.
guides(fill=FALSE) + # remove color legend
scale_y_continuous(limits = c(0,250), breaks = c(0,50,100,150,200,250)) # set y-axis limits and breaks
# Finding (author's reading of the plot): the distribution of individual
# medical costs billed by health insurance is positively skewed.
Saya membuat analisis kepadatan ini untuk menemukan variabel-variabel yang mempengaruhi biaya medis. Dari analisis ini (Anda perlu melihat grafik kepadatan), saya mendapatkan beberapa wawasan:
Jenis kelamin tidak berdampak pada asuransi biaya pengobatan karena laki-laki dan perempuan memiliki distribusi / kepadatan yang sama terhadap pungutan (lihat Bagan 5.1).
Daerah tidak terlalu berpengaruh terhadap asuransi biaya kesehatan karena 4 daerah hampir memiliki distribusi / kepadatan yang sama terhadap retribusi (lihat Grafik 5.2).
Perokok dan bukan perokok mempengaruhi asuransi biaya pengobatan karena distribusi / kepadatannya sangat berbeda (lihat Grafik 5.3).
Jumlah anak / tanggungan memiliki kepadatan yang sama terhadap pungutan, kecuali jumlah anak nol. Jadi, jika Anda tidak memiliki anak maka akan berdampak pada asuransi biaya kesehatan (lihat Bagan 5.4).
Berdasarkan wawasan tersebut dapat disimpulkan bahwa status merokok, jumlah tanggungan, umur, dan BMI merupakan variabel yang digunakan untuk memprediksi asuransi biaya pengobatan.
Dari persamaan untuk setiap kelompok, saya membuat fungsi untuk memprediksi biaya (charges). Fungsinya terlihat seperti:
#' Predict insurance charges from segment-specific linear equations
#'
#' Each row is assigned to one of eight segments by smoker status
#' (`"yes"` / `"no"`), dependents (`children == 0` vs `children > 0`) and BMI
#' (`bmi < 30` vs `bmi >= 30`). The four smoker segments use an intercept plus
#' age and BMI terms; the other segments use an intercept plus an age term
#' only. A row that matches none of the first seven segments uses the eighth
#' equation, mirroring a final catch-all `else` branch.
#'
#' The work is done on whole columns, so a zero-row input is returned with an
#' empty `result` column instead of raising an error.
#'
#' NOTE(review): this definition masks `stats::predict()` for any code that
#' runs after it; the name is kept because existing callers use it.
#'
#' @param x A data frame with columns `smoker`, `children`, `bmi` and `age`.
#' @return `x` with a numeric `result` column added (or overwritten). Rows
#'   with a missing `smoker`, `children`, `bmi` or `age` get `NA`.
predict <- function(x) {
  required <- c("smoker", "children", "bmi", "age")
  missing_cols <- setdiff(required, names(x))
  if (length(missing_cols) > 0) {
    stop(
      "`x` is missing column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  smoker <- as.character(x[["smoker"]])
  children <- x[["children"]]
  bmi <- x[["bmi"]]
  age <- x[["age"]]

  smokes <- smoker == "yes"
  abstains <- smoker == "no"
  childless <- children == 0
  has_children <- children > 0
  lean <- bmi < 30
  heavy <- bmi >= 30

  # One row per segment; columns are intercept, age slope, BMI slope.
  coefs <- rbind(
    c(-956.74, 251.20, 505.18), # 1: smoker, no children, BMI < 30
    c(8120.10, 292.16, 614.01), # 2: smoker, no children, BMI >= 30
    c(2428.48, 259.48, 359.27), # 3: smoker, children, BMI < 30
    c(16021.03, 253.72, 447.91), # 4: smoker, children, BMI >= 30
    c(-3239.15, 277.00, 0), # 5: non-smoker, no children, BMI < 30
    c(-2155.79, 254.01, 0), # 6: non-smoker, no children, BMI >= 30
    c(-884.08, 247.85, 0), # 7: non-smoker, children, BMI < 30
    c(-2161.36, 282.54, 0) # 8: everything else
  )

  # which() drops NA comparisons, so incomplete rows stay in segment 8 for now
  # and are blanked out explicitly below.
  segment <- rep(8L, nrow(x))
  segment[which(smokes & childless & lean)] <- 1L
  segment[which(smokes & childless & heavy)] <- 2L
  segment[which(smokes & has_children & lean)] <- 3L
  segment[which(smokes & has_children & heavy)] <- 4L
  segment[which(abstains & childless & lean)] <- 5L
  segment[which(abstains & childless & heavy)] <- 6L
  segment[which(abstains & has_children & lean)] <- 7L

  result <- coefs[segment, 1] + coefs[segment, 2] * age

  # Only the smoker segments carry a BMI term.
  uses_bmi <- segment <= 4L
  result[uses_bmi] <- result[uses_bmi] +
    coefs[segment[uses_bmi], 3] * bmi[uses_bmi]

  # A row whose segment cannot be determined must not silently fall through to
  # the catch-all equation.
  result[is.na(smoker) | is.na(children) | is.na(bmi)] <- NA_real_

  x[["result"]] <- result
  x
}
# The prediction results are shown in the table below.
predcharges <- predict(insurance)
datatable(predcharges, colnames = c('Age', 'Sex', 'BMI', 'Children', 'Smoker', 'Region', 'Charges', 'Charges Prediction'))

Jika suatu perusahaan di Amerika Serikat ingin mempekerjakan seseorang dari luar Amerika Serikat untuk posisi teknis, mereka perlu mengajukan aplikasi ke pemerintah Amerika Serikat untuk mendapatkan kartu hijau atau visa bagi pelamar asing. Untuk menunjukkan kesetaraan bagi karyawan AS dan non-AS, perusahaan perlu menyatakan seberapa banyak mereka bersedia membayar karyawan ketika mereka mengajukan permohonan visa atau kartu hijau. Selain itu, mereka perlu mencantumkan “prevailing wage”, yaitu jumlah rata-rata yang biasanya dibayarkan kepada seorang karyawan dengan keterampilan dan latar belakang serupa untuk posisi yang sama.
Perbedaan antara upah yang dibayar dan upah yang berlaku dapat menunjukkan apakah perusahaan AS bersedia membayar lebih banyak gaji kepada karyawan non-AS. Gaji lebih banyak untuk calon karyawan asing akan menarik. Selain itu, perlu diperhatikan bahwa untuk area dan pekerjaan yang berbeda, gaji dapat menunjukkan perbedaan. Oleh karena itu perlu untuk mencari tahu hubungan antara gaji, area dan posisi dapat membantu karyawan non-AS untuk memilih pekerjaan di AS.
Berdasarkan klasifikasi VISA yang mereka miliki disimpulkan bahwa ada lima jenis yang berbeda: “green card”, “H-1B”, “H-1B1 Chile”, “H-1B1 Singapore” dan “E-3 Australia”. Untuk proyek ini, silakan Anda memilih kelas VISA “H-1B” untuk mengolah data mentah pelamar yang berpenduduk tetap tahun 2018 atau 2019. Kalian dapat mengunduh data asli yang dikumpulkan oleh Kantor Sertifikasi Tenaga Kerja Asing Departemen Tenaga Kerja AS.
## This is how to convert and compress the data into Rds for faster input
# Data <- read.csv("H-1B_Disclosure_Data_FY2018_EOY.csv", header = TRUE, sep = ";")
# head(Data)
#
# saveRDS(Data, "H-1B_Disclosure_2018.Rds")
# Visa <- readRDS("H-1B_Disclosure_2018.Rds")
# identical(Data, Visa)
## 'data.frame': 654360 obs. of 52 variables:
## $ CASE_NUMBER : chr "I-200-18026-338377" "I-200-17296-353451" "I-200-18242-524477" "I-200-18070-575236" ...
## $ CASE_STATUS : chr "CERTIFIED" "CERTIFIED" "CERTIFIED" "CERTIFIED" ...
## $ CASE_SUBMITTED : chr "29/01/2018" "23/10/2017" "30/08/2018" "" ...
## $ DECISION_DATE : chr "02/02/2018" "27/10/2017" "06/09/2018" "30/03/2018" ...
## $ VISA_CLASS : chr "H-1B" "H-1B" "H-1B" "H-1B" ...
## $ EMPLOYMENT_START_DATE : chr "28/07/2018" "06/11/2017" "10/09/2018" "10/09/2018" ...
## $ EMPLOYMENT_END_DATE : chr "27/07/2021" "06/11/2020" "09/09/2021" "09/09/2021" ...
## $ EMPLOYER_NAME : chr "MICROSOFT CORPORATION" "ERNST & YOUNG U.S. LLP" "LOGIXHUB LLC" "HEXAWARE TECHNOLOGIES, INC." ...
## $ EMPLOYER_BUSINESS_DBA : chr "" "" "" "N/A" ...
## $ EMPLOYER_ADDRESS : chr "1 MICROSOFT WAY" "200 PLAZA DRIVE" "320 DECKER DRIVE" "101 WOOD AVENUE SOUTH" ...
## $ EMPLOYER_CITY : chr "REDMOND" "SECAUCUS" "IRVING" "ISELIN" ...
## $ EMPLOYER_STATE : chr "WA" "NJ" "TX" "NJ" ...
## $ EMPLOYER_POSTAL_CODE : chr "98052" "7094" "75062" "8830" ...
## $ EMPLOYER_COUNTRY : chr "UNITED STATES OF AMERICA" "UNITED STATES OF AMERICA" "UNITED STATES OF AMERICA" "UNITED STATES OF AMERICA" ...
## $ EMPLOYER_PROVINCE : chr "" "" "" "N/A" ...
## $ EMPLOYER_PHONE : chr "4258828080" "2018723003" "2145419305" "6094096950" ...
## $ EMPLOYER_PHONE_EXT : chr "" "" "" "" ...
## $ AGENT_REPRESENTING_EMPLOYER: chr "N" "Y" "N" "Y" ...
## $ AGENT_ATTORNEY_NAME : chr "," "BRADSHAW, MELANIE" "," "DUTOT, CHRISTOPHER" ...
## $ AGENT_ATTORNEY_CITY : chr "" "TORONTO" "" "TROY" ...
## $ AGENT_ATTORNEY_STATE : chr "" "" "" "MI" ...
## $ JOB_TITLE : chr "SOFTWARE ENGINEER" "TAX SENIOR" "DATABASE ADMINISTRATOR" "SOFTWARE ENGINEER" ...
## $ SOC_CODE : chr "15-1132" "13-2011" "15-1141" "15-1132" ...
## $ SOC_NAME : chr "SOFTWARE DEVELOPERS, APPLICATIONS" "ACCOUNTANTS AND AUDITORS" "DATABASE ADMINISTRATORS" "SOFTWARE DEVELOPERS, APPLICATIONS" ...
## $ NAICS_CODE : int 51121 541211 541511 541511 541511 541511 541511 541512 541511 541511 ...
## $ TOTAL_WORKERS : int 1 1 1 5 1 1 1 1 1 1 ...
## $ NEW_EMPLOYMENT : int 0 0 0 5 0 0 0 0 0 0 ...
## $ CONTINUED_EMPLOYMENT : int 1 0 0 0 0 0 1 1 1 0 ...
## $ CHANGE_PREVIOUS_EMPLOYMENT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NEW_CONCURRENT_EMP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CHANGE_EMPLOYER : int 0 1 1 0 0 0 0 0 0 1 ...
## $ AMENDED_PETITION : int 0 0 0 0 1 1 0 0 0 0 ...
## $ FULL_TIME_POSITION : chr "Y" "Y" "Y" "Y" ...
## $ PREVAILING_WAGE : chr "112.549" "79.976" "77.792" "84.406" ...
## $ PW_UNIT_OF_PAY : chr "Year" "Year" "Year" "Year" ...
## $ PW_WAGE_LEVEL : chr "Level II" "Level II" "Level II" "Level II" ...
## $ PW_SOURCE : chr "OES" "OES" "OES" "OES" ...
## $ PW_SOURCE_YEAR : chr "2017" "2017" "2018" "2017" ...
## $ PW_SOURCE_OTHER : chr "OFLC ONLINE DATA CENTER" "OFLC ONLINE DATA CENTER" "OFLC ONLINE DATA CENTER" "OFLC ONLINE DATA CENTER" ...
## $ WAGE_RATE_OF_PAY_FROM : chr "143.915,00" "100.000,00" "78.240,00" "84.406,00" ...
## $ WAGE_RATE_OF_PAY_TO : chr "0" "0" "0" "85.000,00" ...
## $ WAGE_UNIT_OF_PAY : chr "Year" "Year" "Year" "Year" ...
## $ H1B_DEPENDENT : chr "N" "N" "N" "Y" ...
## $ WILLFUL_VIOLATOR : chr "N" "N" "N" "N" ...
## $ SUPPORT_H1B : chr NA NA NA "Y" ...
## $ LABOR_CON_AGREE : chr "" "" "" "" ...
## $ PUBLIC_DISCLOSURE_LOCATION : logi NA NA NA NA NA NA ...
## $ WORKSITE_CITY : chr "REDMOND" "SANTA CLARA" "IRVING" "NEW CASTLE" ...
## $ WORKSITE_COUNTY : chr "KING" "SAN JOSE" "DALLAS" "NEW CASTLE" ...
## $ WORKSITE_STATE : chr "WA" "CA" "TX" "DE" ...
## $ WORKSITE_POSTAL_CODE : chr "98052" "95110" "75062" "19720" ...
## $ ORIGINAL_CERT_DATE : chr "" "" "" "" ...
## CASE_NUMBER CASE_STATUS
## 0 0
## CASE_SUBMITTED DECISION_DATE
## 0 0
## VISA_CLASS EMPLOYMENT_START_DATE
## 0 0
## EMPLOYMENT_END_DATE EMPLOYER_NAME
## 0 0
## EMPLOYER_BUSINESS_DBA EMPLOYER_ADDRESS
## 2641 0
## EMPLOYER_CITY EMPLOYER_STATE
## 0 0
## EMPLOYER_POSTAL_CODE EMPLOYER_COUNTRY
## 0 0
## EMPLOYER_PROVINCE EMPLOYER_PHONE
## 1602 0
## EMPLOYER_PHONE_EXT AGENT_REPRESENTING_EMPLOYER
## 35 0
## AGENT_ATTORNEY_NAME AGENT_ATTORNEY_CITY
## 0 0
## AGENT_ATTORNEY_STATE JOB_TITLE
## 0 0
## SOC_CODE SOC_NAME
## 0 0
## NAICS_CODE TOTAL_WORKERS
## 6 0
## NEW_EMPLOYMENT CONTINUED_EMPLOYMENT
## 0 0
## CHANGE_PREVIOUS_EMPLOYMENT NEW_CONCURRENT_EMP
## 0 0
## CHANGE_EMPLOYER AMENDED_PETITION
## 0 0
## FULL_TIME_POSITION PREVAILING_WAGE
## 0 0
## PW_UNIT_OF_PAY PW_WAGE_LEVEL
## 0 0
## PW_SOURCE PW_SOURCE_YEAR
## 0 0
## PW_SOURCE_OTHER WAGE_RATE_OF_PAY_FROM
## 19 0
## WAGE_RATE_OF_PAY_TO WAGE_UNIT_OF_PAY
## 0 0
## H1B_DEPENDENT WILLFUL_VIOLATOR
## 0 0
## SUPPORT_H1B LABOR_CON_AGREE
## 374615 0
## PUBLIC_DISCLOSURE_LOCATION WORKSITE_CITY
## 654360 0
## WORKSITE_COUNTY WORKSITE_STATE
## 2 0
## WORKSITE_POSTAL_CODE ORIGINAL_CERT_DATE
## 1 0
## jumlah.seluruh.data jumlah.data.unik
## 1 654360 654360
# Pull the four columns of interest out of the visa data.
# NOTE(review): `area` is the employer's state; the data also has
# WORKSITE_STATE, which differs for some rows -- confirm which one is meant.
position <- Visa[["JOB_TITLE"]]
wage <- Visa[["PREVAILING_WAGE"]]
wage_unit <- Visa[["PW_UNIT_OF_PAY"]]
area <- Visa[["EMPLOYER_STATE"]]

# Bind them column-wise, convert to a data frame and give the columns
# display names.
# NOTE(review): wage strings such as "112.549" look like they use "." as a
# thousands separator; a plain numeric conversion would read them as ~112
# rather than 112549 -- confirm before analysing wages.
main <- as.data.frame(cbind(position, wage, wage_unit, area))
names(main) <- c("Position", "Wage", "Unit", "Area")
head(main)
## Position Wage Unit Area
## 1 SOFTWARE ENGINEER 112.549 Year WA
## 2 TAX SENIOR 79.976 Year NJ
## 3 DATABASE ADMINISTRATOR 77.792 Year TX
## 4 SOFTWARE ENGINEER 84.406 Year NJ
## 5 MICROSOFT DYNAMICS CRM APPLICATION DEVELOPER 87.714 Year NJ
## 6 SENIOR SYSTEM ARCHITECT 71.864 Year TX
## 'data.frame': 654360 obs. of 4 variables:
## $ Position: chr "SOFTWARE ENGINEER" "TAX SENIOR" "DATABASE ADMINISTRATOR" "SOFTWARE ENGINEER" ...
## $ Wage : chr "112.549" "79.976" "77.792" "84.406" ...
## $ Unit : chr "Year" "Year" "Year" "Year" ...
## $ Area : chr "WA" "NJ" "TX" "NJ" ...
## Warning: NAs introduced by coercion
## Position Wage Unit
## SOFTWARE DEVELOPER : 34907 Min. : 0.00 : 57
## SOFTWARE ENGINEER : 31943 1st Qu.: 64.00 Bi-Weekly: 44
## PROGRAMMER ANALYST : 14109 Median : 80.00 Hour : 44899
## SENIOR SOFTWARE ENGINEER : 8430 Mean : 82.66 Month : 285
## SENIOR SYSTEMS ANALYST JC60: 7041 3rd Qu.: 97.00 Week : 96
## DEVELOPER : 6244 Max. :960.00 Year :608979
## (Other) :551686 NA's :4
## Area
## CA :113307
## NJ : 83892
## TX : 79902
## NY : 45783
## IL : 39231
## PA : 35906
## (Other):256339