library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(readxl)
path = dir("D:/TKT/RImportData", full.names = TRUE)
data_path = path[str_detect(path, "SME")] 
data11 = read.csv("D:/TKT/RImportData/SME2011.xlsx")
data13 = read.csv("D:/TKT/RImportData/SME2013.xlsx")
#Tim tat ca cac file co ten chua "SME" trong file RImportData
str(data_path) 
##  chr [1:5] "D:/TKT/RImportData/SME2011.xlsx" ...
#Xem file
View(data11)
View(data13)
#Xem qua du lieu cho thay cac cot bien sap xep rat lon xon, khong the ghep file ngay
#Xem su trung khop cua cac cot
dim(data11)
## [1] 2656  120
dim(data13)
## [1] 2575  125
c11 = c(colnames(data11))
c13 = c(colnames(data13))
c = 0
meg = function(colna1, colna2) {
  xl = length(colna1)
  yl = length(colna2)
  if (xl < yl) {l = xl} else {l = yl}
  for (i in 15:l) {
    x = colna1[i]
    y = colna2[i - 1]
    if (x == y) { c = c(c, colna1[i])} else {c = c(c, as.character(i))}
  }
  return(c)
}
meg(c11, c13)
##   [1] "0"                 "CEOethnic"         "CEOeducation"     
##   [4] "CEOprofessional"   "CEOexperience"     "CEOpartymember"   
##   [7] "machinetype"       "machine3year"      "machine3to5year"  
##  [10] "machine6to10year"  "machine11to20year" "machine20year"    
##  [13] "outsource"         "subcontractor"     "certificate"      
##  [16] "advertise"         "advertisespent"    "salehousehold"    
##  [19] "saletourist"       "salegovern"        "saledomestic"     
##  [22] "salestate"         "saleFDI"           "saleexport"       
##  [25] "export"            "exporttype"        "exportratio"      
##  [28] "china"             "japan"             "asean"            
##  [31] "asian"             "us"                "eu"               
##  [34] "russia"            "nonasia"           "costmarketing"    
##  [37] "servicepurchase"   "investtotal"       "investland"       
##  [40] "investbuilding"    "investequip"       "investRD"         
##  [43] "investtrain"       "investpatent"      "investfirm"       
##  [46] "investothers"      "assets"            "debt"             
##  [49] "labortotal"        "manager"           "professional"     
##  [52] "productionworker"  "66"                "67"               
##  [55] "networkbusiness1"  "networkbusiness2"  "networkbank"      
##  [58] "networkpolitician" "networkother"      "networktotal"     
##  [61] "association"       "newproduct"        "motivation1"      
##  [64] "modification"      "motivation2"       "newprocess"       
##  [67] "motivation3"       "barrierinnovation" "constraintgrowth" 
##  [70] "governtax"         "governloan"        "governtraining"   
##  [73] "governtrade"       "governquality"     "governother"      
##  [76] "89"                "90"                "91"               
##  [79] "92"                "93"                "94"               
##  [82] "95"                "96"                "97"               
##  [85] "98"                "99"                "100"              
##  [88] "101"               "102"               "103"              
##  [91] "104"               "105"               "106"              
##  [94] "107"               "108"               "109"              
##  [97] "110"               "111"               "112"              
## [100] "113"               "114"               "115"              
## [103] "116"               "117"               "118"              
## [106] "119"               "120"
#Hop nhat du lieu
data11_13 = read_excel("D:/TKT/RImportData/SME2011_2013.xlsx")
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C1635 / R1635C3: got 'Kh«ng cã th«ng tin'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C4803 / R4803C3: got 'Kh«ng cã th«ng tin'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C5067 / R5067C3: got 'Kh«ng cã th«ng tin'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in C5167 / R5167C3: got 'Kh«ng cã th«ng tin'
data11_13 = as.data.frame(data11_13)
#Chuan hoa firmid
colna = colnames(data11_13)
rna = c(4, 4)
dem = 0
dim(data11_13)
## [1] 5229  114
for (i in 2:5228) {
  x = data11_13$firmid[i]
  y = data11_13$firmid[i + 1]
  if (x < y) {
    dem = 0
    dem = dem + 1
    rna = c(rna, y)
  } else if (dem < 2) {
    dem = dem + 1
    rna = c(rna, y)
  }
}
#x la data.frame chua firmid chuan hoa
x = data.frame(firmid = rna)

#Chuan hoa data theo firmid, moi firmid xuat hien nhieu nhat 2 lan, neu qua thi xoa
dem = 0
mydata = data11_13
for (i in 2:5228) {
  x = data11_13$firmid[i]
  y = data11_13$firmid[i + 1]
  z = i + 1
  if (x < y) {
    dem = 0
    dem = dem + 1
  } else if (dem < 2) {
    dem = dem + 1
  } else (mydata = mydata[- z, ])
}
dim(mydata) #Co the thay so dong cua mydata trung voi firmid chuan
## [1] 5109  114
#Xem du lieu da duoc xu li
View(mydata)

#Xuat du lieu duoi dang file excel 
setwd("D:/")
write_excel_csv(mydata, "SME2011_2013_edited.xlsx")