At present the code below deals with an SAWS table-format dataset with the following variables: * Daily Maximum Temperature * Daily Minimum Temperature * Daily Rain

If you have other variables in the file, it should be set to NULL. Please let me know the format so that I can add it to this code. (I’ve just based this on a file that I have at hand.)

Missing data where the weather station was offline are coded as -999 not as NA. I repeat, -999 means there was a weather station issue, whereas NA just means that no data were collected on that day (such as rain as they don’t put 0’s in there dataset).

This is not elegant code. I’ve used base R just so that it should be implementable irrespective of updates beyond the base (and hopefully the base too). I have not included all the station info as it’s not relevant to me. If it’s relevant to you, let me know and I’ll update the code accordingly. Or you could just open the darn file and get it, so don’t ask me rewrite code to get info for 10 stations.

# STEP 1 (outside R): Open the .xls or .xlsx your recieved and save it as a .csv file. 
#                     Open the file in Notepad and make sure there are "," between the 
#                     numbers and not ";". (Microsoft have default a comma-separated file
#                     to be a semicolon separated file for some bizarre reason)
# STEP 2: Put the file directory, file input name and file output name into 
#         CHUNK 1
# STEP 3: RUN CHUNK 2

################################################################################
### CHUNK 1: YOUR JOB
# You need to change each line that ends in #<---
# General directory
wd <- "D:\\Dropbox\\100_PROJECTS\\SAWS.Data" #<---
# File input name
wf <- "Eastern Cape 1970-2020 data.csv" #"MLTB_SAWS.csv" #<---
# Output file name
w_out <- "EC_1970-2020_SAWS.csv" #<---

################################################################################
### CHUNK 2: Run the code
### DON'T CHANGE UNLESS YOU KNOW WHAT YOU'RE DOING
### Just run the code below. 


setwd(wd)
# Reading in the file as is...
raw_lines <- readLines(wf)
length(raw_lines)
## [1] 37909
STAT_NAME <- STAT_ID <- VAR <- YEAR <-DAY <-  NULL
STAT_DAT <- matrix(nrow=0,ncol=12)
#for (i in 1:100) {
for (i in 1:length(raw_lines)) {
  ll <- raw_lines[i]
  #print(i)
  # Needs to be updated if other variables are in the file
  if(grepl("Daily Maximum Temperature",ll)|
     grepl("Daily Minimum Temperature",ll)|
     grepl("Daily Rain",ll)) {
    STAT_NAME_f <-strsplit(ll,"-")[[1]][2] # Broken by Graaf-Reinett, or any similar hyphenated names. Fix manually. 
    STAT_NAME_f <-substr(STAT_NAME_f,2,nchar(STAT_NAME_f)-3)
    STAT_ID_f <- strsplit(ll,"\\[")[[1]][2]
    STAT_ID_f <- strsplit(STAT_ID_f,"\\]")[[1]][1]
    YEAR_f <- strsplit(ll," m ")[[1]][2]
    YEAR_f <- substr(YEAR_f,1,8)
    YEAR_f <- as.numeric(YEAR_f)
  }
  
  # Needs to be updated if other variables are in the file
  if(grepl("Daily Maximum Temperature",ll)) VAR_f<-"Daily Maximum Temperature"
  if(grepl("Daily Minimum Temperature",ll)) VAR_f<-"Daily Minimum Temperature"
  if(grepl("Daily Rain",ll)) VAR_f<-"Daily Rain"

  maybe_data <- suppressWarnings(as.numeric(strsplit(ll,",")[[1]][1]))
  if(maybe_data%in%1:31) {
    ll<-gsub(" ","",ll)
    row_dat_f <- suppressWarnings(unlist(strsplit(ll,",")))
    row_dat_f[row_dat_f=="***"]<-"-999"
    DAY_f <- as.numeric(row_dat_f[1])
    MON_DAT <- as.numeric(row_dat_f[-1])
    STAT_DAT <- rbind(STAT_DAT,MON_DAT)
    STAT_NAME <- c(STAT_NAME,STAT_NAME_f)
    STAT_ID <- c(STAT_ID,STAT_ID_f)
    VAR <- c(VAR,VAR_f)
    YEAR <- c(YEAR,YEAR_f)
    DAY<-c(DAY,DAY_f)
  }
  # Reset the temp names
  if (maybe_data%in%31) {
    STAT_NAME_f<- NULL;STAT_ID_f<- NULL
    VAR_f<- NULL
    YEAR_f <- NULL
  }  
}
colnames(STAT_DAT)<-c("JAN","FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC")
rownames(STAT_DAT)<-NULL
data.frame(STAT_ID,STAT_NAME,VAR,YEAR,DAY,STAT_DAT)-> RES

write.csv(RES,w_out)