######################################################################
######################################################################
#
# READ DATA, BUILD SUITABLE DATA FRAMES
#
######################################################################
# author: M. Rehr
# LaMo: 2013-03-11
######################################################################

  
#########################
# Outlier Detection 
#########################

######################################################################
#
# raw 2010 data, DEM, MS Access Databases (2007) 
# [open with 2010 version, but push No-button in pop-up window]
#
#########
#
# countries: Romania(RO): ro-dem, Switzerland(CH): ch-dem, Cyprus(CY): cy-dem
#
# components(cp_number): PM10(5) daily; O3(7), SO2(1), NO2(8), CO(10) hourly
#
#########
######################################################################

# set path
# The script is expected to be started from the "Scripts" directory, e.g.
#   getwd()
#   "C:/Users/mrehr_01/Documents/ETC-ACM/subtask 1.0.1.2-5b/Scripts"
# All relative paths below are resolved against its parent, the project root
#   "C:/Users/mrehr_01/Documents/ETC-ACM/subtask 1.0.1.2-5b"

# specify script path (relative to the project root)
script_path <- "./Scripts/load_and_organize_data"

path <- '../'
# Step up only while the script directory is not reachable from the current
# working directory. An unconditional setwd('../') climbs one level too far
# when this section is re-run or when the working directory already is the
# project root.
if (!file.exists(script_path)) {
  setwd(path)
}

# ! first specify ODBC drivers by setting dsn (C:\Windows\SysWOW64\odbcad32.exe):
# choose "Microsoft Access Driver (*.mdb, *.accdb)",
# set dsn=ro-dem etc and specify respective path

# compare RODBC manual, appendix B
# (the manual is only opened in interactive sessions; a batch run has no use
# for a document viewer)
if (interactive()) {
  RShowDoc("RODBC", package="RODBC")
}

# also directories for saving data have to be created beforehand:
# "./Data" : 2010 DEM Data, "./Data/2010 DEM Data" : ro, ch, cy

# ! use 32-bit R-version to ensure interoperability

######################################################################


# READ DATA 
######################################################################

# load read.DEM() from script read.DEM.R; it reads the DEM data and stores it
# ! the directories the data frames are saved to must already exist
source(file.path(script_path, "read.DEM.R"))

# components to process: name, component code and period, position by position
cp_name <- c("pm10", "o3", "so2", "no2", "co")
cp_code <- c("5", "7", "1", "8", "10")
cp_period <- c("day", "hour", "hour", "hour", "hour")

# ROMANIA
# (wrap the call in system.time() to time it; measured: 199.00 sec = 3 min 19 sec)
read.DEM(country = "ro", cp_name, cp_code, cp_period)

# SWITZERLAND
# (measured: 52.40 sec)
read.DEM(country = "ch", cp_name, cp_code, cp_period)

# CYPRUS
# (measured: 4.45 sec)
read.DEM(country = "cy", cp_name, cp_code, cp_period)
######################################################################


# BUILD SUITABLE DATAFRAMES 
######################################################################

# load long.DEM() from script long.DEM.R; it reshapes the data to long format
source(file.path(script_path, "long.DEM.R"))

# ROMANIA
# (wrap the call in system.time() to time it; measured: 9.47 sec)
long.DEM(country = "ro", cp_name, cp_code, cp_period)

# SWITZERLAND
# (measured: 2.45 sec)
long.DEM(country = "ch", cp_name, cp_code, cp_period)

# CYPRUS
# (measured: 0.39 sec)
long.DEM(country = "cy", cp_name, cp_code, cp_period)
######################################################################


# additional data from DEM with information about station type and location
# (not used in this analysis)
######################################################################

# read data from tables "station_type" and "station" to data frames
# type and loc

# load data
# RODBC provides the ODBC access used for reading data from the databases
library(RODBC)
# (arbitrary country)
country <- "ro"
dsn <- paste(country, "dem", sep="-")
channel <- odbcConnect(dsn=dsn)
# odbcConnect() reports a failed connection by returning -1 (with a warning)
# instead of raising an error, so stop here with a clear message
if (!inherits(channel, "RODBC")) {
  stop("could not open ODBC data source '", dsn, "'", call. = FALSE)
}
# fetch both tables; the connection is closed even when a fetch fails
station_tables <- tryCatch(
  list(type = sqlFetch(channel, "station_type"),
       loc = sqlFetch(channel, "station")),
  finally = close(channel)
)
type <- station_tables[["type"]]
loc <- station_tables[["loc"]]
rm(station_tables)
# a failed query can come back as something other than a data frame; catch
# that here rather than in the column selection below
if (!is.data.frame(type) || !is.data.frame(loc)) {
  stop("could not read tables 'station_type' and 'station' from '", dsn, "'",
       call. = FALSE)
}

# select relevant columns
type <- type[, 2:5]
# type is now of format: sn_code - ts_code - st_startdate - st_enddate
loc <- loc[, c(2, 5, 18:20)]
# loc is now of format: sn_code - sn_eu_code - sn_longitude_d - sn_latitude_d - sn_altitude

type$sn_code <- as.factor(type$sn_code)
type$ts_code <- as.factor(type$ts_code)
#summary(type)
# at times multiple entries per station
#subset(type, type$sn_code=="1027")

loc$sn_code <- as.factor(loc$sn_code)
# NA coding (-9999 -> NA, 9999 -> NA) in sn_altitude
loc$sn_altitude[loc$sn_altitude %in% c(-9999, 9999)] <- NA
#summary(loc)

# save data (objects keep the names "type" and "loc" inside the files)
save(type, file="./Data/2010 DEM Data/type")
save(loc, file="./Data/2010 DEM Data/loc")
######################################################################


#########################
# Break detection
#########################

######################################################################
#
# AirBase data, released 22Feb2012
# years 2000 to 2010
#
#
#########
#
# countries: [Germany(DE), The Netherlands(NL), Spain(ES)]
#            The Netherlands(NL), Czech Republic(CZ)    
#
# components(cp_number):  O3(00007), SO2(00001) hourly, PM10(00005) hourly[/daily];
#                        [NO2(00008), CO(00010) hourly[/monthly]]
#
# [how many files per component-type-country combination? example:]
# source(paste(script_path, "read.AirBase.R", sep="/"))
# length(read.filenames("00010","month","DE"))
#
#########
######################################################################

# set path
# The script is expected to be started from the "Scripts" directory, e.g.
#   getwd()
#   "C:/Users/mrehr_01/Documents/ETC-ACM/subtask 1.0.1.2-5b/Scripts"
# All relative paths below are resolved against its parent, the project root
#   "C:/Users/mrehr_01/Documents/ETC-ACM/subtask 1.0.1.2-5b"

# specify script path (relative to the project root)
script_path <- "./Scripts/load_and_organize_data"

path <- '../'
# Step up only while the script directory is not reachable from the current
# working directory. The "Outlier Detection" part of this file already moves
# to the project root, so an unconditional setwd('../') here would climb one
# level too far when both parts run in the same session.
if (!file.exists(script_path)) {
  setwd(path)
}

# first download (and un-zip) AirBase_country_v6 directories from the EEA web page
# http://www.eea.europa.eu/data-and-maps/data/ds_resolveuid/FFAA9B8F-3379-457E-A60A-6491F4447724 
# into "./Data" 
# also directories for saving data have to be created beforehand:
# "./Data" : AirBase Data, "./Data/AirBase Data" : NL, CZ


## READ DATA ##
######################################################################

# load read.AirBase() (all components at once) and import.AirBase()
# (component-wise) from script read.AirBase.R; they read and store the data
# ! the directories the data frames are saved to must already exist
source(file.path(script_path, "read.AirBase.R"))

# components to process: name, component code and period, position by position
cp_name <- c("o3", "so2", "pm10", "pm10")
cp_code <- c("00007", "00001", "00005", "00005")
cp_period <- c(rep("hour", 3), "day")

# THE NETHERLANDS
# (wrap the call in system.time() to time it; measured: 54.93 sec)
read.AirBase(country = "NL", cp_name, cp_code, cp_period)

# CZECH REPUBLIC
# (measured: 121.04 sec = 2 min 1.04 sec)
read.AirBase(country = "CZ", cp_name, cp_code, cp_period)
######################################################################


##  BUILD SUITABLE DATAFRAMES ##
######################################################################

# load long.AirBase() from script long.AirBase.R; it reshapes the data to
# long format
source(file.path(script_path, "long.AirBase.R"))

# THE NETHERLANDS
# (wrap the call in system.time() to time it; measured: 42.61 sec)
long.AirBase(country = "NL", cp_name, cp_code, cp_period)

# CZECH REPUBLIC
# (measured: 75.26 sec = 1 min 15.26 sec)
long.AirBase(country = "CZ", cp_name, cp_code, cp_period)
######################################################################


# additional data from AirBase with information about station type and location
# (not used in this analysis)
######################################################################

# read data from files "AirBase_NL_v6_stations.csv" and "AirBase_CZ_v6_stations.csv"
# and save the selected columns as data frame "sns" in files "NL_sns" and "CZ_sns"

# Read the AirBase station file of one country and save a column subset.
#
# Arguments:
#   country  single character string used in the directory and file names
#            (e.g. "NL"); paths are relative to the working directory
#
# Reads the tab-separated file
#   ./Data/AirBase_<country>_v6/AirBase_<country>_v6_stations.csv
# keeps ten of its columns (selected by position) under new names and saves
# the result as object "sns" to
#   ./Data/AirBase Data/<country>/<country>_sns
# The target directory has to exist beforehand.
#
# Value: called for its side effect; returns the (invisible NULL) value of save().
read.ABsns <- function(country){
  # a vector of countries would silently build several paths at once
  stopifnot(is.character(country), length(country) == 1L)
  # read data from file "AirBase_country_v6_stations.csv"
  loc_sn <- paste("./Data/AirBase", country, "v6", sep="_")
  snFile <- paste("AirBase", country, "v6_stations.csv", sep="_")
  # TRUE instead of T: T is an ordinary variable and can be reassigned
  snData_all <- read.csv(file=file.path(loc_sn, snFile), header=TRUE, sep="\t")
  # select variables (by column position in the station file) and rename them
  sns <- snData_all[, c(1, 5, 13:15, 8, 10, 9, 6, 7)]
  names(sns) <- c("sn_code", "sn_name", "long_deg", "lat_deg", "alt",
                  "sn_type", "sn_area", "ozone", "start", "end")
  # save data; the object keeps the name "sns" inside the file
  name <- paste(country, "sns", sep="_")
  save(sns, file=file.path("./Data/AirBase Data", country, name))
}

# store the station data of The Netherlands and the Czech Republic, in that order
invisible(lapply(c("NL", "CZ"), read.ABsns))

######################################################################
######################################################################