
Implement data.table functions for handling big objects in memory #278

Merged (9 commits, Oct 20, 2023)
9 changes: 5 additions & 4 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Type: Package
Package: eurostat
Title: Tools for Eurostat Open Data
Version: 4.0.0.9004
Date: 2023-08-15
Version: 4.0.0.9006
Date: 2023-10-20
Authors@R: c(
person("Leo", "Lahti", , "leo.lahti@iki.fi", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5537-637X")),
@@ -31,7 +31,7 @@ URL: https://ropengov.github.io/eurostat/,
https://github.com/rOpenGov/eurostat
BugReports: https://github.com/rOpenGov/eurostat/issues
Depends:
R (>= 3.5.0)
R (>= 3.6.0)
Imports:
classInt,
countrycode,
@@ -51,7 +51,8 @@ Imports:
stringr,
tibble,
tidyr (>= 1.0.0),
xml2
xml2,
data.table (>= 1.14.8)
Suggests:
giscoR,
knitr,
6 changes: 6 additions & 0 deletions NAMESPACE
@@ -35,6 +35,11 @@ importFrom(RefManageR,toBiblatex)
importFrom(classInt,classIntervals)
importFrom(countrycode,countrycode)
importFrom(curl,curl_download)
importFrom(data.table,":=")
importFrom(data.table,.SD)
importFrom(data.table,fread)
importFrom(data.table,melt)
importFrom(data.table,setDT)
importFrom(digest,digest)
importFrom(dplyr,"%>%")
importFrom(dplyr,case_when)
@@ -69,6 +74,7 @@ importFrom(regions,validate_geo_code)
importFrom(regions,validate_nuts_regions)
importFrom(rlang,"!!")
importFrom(rlang,sym)
importFrom(stats,na.omit)
importFrom(stringi,stri_extract_first_regex)
importFrom(stringi,stri_replace_all_fixed)
importFrom(stringi,stri_replace_all_regex)
19 changes: 18 additions & 1 deletion NEWS.md
@@ -1,8 +1,25 @@
# eurostat 4.0.0.9006

## Bug fixes

* Added a more informative warning message for situations where TOC datasets downloaded from Eurostat might not have proper titles. For some reason this issue was isolated to the German and French language versions of the TOC, while the English language TOC had proper titles for all items.

## Minor updates

* Added `suppressWarnings()` to some tests that use TOCs directly or indirectly, as those tests are not directly concerned with TOC files.
* Added a new internal function, `clean_eurostat_toc()`, for easy removal of TOC objects from the `.EurostatEnv` environment.

# eurostat 4.0.0.9005

## Major updates

* Add data.table to package Imports and make the use of data.table functions optional via the `use.data.table` argument in `get_eurostat()`. This is especially useful with big datasets that would otherwise take a long time to pass through the different data cleaning functions, or crash R with their large memory footprint. (issue #277, PR #278)

# eurostat 4.0.0.9004

## Major updates

* switch from `httr` package to `httr2`
* switch from `httr` package to `httr2` (issue #273, PR #276)

# eurostat 4.0.0.9003

24 changes: 24 additions & 0 deletions R/eurostat-package.R
@@ -309,6 +309,30 @@
#' See also section "Eurostat: Copyright notice and free re-use of data"
#' in [get_eurostat()] documentation.
#'
#' # Strategies for handling large datasets more efficiently
#'
#' Most Eurostat datasets are relatively manageable, at least on a machine
#' with 16 GB of RAM. The largest dataset in the Eurostat database at the
#' time of writing had 148,362,539 (148 million) values, which results in an
#' object with 148 million rows in the tidy (long) data format. The test
#' machine with 16 GB of RAM was able to handle the second largest dataset
#' in the database, with 91 million values (rows).
#'
#' There are nevertheless several ways to make the data fetching
#' functions perform faster:
#'
#' * Turn caching off: `get_eurostat(cache = FALSE)`
#' * Turn cache compression off (may result in rather large cache files!):
#' `get_eurostat(compress_file = FALSE)`
#' * If you want faster caching with manageable file sizes, use factors:
#' `get_eurostat(cache = TRUE, compress_file = TRUE, stringsAsFactors = TRUE)`
#' * Use faster data.table functions: `get_eurostat(use.data.table = TRUE)`
#' * Keep column processing to a minimum:
#' `get_eurostat(time_format = "raw", type = "code")` etc.
#' * Read the `get_eurostat()` function documentation carefully so that you
#' understand what the different arguments do
#' * Filter the dataset so that you fetch only the parts you need!
#'
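Taken together, the strategies above can be combined in a single call. This is a hedged sketch only: the dataset id `nama_10_gdp` is an illustrative assumption, not part of this PR.

```r
library(eurostat)

# Illustrative only: combine the speed-oriented arguments listed above.
# The dataset id below is an assumption chosen for demonstration.
dat <- get_eurostat(
  "nama_10_gdp",            # hypothetical example dataset
  time_format = "raw",      # keep time as character, skip date parsing
  type = "code",            # skip label lookups
  stringsAsFactors = TRUE,  # faster caching with manageable file sizes
  use.data.table = TRUE     # faster data.table backend
)
```

Note that `use.data.table` affects the bulk download path; API-side `filters` queries take a different route through the JSON API.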
#' @examples library(eurostat)
#' @section regions functions:
#' For working with sub-national statistics the basic functions of the
21 changes: 13 additions & 8 deletions R/get_eurostat.R
@@ -63,6 +63,8 @@
#' Also possible non-real zero "0n" is indicated in flags column.
#' Flags are not available for eurostat API, so `keepFlags`
#' can not be used with a `filters`.
#' @param use.data.table Logical; use faster data.table functions? Defaults to
#' `FALSE`. On Windows, requires that RTools is installed so that the external
#' `gzip` command used by `data.table::fread()` is available.
#' @inheritDotParams get_eurostat_json
#'
#' @inherit eurostat-package references
@@ -71,6 +73,7 @@
#' @inheritSection eurostat-package Filtering datasets
#' @inheritSection eurostat-package Citing Eurostat data
#' @inheritSection eurostat-package Disclaimer: Availability of filtering functionalities
#' @inheritSection eurostat-package Strategies for handling large datasets more efficiently
#'
#' @author
#' Przemyslaw Biecek, Leo Lahti, Janne Huovari, Markus Kainu and Pyry Kantanen
@@ -182,6 +185,7 @@ get_eurostat <- function(id,
compress_file = TRUE,
stringsAsFactors = FALSE,
keepFlags = FALSE,
use.data.table = FALSE,
...) {

# Check if you have access to ec.europe.eu.
@@ -406,24 +410,25 @@
# If filters value is NULL
# -> Download from SDMX 2.1 REST API (replaces old "Bulk download")

y_raw <- try(get_eurostat_raw(id), silent = TRUE)
if ("try-error" %in% class(y_raw)) {
y <- try(get_eurostat_raw(id, use.data.table = use.data.table), silent = TRUE)
if ("try-error" %in% class(y)) {
stop(paste("get_eurostat_raw fails with the id", id))
}

# If download from SDMX 2.1 REST API is successful
# -> tidy the dataset with tidy_eurostat function

y <- tidy_eurostat(
y_raw,
y,
time_format,
select_time,
stringsAsFactors = stringsAsFactors,
keepFlags = keepFlags
keepFlags = keepFlags, use.data.table = use.data.table
)

if (identical(type, "code")) {
y <- y
# do nothing
# y <- y
} else if (identical(type, "label")) {
y <- label_eurostat(y, lang)
} else if (identical(type, "both")) {
@@ -439,12 +444,12 @@
# situations the cached file could go missing? Not very likely though

message(paste("Reading cache file", cache_file_bulk, "and filtering it"))
y_raw <- readRDS(cache_file_bulk)
y <- readRDS(cache_file_bulk)
for (i in seq_along(filters)) {
y_raw <- dplyr::filter(y_raw,
y <- dplyr::filter(y,
!!rlang::sym(names(filters)[i]) == filters[i])
}
y <- y_raw
# y <- y_raw
} else if (file.exists(cache_file)) {
cf <- path.expand(cache_file)
message(paste("Reading cache file", cf))
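The cache-filtering loop above builds each condition programmatically from the names of `filters`. A self-contained sketch of the same `dplyr`/`rlang` pattern on a toy data frame (the column names and values are illustrative assumptions, not Eurostat data):

```r
library(dplyr)
library(rlang)

# Toy stand-in for a cached Eurostat table
y <- data.frame(
  geo    = c("FI", "SE", "FI"),
  unit   = c("PC", "PC", "MIO_EUR"),
  values = c(1.2, 3.4, 5.6)
)

filters <- list(geo = "FI", unit = "PC")

# Same pattern as in get_eurostat(): one column filtered per iteration,
# with the column name injected via rlang::sym() and !!
for (i in seq_along(filters)) {
  y <- dplyr::filter(y, !!rlang::sym(names(filters)[i]) == filters[[i]])
}
# y now contains only the rows where geo == "FI" and unit == "PC"
```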
45 changes: 24 additions & 21 deletions R/get_eurostat_raw.R
@@ -31,9 +31,10 @@
#' @importFrom utils download.file
#' @importFrom tibble as_tibble
#' @importFrom curl curl_download
#' @importFrom data.table fread
#'
#' @keywords utilities database
get_eurostat_raw <- function(id) {
get_eurostat_raw <- function(id, use.data.table = FALSE) {
base <- getOption("eurostat_url")

url <- paste0(
@@ -50,29 +51,31 @@ get_eurostat_raw <- function(id) {
curl::curl_download(url = url, destfile = tfile)
} else {
# R Packages (2e): Restore state with base::on.exit()
# Use timeout = 90 for bigger datasets
op <- options(timeout = 90)
# timeout = 120 should in most cases be enough for even the biggest datasets
op <- options(timeout = 120)
on.exit(options(op), add = TRUE)
utils::download.file(url, tfile)
}

# OLD CODE
dat <- readr::read_tsv(gzfile(tfile),
na = ":",
col_types = readr::cols(.default = readr::col_character())
)

# NEW CODE: data.table
# dat <- data.table::fread(cmd = paste("gzip -dc", tfile),
# na.strings = ":",
# colClasses = "character")
# The reason why data.table is not currently used is that readr::cols
# and readr::col_character() worked better with some datasets
# and because RAM usage was not that much lower with data.table

# OLD CODE
dat <- tibble::as_tibble(dat)


if (!use.data.table) {
# OLD CODE
dat <- readr::read_tsv(gzfile(tfile),
na = ":",
progress = TRUE,
col_types = readr::cols(.default = readr::col_character())
)
} else if (use.data.table) {
# NEW CODE: data.table
dat <- data.table::fread(cmd = paste("gzip -dc", tfile),
na.strings = ":",
header = TRUE,
colClasses = "character")

# OLD CODE
# data.table object does not need to be converted into a tibble at this
# point as it will be handled by data.table functions in tidy_eurostat.
# dat <- tibble::as_tibble(dat)
}

# check validity
if (ncol(dat) < 2 || nrow(dat) < 1) {
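The data.table branch above shells out to `gzip` to decompress the bulk file before parsing. A minimal standalone sketch of that read path (the toy file below is an illustrative assumption; like the PR code, it requires a `gzip` executable on the PATH, e.g. via RTools on Windows):

```r
library(data.table)

# Write a tiny gzipped TSV that loosely mimics the Eurostat bulk format
tfile <- tempfile(fileext = ".tsv.gz")
con <- gzfile(tfile, "wt")
writeLines(c(
  "freq,unit,geo\\TIME_PERIOD\t2021\t2022",
  "A,PC,FI\t1.5\t:"
), con)
close(con)

# Same call shape as in get_eurostat_raw(): decompress via an external
# gzip process and read all columns as character, treating ":" as NA
dat <- data.table::fread(
  cmd = paste("gzip -dc", tfile),
  na.strings = ":",
  header = TRUE,
  colClasses = "character"
)
# dat has one data row; the ":" cell is read as NA
```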
6 changes: 6 additions & 0 deletions R/get_eurostat_toc.R
@@ -74,3 +74,9 @@ get_eurostat_toc <- function(lang = "en") {

invisible(get(language_version, envir = .EurostatEnv))
}

clean_eurostat_toc <- function() {
objects_in_env <- objects(envir = .EurostatEnv, all.names = TRUE)
toc_objects_in_env <- objects_in_env[grep(".eurostatTOC", objects_in_env)]
remove(list = toc_objects_in_env, envir = .EurostatEnv)
}
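The new helper removes every object whose name matches `.eurostatTOC` from the package environment. A hedged sketch of the same pattern against a throwaway environment (the object name `en.eurostatTOC` is an assumed example of the naming scheme used by `set_eurostat_toc()`):

```r
# Stand-in for .EurostatEnv with one cached TOC and one unrelated object
env <- new.env()
assign("en.eurostatTOC", data.frame(title = "x"), envir = env)  # assumed name
assign("other_object", 1, envir = env)

# Same logic as clean_eurostat_toc(): find and remove the TOC objects
objs <- objects(envir = env, all.names = TRUE)
toc_objs <- objs[grep(".eurostatTOC", objs)]
remove(list = toc_objs, envir = env)

objects(envir = env)  # only "other_object" remains
```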
22 changes: 19 additions & 3 deletions R/set_eurostat_toc.R
@@ -35,6 +35,7 @@ set_eurostat_toc <- function(lang = "en") {

.eurostatTOC$hierarchy <- toc_determine_hierarchy(.eurostatTOC$title)
.eurostatTOC$title <- trimws(.eurostatTOC$title, which = "left")
.eurostatTOC$values <- as.numeric(.eurostatTOC$values)

assign(language_version, .eurostatTOC, envir = .EurostatEnv)
}
@@ -99,13 +100,28 @@ toc_determine_hierarchy <- function(input_string) {

# If all x mod y calculations equal 0 everything is ok.
# If not, input is somehow mangled
# For example "    General and regional statistics" (4 whitespace characters)
# passes the modulo check, but so does "            " (12 whitespace
# characters without any letters). Normally all dataset items are expected
# to have a title to determine their place in the hierarchy.
# Testing for this might be a bit tricky.
if (!all((number_of_whitespace %% 4) %in% c(0))) {
warning("Mangled input")
return(invisible())
warning(
paste(
"TOC indentation was not uniform in all rows or there were some",
"items that were missing a proper title. Hierarchy value set to NA",
"for problematic rows."
)
)
invalid_rows <- which(!(number_of_whitespace %% 4) %in% c(0))
# return(invisible())
hierarchy <- number_of_whitespace %/% 4
hierarchy[invalid_rows] <- NA
return(hierarchy)
}

# If the whitespace count is 0, the item gets hierarchy level 0
(number_of_whitespace %/% 4)
hierarchy <- number_of_whitespace %/% 4
hierarchy
# Or should it be 1?
# (number_of_whitespace %/% 4) + 1
}