Validate 'riskintro' Datasets — validate_table

Validates the datasets with 'riskintro' rules. The function checks the required and optional columns, and validates the data using the rules defined in the specifications.

Usage

validate_table_content(x, table_name, ...)

Arguments

x

A data frame or an 'sf' object to be validated.

table_name

A character string specifying the name of the dataset. It accepts one of the following values:

"animal_mobility"
"epi_units"
"entry_points"
"emission_risk_factors"

...

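Additional named arguments describing the columns to be renamed in the dataset. Each argument name is the column name expected by the specifications, and its value is the name of the corresponding existing column in x, i.e. expected_name = "existing_name". For example, point_name = "NAME", lng = "LONGITUDE_X".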

Value

A list containing the validation status of the dataset. The list contains the following elements:

required_columns: A list with the status of required columns.
optional_columns: A list with the status of optional columns.
validate_rules: A list with the status of validation rules.
dataset: The dataset after renaming and selecting the specified columns. If validation fails, this element is NULL (i.e. not available).

Details

The function checks whether the dataset contains the required and optional columns defined in the specifications. It also validates the data using the rules defined in the specifications. If errors are encountered or validation rules are not satisfied, the function returns a list describing the status of the validation.

Examples

## ---- read-epi-units-tunisia ----
tun_epi_files <-
  system.file(
    package = "riskintrodata",
    "samples",
    "tunisia",
    "epi_units", "tunisia_adm2_raw.gpkg"
  )

tun_epi_unit <- read_geo_file(tun_epi_files)

DATA_EPI_UNITS <- validate_table_content(
  x = tun_epi_unit,
  table_name = "epi_units",
  eu_name = "shapeName",
  user_id = "fid"
)

DATA_EPI_UNITS
#> $table_name
#> [1] "epi_units"
#> 
#> $required_columns
#> $chk
#> [1] FALSE
#> 
#> $msg
#> The following required columns are missing: `eu_name` and `geometry`
#> 
#> $details
#> [1] "eu_name"  "geometry"
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $optional_columns
#> $chk
#> [1] TRUE
#> 
#> $msg
#> [1] "Optional columns selected are available."
#> 
#> $details
#> character(0)
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $validate_rules
#> $chk
#> [1] FALSE
#> 
#> $msg
#> [1] "Found invalidities while checking dataset."
#> 
#> $details
#> # A tibble: 4 × 8
#>   colname  valid required column_found n     index value msg                    
#>   <chr>    <lgl> <lgl>    <lgl>        <lgl> <lgl> <lgl> <glue>                 
#> 1 eu_id    TRUE  FALSE    TRUE         NA    NA    NA    "eu_id" has been valid…
#> 2 eu_id    TRUE  FALSE    TRUE         NA    NA    NA    "eu_id" has been valid…
#> 3 eu_name  FALSE TRUE     FALSE        NA    NA    NA    Column: "eu_name" is m…
#> 4 geometry FALSE TRUE     FALSE        NA    NA    NA    Column: "geometry" is …
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $dataset
#> Simple feature collection with 268 features and 0 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: 7.530076 ymin: 30.23681 xmax: 11.59826 ymax: 37.55986
#> Geodetic CRS:  WGS 84
#> # A tibble: 268 × 1
#>                                                                             geom
#>                                                               <MULTIPOLYGON [°]>
#>  1 (((10.13861 36.89453, 10.14495 36.89476, 10.15127 36.89476, 10.1576 36.89235…
#>  2 (((10.05585 36.84308, 10.06575 36.85019, 10.07327 36.8544, 10.07366 36.85451…
#>  3 (((10.13862 36.89416, 10.1329 36.88994, 10.13283 36.88892, 10.1326 36.88572,…
#>  4 (((10.1317 36.88428, 10.1317 36.88271, 10.1317 36.8797, 10.12929 36.87579, 1…
#>  5 (((10.16651 36.88694, 10.16422 36.88874, 10.1576 36.89235, 10.15127 36.89476…
#>  6 (((10.27118 36.88874, 10.26842 36.88874, 10.26149 36.88783, 10.25577 36.8863…
#>  7 (((10.01018 37.00285, 10.0102 37.00285, 10.01045 37.00283, 10.01063 37.00281…
#>  8 (((10.19313 36.85656, 10.19313 36.85892, 10.19313 36.86404, 10.19313 36.8667…
#>  9 (((9.141866 36.86897, 9.140129 36.86767, 9.137473 36.86604, 9.133329 36.8623…
#> 10 (((9.086732 36.70221, 9.082556 36.70772, 9.078131 36.71146, 9.075724 36.7131…
#> # ℹ 258 more rows
#> 
#> attr(,"class")
#> [1] "table_validation_status"
## ---- read-animal-mobility-tunisia ----
tun_animal_mobility <-
  system.file(
    package = "riskintrodata",
    "samples",
    "tunisia",
    "animal_mobility", "ANIMAL_MOBILITY_raw.csv"
  )

x <- readr::read_csv(
  tun_animal_mobility,
)
#> Rows: 112 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): ORIGIN_NAME, ORIGIN_COUNTRY, ORIGIN_ISO3, DESTINATION_NAME, DESTINA...
#> dbl (5): ORIGIN_LONGITUDE_X, ORIGIN_LATITUDE_Y, DESTINATION_LONGITUDE_X, DES...
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

DATA_ANIMAL_MOBILITY <- apply_mapping(
  x,
  mapping = mapping_animal_mobility(
    o_name = "ORIGIN_NAME",
    o_lng = "ORIGIN_LONGITUDE_X",
    o_lat = "ORIGIN_LATITUDE_Y",
    d_name = "DESTINATION_NAME",
    d_lng = "DESTINATION_LONGITUDE_X",
    d_lat = "DESTINATION_LATITUDE_Y",
    quantity = "HEADCOUNT"
  ),
  validate = TRUE
)
#> ✔ All data in "animal_mobility" valided.

DATA_ANIMAL_MOBILITY
#> # A tibble: 112 × 12
#>    animal_mobility_id o_name   o_lng o_lat d_name d_lng d_lat quantity o_country
#>    <chr>              <chr>    <dbl> <dbl> <chr>  <dbl> <dbl>    <dbl> <chr>    
#>  1 am-00001           AEROP…  -4.08   14.5 AEROP… 10.8   33.9     2217 Mali     
#>  2 am-00002           AEROP… -17.5    14.7 AEROP… 10.7   34.7     5343 Senegal  
#>  3 am-00003           SENGH… -13.2    14.6 POINT…  7.66  33.5     7798 Senegal  
#>  4 am-00004           TESSA…   0.706  20.4 POINT… 10.7   32.0     9519 Mali     
#>  5 am-00005           PERUV… -77.1   -12.0 PORT …  9.89  37.3     6133 Peru     
#>  6 am-00006           BAHIA… -62.3    48.9 PORT … 10.1   33.9     1785 Canada   
#>  7 am-00007           AINZE…   8.26   35.6 BOUJA…  8.49  35.7     2693 Algeria  
#>  8 am-00008           AINZE…   8.26   35.6 BOUJA…  8.49  35.7     6307 Algeria  
#>  9 am-00009           AINZE…   8.26   35.6 BOUJA…  8.49  35.7     4791 Algeria  
#> 10 am-00010           BOUGO…   8.36   36.7 HALIMA 10.0   36.4     7299 Algeria  
#> # ℹ 102 more rows
#> # ℹ 3 more variables: o_iso3 <chr>, d_country <chr>, d_iso3 <chr>
# read Tunisia emission risk factors dataset ----
tun_erf_file <-
  system.file(
    package = "riskintrodata",
    "samples",
    "tunisia",
    "emission_risk_factor",
    "emission_risk_factors.csv"
  )
x <- read_emission_risk_factor_file(tun_erf_file)

DATA_ERF <- validate_table_content(x, table_name = "emission_risk_factors")

DATA_ERF
#> $table_name
#> [1] "emission_risk_factors"
#> 
#> $required_columns
#> $chk
#> [1] TRUE
#> 
#> $msg
#> [1] "All required columns selected."
#> 
#> $details
#> character(0)
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $optional_columns
#> $chk
#> [1] TRUE
#> 
#> $msg
#> [1] "No optional columns selected."
#> 
#> $details
#> character(0)
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $validate_rules
#> $chk
#> [1] TRUE
#> 
#> $msg
#> 19 valid rules checking dataset.
#> 
#> $details
#> # A tibble: 19 × 8
#>    colname                   valid required column_found n     index value msg  
#>    <chr>                     <lgl> <lgl>    <lgl>        <lgl> <lgl> <lgl> <glu>
#>  1 iso3                      TRUE  TRUE     TRUE         NA    NA    NA    "iso…
#>  2 iso3                      TRUE  TRUE     TRUE         NA    NA    NA    "iso…
#>  3 country                   TRUE  TRUE     TRUE         NA    NA    NA    "cou…
#>  4 animal_category           TRUE  TRUE     TRUE         NA    NA    NA    "ani…
#>  5 animal_category           TRUE  TRUE     TRUE         NA    NA    NA    "ani…
#>  6 species                   TRUE  TRUE     TRUE         NA    NA    NA    "spe…
#>  7 species                   TRUE  TRUE     TRUE         NA    NA    NA    "spe…
#>  8 disease_notification      TRUE  TRUE     TRUE         NA    NA    NA    "dis…
#>  9 targeted_surveillance     TRUE  TRUE     TRUE         NA    NA    NA    "tar…
#> 10 general_surveillance      TRUE  TRUE     TRUE         NA    NA    NA    "gen…
#> 11 screening                 TRUE  TRUE     TRUE         NA    NA    NA    "scr…
#> 12 precautions_at_the_borde… TRUE  TRUE     TRUE         NA    NA    NA    "pre…
#> 13 slaughter                 TRUE  TRUE     TRUE         NA    NA    NA    "sla…
#> 14 selective_killing_and_di… TRUE  TRUE     TRUE         NA    NA    NA    "sel…
#> 15 zoning                    TRUE  TRUE     TRUE         NA    NA    NA    "zon…
#> 16 official_vaccination      TRUE  TRUE     TRUE         NA    NA    NA    "off…
#> 17 last_outbreak_end_date    TRUE  TRUE     TRUE         NA    NA    NA    "las…
#> 18 commerce_illegal          TRUE  TRUE     TRUE         NA    NA    NA    "com…
#> 19 commerce_legal            TRUE  TRUE     TRUE         NA    NA    NA    "com…
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $dataset
#> # A tibble: 65 × 18
#>    iso3  country            disease animal_category species disease_notification
#>    <chr> <chr>              <chr>   <chr>           <chr>                  <int>
#>  1 ALB   Albania            Brucel… Domestic        Cattle                     0
#>  2 BWA   Botswana           Brucel… Domestic        Cattle                     1
#>  3 CYM   Cayman Islands     Brucel… Domestic        Cattle                     0
#>  4 JPN   Japan              Brucel… Domestic        Cattle                     0
#>  5 SYC   Seychelles         Brucel… Domestic        Cattle                     0
#>  6 SYR   Syria              Brucel… Domestic        Cattle                     0
#>  7 PSE   Palestine          Brucel… Domestic        Cattle                     0
#>  8 USA   United States of … Brucel… Domestic        Cattle                     0
#>  9 ARM   Armenia            Brucel… Domestic        Cattle                     0
#> 10 AZE   Azerbaijan         Brucel… Domestic        Cattle                     0
#> # ℹ 55 more rows
#> # ℹ 12 more variables: targeted_surveillance <int>, general_surveillance <int>,
#> #   screening <int>, precautions_at_the_borders <int>, slaughter <int>,
#> #   selective_killing_and_disposal <int>, zoning <int>,
#> #   official_vaccination <int>, last_outbreak_end_date <date>,
#> #   commerce_illegal <int>, commerce_legal <int>, data_source <chr>
#> 
#> $dataset_changes
#> character(0)
#> 
#> attr(,"class")
#> [1] "table_validation_status"
# read entry points data from Tunisia ----
tun_entry_points <- system.file(
  package = "riskintrodata",
  "samples",
  "tunisia",
  "entry_points",
  "BORDER_CROSSING_POINTS.csv"
)

x <- readr::read_delim(
  tun_entry_points
)
#> Rows: 110 Columns: 6
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (4): NAME, TYPE, MODE, SOURCES
#> dbl (2): LONGITUDE_X, LATITUDE_Y
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

DATA_ENTRY_POINTS <- validate_table_content(
  x = x,
  table_name = "entry_points",
  point_name = "NAME",
  lng = "LONGITUDE_X",
  lat = "LATITUDE_Y",
  mode = "MODE",
  type = "TYPE"
)

DATA_ENTRY_POINTS
#> $table_name
#> [1] "entry_points"
#> 
#> $required_columns
#> $chk
#> [1] TRUE
#> 
#> $msg
#> [1] "All required columns selected."
#> 
#> $details
#> character(0)
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $optional_columns
#> $chk
#> [1] TRUE
#> 
#> $msg
#> [1] "Optional columns selected are available."
#> 
#> $details
#> character(0)
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $validate_rules
#> $chk
#> [1] TRUE
#> 
#> $msg
#> 10 valid rules checking dataset.
#> 
#> $details
#> # A tibble: 10 × 8
#>    colname    valid required column_found n     index value msg                 
#>    <chr>      <lgl> <lgl>    <lgl>        <lgl> <lgl> <lgl> <glue>              
#>  1 point_name TRUE  TRUE     TRUE         NA    NA    NA    "point_name" has be…
#>  2 point_name TRUE  TRUE     TRUE         NA    NA    NA    "point_name" has be…
#>  3 lng        TRUE  TRUE     TRUE         NA    NA    NA    "lng" has been vali…
#>  4 lng        TRUE  TRUE     TRUE         NA    NA    NA    "lng" has been vali…
#>  5 lng        TRUE  TRUE     TRUE         NA    NA    NA    "lng" has been vali…
#>  6 lat        TRUE  TRUE     TRUE         NA    NA    NA    "lat" has been vali…
#>  7 lat        TRUE  TRUE     TRUE         NA    NA    NA    "lat" has been vali…
#>  8 lat        TRUE  TRUE     TRUE         NA    NA    NA    "lat" has been vali…
#>  9 mode       TRUE  FALSE    TRUE         NA    NA    NA    "mode" has been val…
#> 10 type       TRUE  FALSE    TRUE         NA    NA    NA    "type" has been val…
#> 
#> attr(,"class")
#> [1] "validation_status"
#> 
#> $dataset
#> Simple feature collection with 110 features and 4 fields
#> Geometry type: POINT
#> Dimension:     XY
#> Bounding box:  xmin: 7.572541 ymin: 31.94455 xmax: 11.59319 ymax: 37.26487
#> Geodetic CRS:  WGS 84
#> # A tibble: 110 × 5
#>    point_id point_name             mode  type             geometry
#>  * <chr>    <chr>                  <chr> <chr>         <POINT [°]>
#>  1 ep-00001 aeroport Djerba        C     AIR   (10.77592 33.87149)
#>  2 ep-00002 aeroport enfidha       C     AIR   (10.43123 36.07011)
#>  3 ep-00003 aeroport monastir      C     AIR   (10.75472 35.75806)
#>  4 ep-00004 aeroport sfax          C     AIR   (10.68861 34.72056)
#>  5 ep-00005 aeroport tabarka       C     AIR    (8.87528 36.98028)
#>  6 ep-00006 Aeroport tozeur        C     AIR    (8.10139 33.93889)
#>  7 ep-00007 aeroport tunis cathage C     AIR   (10.22694 36.85111)
#>  8 ep-00001 aeroport Djerba        C     AIR   (10.77592 33.87149)
#>  9 ep-00002 aeroport enfidha       C     AIR   (10.43123 36.07011)
#> 10 ep-00003 aeroport monastir      C     AIR   (10.75472 35.75806)
#> # ℹ 100 more rows
#> 
#> $dataset_changes
#> [1] "\"Entry points\" `lat` and `lng` table converted to type POINT_sf"
#> 
#> attr(,"class")
#> [1] "table_validation_status"