--- title: "Project Metadata" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Project Metadata} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- The Wildlife Disease Data Standard is composed of two components - disease data and project metadata. Project metadata is a key component of WDDS because it makes it easier for others (including your future self) to find and re-use your data. In this vignette we will ingest project metadata from a CSV, restructure the data, and then create a json object. ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(wddsWizard) library(dplyr) library(stringr) ``` ## Required fields The following fields are required to be in the data. ```{r required fields, echo=FALSE, message=TRUE, warning=TRUE} required_fields <- wddsWizard::project_metadata_schema$properties[wddsWizard::project_metadata_required_fields] descriptions <- purrr::map_depth(required_fields, 1, "description") df_data_desc <- data.frame(Field = wddsWizard::project_metadata_required_fields, Descriptions = unlist(descriptions), row.names = NULL) df_data_desc |> dplyr::mutate(TDWG_url = dplyr::case_when( stringr::str_detect(string = Descriptions, pattern = "http://rs.tdwg.org/dwc/terms/") ~ stringr::str_extract(string = Descriptions, pattern = "http://rs.tdwg.org/dwc/terms/.*"), TRUE ~ "" )) |> dplyr::mutate(Field = case_when( TDWG_url != "" ~ kableExtra::cell_spec(Field, format = "html", link = TDWG_url), TRUE ~ Field )) |> dplyr::mutate(Descriptions = dplyr::case_when( stringr::str_detect(string = Descriptions, pattern = "http://rs.tdwg.org/dwc/terms/") ~ stringr::str_remove(string = Descriptions, pattern = "See http://rs.tdwg.org/dwc/terms/.*"), TRUE ~ Descriptions )) |> dplyr::select(-TDWG_url) |> kableExtra::kbl(escape = FALSE) |> kableExtra::kable_styling() ``` ## Load in the CSV and clean it up Our example data requires some light cleaning to make transforming it into csv easier. ```{r load-data} project_metadata <- wdds_example_data(version = "latest", file = "example_project_metadata.csv") |> read.csv() ## turn empty strings into NAs in the group field project_metadata <- project_metadata |> dplyr::mutate(Group = dplyr::case_when( Group != "" ~ Group, TRUE ~ NA )) ## use `fill` to complete the items column and `mutate` to make groups a little ## more ergonomic project_metadata_filled <- tidyr::fill(data = project_metadata, Group) ``` ## Restructure data The validation schema is expecting JSON, so we have to restructure the data into a list that can be converted to JSON. For Creators, Resources, and Funding References, its possible to have multiple entities in each group. In our example data, there are two creators and three funding references. So we need to pull out the `entity_id`s for the creators and funding references then clean up the `Group` field so it can be used a general category for Creators, Resources, and Funding References. ```{r restructure} # get ids for components of a group. project_metadata_ids <- project_metadata_filled |> dplyr::mutate( entity_id = stringr::str_extract(string = Group, pattern = "[0-9]{1,}"), # make sure that there are no NA entity IDs entity_id = dplyr::case_when( is.na(entity_id) ~ "1", TRUE ~ entity_id ) ) |> # drop entity ids from group field and convert to camel case dplyr::mutate( Group = stringr::str_replace_all(string = Group, pattern = " [0-9]{1,}", replacement = ""), Group = snakecase::to_lower_camel_case(Group) ) ## split dataframe by Group for further processing project_metadata_list <- split(project_metadata_ids, project_metadata_ids$Group) # The `get_entity` function creates standard entities that will be easier to transform json project_metadata_list_entities <- purrr::map(project_metadata_list, function(x) { x_typed <- dplyr::left_join(x, wddsWizard::schema_properties, by = c("Group" = "name")) |> dplyr::mutate(to_split = dplyr::case_when( is_array ~ TRUE, TRUE ~ FALSE )) if (all(!x_typed$to_split)) { out <- get_entity(x) return(out) } # # if (all(x$entity_id == "1")) { # out <- get_entity(x) # return(out) # } x_list <- split(x, x$entity_id) names(x_list) <- NULL out <- purrr::map(x_list, get_entity) return(out) }) ``` ## Make the json! In a simpler world - you could just run the following code and it would work. ```{r mockingbird, eval=FALSE} ## if only, if only the mockingbird sings jsonlite::toJSON(project_metadata_list_entities, pretty = TRUE, dataframe = "columns") ``` BUT because datacite's structures are more complex, we need to do some prep. Luckily, there are a host of prep functions that already exist in this package! These mostly tag list items with `jsonlite::unbox` and/or wrap things in lists so that when converted to json, they match the data standard's expected formats. ```{r prep-data} project_metadata_json <- prep_for_json(project_metadata_list_entities) |> jsonlite::toJSON(pretty = TRUE) ``` ## Validate Project metadata We can validate the entire project metadata object by using the `project_metadata.json` schema. ```{r validate-project, message=FALSE, warning=FALSE} schema <- wdds_json(version = "latest", file = "schemas/project_metadata.json") project_validator <- jsonvalidate::json_validator(schema, engine = "ajv") project_validation <- project_validator(project_metadata_json, verbose = TRUE) ## check for errors! errors <- attributes(project_validation) if (!project_validation) { errors$errors } else { print("Valid project metadata!😁") } ``` # What if we just use the project metadata template? Great question! If you use the project metadata template, then you can do the following. ```{r} project_metadata <- wdds_example_data(version = "latest", file = "example_project_metadata.csv") |> read.csv() project_metadata_prepped <- prep_from_metadata_template(project_metadata) |> jsonlite::toJSON(pretty = TRUE) ```