# Harmonise every DHS Kids Recode (KR) dataset held in `dhs` and collect one
# harmonised data frame per dataset in `all_countries_raw`.
all_countries_raw <- vector("list", length(dhs))

# Loop through each DHS dataset. seq_along() (rather than 1:length()) makes
# the loop a no-op when `dhs` is empty instead of iterating over c(1, 0).
for (i in seq_along(dhs)) {
  # Single-bracket subsetting keeps the current dataset wrapped in a
  # length-one list, which is what purrr::map() and plyr::ldply() expect below.
  dhs_hhs <- dhs[i]

  # Pre-select the variables of interest from the Kids Recode file.
  # Variable meanings follow the DHS recode manual:
  # - caseid: unique case identifier
  # - v000: country code and phase
  # - v001: cluster number
  # - v002: household number
  # - v003: respondent's line number
  # - v007: year of interview
  # - v012: respondent's age
  # - v101: region (copied into v024 during harmonisation below)
  # - v02*: sample design / location variables (v021, v022, v024, v025)
  # - v005: sample weight
  # - b5: child alive (Yes/No)
  # - b*: birth history variables
  # - b8: current age of child in years (b19 / hw1 hold the age in months)
  # - h*: health variables (fever, treatment, etc.)
  # - h22: fever in last 2 weeks
  # - h32*: fever treatment source variables
  # NOTE(review): contains("b") and contains("h") match any column whose name
  # contains that letter, so the explicit b5 / b8 / contains("h32") entries
  # are redundant; kept unchanged to preserve the pre-selected column set.
  dhs_kr <- dhs_hhs %>%
    purrr::map(~ dplyr::select(
      ., caseid, v000, v001, v002, v003, v007, v012, v101,
      contains("v02"), v005, b5, contains("b"), b8,
      contains("h"), contains("h32")
    ))

  # Harmonised pieces for the current KR file; bound together once after the
  # inner loop instead of growing (and list-wrapping) a data frame per pass.
  harmonised_parts <- list()

  for (j in seq_along(dhs_kr)) {
    dhs_hh <- dhs_kr[j]

    # Convert the length-one list to a data frame using plyr::ldply
    all_datasets <- ldply(dhs_hh, data.table)

    # Harmonise the variables. Any failure (e.g. a required column missing
    # from this survey) is reported and the dataset is skipped.
    harmonised <- tryCatch({
      if ("b19" %in% colnames(all_datasets)) {
        # Case 1: age in months ('b19') exists, use it directly.
        # v101 must stay in the selection: writing `v024 = v101` inside
        # select() renames the column, which removes v101 and makes the
        # `region` expression fail. v024 is therefore created as a copy of
        # v101 in mutate() instead.
        all_datasets %>%
          dplyr::select(v000, v001, v002, v003, v007, v005, v012, v101,
                        v021, v022, v025, b5, b8, h22, b19,
                        contains("h32")) %>%
          dplyr::mutate(v024 = v101,
                        region = ifelse(is.na(v024), v101, v024))
      } else if ("hw1" %in% colnames(all_datasets)) {
        # Case 2: no 'b19'; use 'hw1' as the age variable and expose it
        # under the common name b19.
        all_datasets %>%
          dplyr::select(v000, v001, v002, v003, v007, v005, v012, v101,
                        v021, v022, v025, b5, b8, h22, hw1,
                        contains("h32")) %>%
          dplyr::mutate(v024 = v101,
                        b19 = hw1,
                        region = ifelse(is.na(v024), v101, v024))
      } else {
        # Neither age variable is present: the dataset is still appended,
        # but with its full pre-selected column set. Say so explicitly
        # rather than letting it pass through silently.
        message("Dataset ", i, ": neither 'b19' nor 'hw1' found; ",
                "appending without harmonisation")
        all_datasets
      }
    }, error = function(e) {
      # Print error message if harmonization fails
      cat("ERROR :", conditionMessage(e), "\n")
      NULL
    })

    if (!is.null(harmonised)) {
      harmonised_parts[[length(harmonised_parts) + 1L]] <- harmonised
    }
  }

  # Store the harmonised dataset for this KR file as a plain data frame
  # (empty when every piece failed), not as a list wrapping a data frame.
  dhs_data <- dplyr::bind_rows(harmonised_parts)
  all_countries_raw[[i]] <- dhs_data
}
# Combine all KR datasets into one large harmonized data frame
all_datasets_final <- dplyr::bind_rows(all_countries_raw)

# ============================================================================
# STANDARDIZE REGION NAMES USING LOOKUP TABLE
# ============================================================================
# Load region standardization lookup table from repository
# (See: lookup/dhs_region_standardization.csv for complete mappings)
region_lookup <- read.csv("lookup/dhs_region_standardization.csv",
                          stringsAsFactors = FALSE)

# Columns used to match survey rows to lookup rows.
region_join_keys <- c("country_name", "year", "v024")

# Fail early with an actionable message when a join key is missing from the
# combined data.
# NOTE(review): `country_name` and `year` must already exist on
# `all_datasets_final` at this point; confirm where they are derived
# (e.g. from v000 / v007) before this step runs.
missing_data_keys <- setdiff(region_join_keys, names(all_datasets_final))
if (length(missing_data_keys) > 0) {
  stop("Cannot standardise regions: column(s) missing from the combined ",
       "DHS data: ", paste(missing_data_keys, collapse = ", "),
       call. = FALSE)
}

# The lookup must provide the same keys plus the standardised name.
missing_lookup_cols <- setdiff(c(region_join_keys, "region_name"),
                               names(region_lookup))
if (length(missing_lookup_cols) > 0) {
  stop("Cannot standardise regions: column(s) missing from the region ",
       "lookup table: ", paste(missing_lookup_cols, collapse = ", "),
       call. = FALSE)
}

# Duplicate keys in the lookup would make the left join emit several rows per
# child record, silently inflating the data set.
if (anyDuplicated(region_lookup[region_join_keys]) > 0) {
  warning("Region lookup table has duplicated (country_name, year, v024) ",
          "keys; the join will duplicate the matching survey rows",
          call. = FALSE)
}

# Apply standardization via left join on (country_name, year, v024).
# NOTE(review): the join matches `year` exactly, so year-independent mappings
# (e.g., Guinea, Benin) only apply if the lookup file lists them for every
# survey year; confirm against the lookup file.
all_datasets_final <- all_datasets_final %>%
  left_join(region_lookup, by = region_join_keys) %>%
  dplyr::mutate(
    # Use standardized region name if available, otherwise keep v024 code
    region_name = coalesce(region_name, as.character(v024))
  )