6 Case Study: French-Severn Forest

Now that we have walked through the segmentation and imputation processes, we can complete an additional case study by applying the same algorithms and code to the French-Severn Forest (FSF) in the Great Lakes region of Ontario. Instead of separating the code into distinct sections, we run the entire segmentation and imputation code as single chunks and present the results.

6.1 Segmentation

6.1.1 Run Algorithm

##################################
### INSTALL PACKAGES IF NEEDED ###
##################################

# install.packages(c('terra',
#                    'tidyverse',
#                    'exactextractr',
#                    'sf',
#                    'janitor',
#                    'berryFunctions',
#                    'lwgeom',
#                    'magrittr',
#                    'gridExtra',
#                    'knitr'))

# make sure to have OTB installed from here:
# https://www.orfeo-toolbox.org/

#####################
### LOAD PACKAGES ###
#####################

# load packages
library(terra)
library(tidyverse)
library(exactextractr)
library(sf)
library(janitor)
library(berryFunctions)
library(lwgeom)
library(magrittr)
library(gridExtra)
library(knitr)

####################################
### SET CODE AND FILE PARAMETERS ###
####################################

# set file names for ALS input variables
# these should be gridded raster data with the same CRS and extent
# p95 = 95th percentile of returns height > 1.3 m
# cc = canopy cover (% of first returns > 2 m)
# cv = coefficient of variation of returns height > 1.3 m
p95_f <- 'D:/ontario_inventory/FSF/ALS/zq95.img'
cc_f <- 'D:/ontario_inventory/FSF/ALS/cov_2m.img'
cv_f <- 'D:/ontario_inventory/FSF/ALS/cv.img'

# set file location of roads shape file (spatial lines)
# roads will be polygonized, masked from segmentation
# and re-added to final dataset as polygons
roads_f <-
  'D:/ontario_inventory/FSF/roads/FS_roads.shp'

# set file location of FRI polygons shape file
# FRI POLYTYPE should have a "WAT" classification to mask water polygons
fri <- 'D:/ontario_inventory/FSF/FRI/FSF_opi_polygon_CSRS_NAD83_17.shp'

# set output folder for files generated
# make sure no "/" at end of folder location!
out_dir <- 'C:/Users/bermane/Desktop/FSF'

# set folder location of OTB (where you installed OTB earlier)
otb_dir <- "C:/OTB/bin"

# set GRM segmentation parameters
# the defaults are listed below
# refer to the paper or the OTB GRM webpage for a description of the parameters
thresh <- "10"
spec <- "0.1"
spat <- "0.5"

# set file location of 2018 VLCE 2.0 landcover data
# the landcover year should match the ALS acquisition year (2018 here)
# download here:
# https://opendata.nfis.org/mapserver/nfis-change_eng.html
lc_f <- 'D:/ontario_inventory/VLCE/CA_forest_VLCE2_2018.tif'

###################################################
### LOAD MULTI BAND ALS RASTER FOR SEGMENTATION ###
###################################################

# stack rasters
spl <- rast(c(p95_f, cc_f, cv_f))

# apply smoothing function on 5 cell square
spl[[1]] <- focal(spl[[1]], w = 5, fun = "mean")
spl[[2]] <- focal(spl[[2]], w = 5, fun = "mean")
spl[[3]] <- focal(spl[[3]], w = 5, fun = "mean")

# create ALS template with all values equal to 1
spl_temp <- spl[[1]]
spl_temp[] <- 1

##################
### MASK ROADS ###
##################

# load roads layer
roads <- vect(roads_f)

# reproject to match lidar
roads <- project(roads, spl)

# create roads polygon
spl_r <- mask(spl_temp, roads, touches = T)
npix <- sum(values(spl_r), na.rm = T)
spl_r <- as.polygons(spl_r)
names(spl_r) <- 'POLYTYPE'
spl_r$POLYTYPE <- 'RDS'
spl_r$nbPixels <- npix

# mask road pixels to NA
spl <- spl %>%
  mask(., roads, inverse = T, touches = T)

###########################
### MASK WATER POLYGONS ###
###########################

# water polygons from the FRI are masked and re-added after segmentation

# load photo interpreted polygons
poly <- vect(fri)

# subset polygons that are WAT
poly_sub <- poly[poly$POLYTYPE %in% c('WAT')]

# reproject to match lidar
poly_sub <- project(poly_sub, spl)

# loop through water polygons, mask raster, and vectorize
for (i in 1:length(poly_sub)) {
  pt <- poly_sub$POLYTYPE[i]
  if (i == 1) {
    spl_pt <- spl_temp %>% crop(poly_sub[i], snap = 'out') %>%
      mask(poly_sub[i], touches = T)
    npix <- sum(values(spl_pt), na.rm = T)
    spl_pt <- as.polygons(spl_pt)
    names(spl_pt) <- 'POLYTYPE'
    spl_pt$POLYTYPE <- pt
    spl_pt$nbPixels <- npix
  } else{
    # skip polygons that fail to crop/mask (e.g., they fall outside the raster extent)
    if (is.error(spl_temp %>% crop(poly_sub[i], snap = 'out') %>%
                 mask(poly_sub[i], touches = T)) == F) {
      spl_hold <- spl_temp %>% crop(poly_sub[i], snap = 'out') %>%
        mask(poly_sub[i], touches = T)
      npix <- sum(values(spl_hold), na.rm = T)
      spl_hold <- as.polygons(spl_hold)
      names(spl_hold) <- 'POLYTYPE'
      spl_hold$POLYTYPE <- pt
      spl_hold$nbPixels <- npix
      spl_pt <- rbind(spl_pt, spl_hold)
    }
  }
}

# reproject whole FRI to match lidar
poly <- project(poly, spl)

# mask lidar outside of FRI
spl <- mask(spl, poly, inverse = F, touches = T)

# mask WAT polygons
spl <- mask(spl, poly_sub, inverse = T, touches = T)

###############################################
### COMBINE ROAD AND WATER POLYGON DATASETS ###
###############################################

spl_pt <- rbind(spl_pt, spl_r)

##########################################
### DEAL WITH MISSING DATA AND RESCALE ###
##########################################

# if any band is missing values set all to NA
spl[is.na(spl[[1]])] <- NA
spl[is.na(spl[[2]])] <- NA
spl[is.na(spl[[3]])] <- NA

# create function to rescale values from 0 to 100 using the 1st and 99th percentiles
scale_100 <- function(x) {
  # calculate 1st and 99th percentile of input raster
  perc <-
    values(x, mat = F) %>% quantile(., probs = c(0.01, 0.99), na.rm = T)
  
  # rescale raster using 1st and 99th %
  x <- (x - perc[1]) / (perc[2] - perc[1]) * 100
  
  #reset values below 0 and above 100
  x[x < 0] <- 0
  x[x > 100] <- 100
  
  return(x)
}

# rescale rasters from 0 to 100
spl[[1]] <- scale_100(spl[[1]])
spl[[2]] <- scale_100(spl[[2]])
spl[[3]] <- scale_100(spl[[3]])

# check if main dir exists and create
if (dir.exists(out_dir) == F) {
  dir.create(out_dir)
}

# check if temp dir exists and create
if (dir.exists(file.path(out_dir, 'temp')) == F) {
  dir.create(file.path(out_dir, 'temp'))
}

# write raster to tif
writeRaster(spl,
            filename = str_c(out_dir, '/temp/spl_stack.tif'),
            overwrite = T)

# write spl_pt
writeVector(spl_pt,
            str_c(out_dir, '/temp/water_roads_polygons.shp'),
            overwrite = T)

# SET PARAMETERS
rast_in <- str_c(out_dir, '/temp/spl_stack.tif')
out_p <- str_c(out_dir, '/temp')
name_out <- str_c(
  'grm_',
  thresh,
  '_',
  gsub(".", "", spec, fixed = TRUE),
  '_',
  gsub(".", "", spat, fixed = TRUE)
)

# create function to run generic region merging
grm_otb <-
  function(otb_path = "",
           raster_in = "",
           out_path = "",
           name = "",
           method = "bs",
           thresh = "",
           spec = "0.5",
           spat = "0.5") {
    # Set configuration
    conf <-
      paste(
        "-in",
        raster_in,
        "-out",
        paste(out_path, "/", name, ".tif", sep = ""),
        "-criterion",
        method,
        "-threshold",
        thresh,
        "-cw",
        spec,
        "-sw",
        spat
      )
    
    # apply function in command line
    system(paste(otb_path, "/otbcli_GenericRegionMerging", " ", conf, sep =
                   ""))
    
    # save configuration for further use
    write.table(
      x = conf,
      file = paste(out_path, "/", name, "_conf.txt", sep = ""),
      row.names = F,
      col.names = F
    )
  }

# run grm
grm_otb(
  otb_path = otb_dir,
  raster_in = rast_in,
  out_path = out_p,
  name = name_out,
  thresh = thresh,
  spec = spec,
  spat = spat
)

###########################
### MASK MISSING VALUES ###
###########################

# load grm raster
p <- rast(str_c(out_p, "/", name_out, ".tif"))

# load first band of segmentation input raster to use as a mask
msk <- rast(rast_in) %>% .[[1]]

# mask grm raster
p <- mask(p, msk)

# write grm raster
writeRaster(p, paste(out_p, "/", name_out, ".tif", sep = ""),
            overwrite = T)

# convert to vector based on cell value
vec <- as.polygons(p)

# create table of number of pixels in each polygon
num <- as.vector(values(p))
num_pix <- tabyl(num)

# drop na row
num_pix <- na.omit(num_pix)

# get pixel ids from vector
vec_dat <- tibble(id = values(vec)[, 1])
colnames(vec_dat) <- 'id'

# loop through values and add to vector data
vec_dat$nbPixels <- NA
for (i in 1:NROW(vec_dat)) {
  vec_dat$nbPixels[i] <- num_pix$n[num_pix$num == vec_dat$id[i]]
}

# remove current column of data and add id
# add nbPixels to vector
vec <- vec[, -1]
vec$id <- vec_dat$id
vec$nbPixels <- vec_dat$nbPixels

##################################
### ADD PRE-ALLOCATED POLYGONS ###
##################################

# load polygon dataset
p <- vec

# reproject segmented polygons to ensure same crs
p <- project(p, spl_pt)

# add non-FOR POLYTYPE polygons back in
p2 <- rbind(p, spl_pt)

#####################
### ADD LANDCOVER ###
#####################

# load VLCE 2.0 landcover dataset
lc <- rast(lc_f)

# project polygons to CRS of raster
p_lc <- project(p2, lc)

# crop raster
lc <- crop(lc, p_lc)

# convert to sf
p_lcsf <- st_as_sf(p_lc)

# extract landcover values
lc_vals <- exact_extract(lc, p_lcsf)

# set landcover class key
lc_key <- c(`0` = 'NA',
            `20` = 'Water',
            `31` = 'Snow/Ice',
            `32` = 'Rock/Rubble',
            `33` = 'Exposed/Barren Land',
            `40` = 'Bryoids',
            `50` = 'Shrubland',
            `80` = 'Wetland',
            `81` = 'Wetland-Treed',
            `100` = 'Herbs',
            `210` = 'Coniferous',
            `220` = 'Broadleaf',
            `230` = 'Mixed Wood')

# find dominant lc type in each polygon
# if there are multiple modes keep them
# apply over list
lc_mode <- sapply(lc_vals, function(x){
  x$value <- recode(x$value, !!!lc_key)
  x <- x %>% group_by(value) %>% summarize(sum = sum(coverage_fraction))
  m <- x$value[which(x$sum == max(x$sum))]
  # m <- get_mode2(x$value[x$coverage_fraction >= cov_frac])
  return(paste(m, collapse = " "))
})

# add to polygon dataset
p2$dom_lc <- lc_mode

# set landcover class key with single forested class
lc_key_for <- c(`0` = 'NA',
                `20` = 'Water',
                `31` = 'Snow/Ice',
                `32` = 'Rock/Rubble',
                `33` = 'Exposed/Barren Land',
                `40` = 'Bryoids',
                `50` = 'Shrubland',
                `80` = 'Wetland',
                `81` = 'Forest',
                `100` = 'Herbs',
                `210` = 'Forest',
                `220` = 'Forest',
                `230` = 'Forest')

# find pixels with forest at least 50% of pixel
# apply over list
lc_dom_for <- sapply(lc_vals, function(x){
  x$value <- recode(x$value, !!!lc_key_for)
  x <- x %>% group_by(value) %>% summarize(sum = sum(coverage_fraction))
  m <- x$value[which(x$sum == max(x$sum))]
  if((length(m) == 1) & (m == 'Forest')[1]){
    if(x$sum[x$value == m]/sum(x$sum) >= 0.5){
      return('Yes')
    }else{return('No')}
  }else{return('No')}
})

# add to polygon dataset
p2$dom_for <- lc_dom_for

##############################
### ADD AREA AND PERIMETER ###
##############################

# convert to sf
p2_sf <- st_as_sf(p2)

# calculate perimeter
p2$perim <- st_perimeter(p2_sf) %>% as.numeric

# calculate area
p2$area <- st_area(p2_sf) %>% as.numeric

# write to file
writeVector(p2, str_c(out_dir, "/", name_out, ".shp"),
            overwrite = T)

###########################################
### EXTRACT FINAL POLYGON SUMMARY STATS ###
###########################################

# create list of polygon files, names and parameters
file <- str_c(out_dir, "/", name_out, ".shp")
out_loc <- out_dir
grm_input <- str_c(out_dir, '/temp/spl_stack.tif')
name <- name_out

# create standard error function
se <- function(x)
  sd(x) / sqrt(length(x))

# load file
p <- vect(file)

# convert to sf
p_sf <- st_as_sf(p)

# subset polygons that are not the pre-allocated WAT and RDS polygons
p2_sf <- p[is.na(p$POLYTYPE)] %>% st_as_sf
p2 <- p[is.na(p$POLYTYPE)] %>% as.data.frame

# calculate perimeter to area ratio
p2$p_to_a <- p2$perim / p2$area
p2$p_to_a <- round(p2$p_to_a, 3)

# calculate msi
p2$msi <- p2$perim / sqrt(pi * p2$area)

# load original raster input file
ras <- rast(grm_input)

# rename bands
names(ras) <- c('p95', 'cc', 'cv')

# extract pixel values
pvals <- exact_extract(ras, p2_sf)

# calculate SSE
sse <- sapply(
  pvals,
  FUN = function(x) {
    p95_mean <- mean(x$p95, na.rm = T)
    cc_mean <- mean(x$cc, na.rm = T)
    cv_mean <- mean(x$cv, na.rm = T)
    
    return(c(sum((x$p95 - p95_mean) ^ 2, na.rm = T),
             sum((x$cc - cc_mean) ^ 2, na.rm = T),
             sum((x$cv - cv_mean) ^ 2, na.rm = T)))
  }
)

# transpose
sse <- t(sse)

# calculate final sums
sse <- colSums(sse)

# unlist values
pvals2 <- do.call(rbind, pvals)

# calculate global mean values
p95_mean <- mean(pvals2$p95, na.rm = T)
cc_mean <- mean(pvals2$cc, na.rm = T)
cv_mean <- mean(pvals2$cv, na.rm = T)

rm(pvals2)

# calculate SST
sst <- sapply(
  pvals,
  FUN = function(x) {
    return(c(sum((x$p95 - p95_mean) ^ 2, na.rm = T),
             sum((x$cc - cc_mean) ^ 2, na.rm = T),
             sum((x$cv - cv_mean) ^ 2, na.rm = T)))
  }
)

# transpose
sst <- t(sst)

# calculate final sums
sst <- colSums(sst)

# calculate r2 values
r2_p95 <- round(1 - sse[1] / sst[1], 3)
r2_cc <- round(1 - sse[2] / sst[2], 3)
r2_cv <- round(1 - sse[3] / sst[3], 3)
r2_all <- round((r2_p95 + r2_cc + r2_cv) / 3, 3)

# create dataframe with values wanted
df <- data.frame(
  alg = name,
  min_pix = (min(p2$nbPixels)),
  max_pix = (max(p2$nbPixels)),
  mean_pix = (mean(p2$nbPixels)),
  med_pix = (median(p2$nbPixels)),
  num_poly = NROW(p2),
  mean_area = mean(p2$area),
  se_area = se(p2$area),
  sd_area = sd(p2$area),
  mean_perim = mean(p2$perim),
  se_perim = se(p2$perim),
  sd_perim = sd(p2$perim),
  mean_p_a = mean(p2$p_to_a),
  se_p_a = se(p2$p_to_a),
  sd_p_a = sd(p2$p_to_a),
  mean_msi = mean(p2$msi),
  se_msi = se(p2$msi),
  sd_msi = sd(p2$msi),
  r2_p95 = r2_p95,
  r2_cc = r2_cc,
  r2_cv = r2_cv,
  r2_all = r2_all
)

# round numeric columns
df %<>%
  mutate_at(c(
    'min_pix',
    'max_pix',
    'mean_pix',
    'med_pix',
    'mean_area',
    'se_area',
    'sd_area',
    'mean_perim',
    'se_perim',
    'sd_perim'
  ),
  function(x)
    round(x, 2)) %>%
  mutate_at(c('mean_p_a',
              'se_p_a',
              'sd_p_a',
              'mean_msi',
              'se_msi',
              'sd_msi'),
            function(x)
              round(x, 4))

#####################
### ADD FRI STATS ###
#####################

# load interpreter derived polygons to extract statistics
pfri <- vect(fri)

# convert to sf
pfri_sf <- st_as_sf(pfri)

# calculate perimeter
pfri$perim <- st_perimeter(pfri_sf) %>% as.numeric

# calculate area
pfri$area <- st_area(pfri_sf) %>% as.numeric

# calculate nbPixels (each 20 m pixel covers 400 m2)
pfri$nbPixels <- pfri$area / 400

# calculate perimeter to area ratio
pfri$p_to_a <- pfri$perim / pfri$area
pfri$p_to_a <- round(pfri$p_to_a, 3)

# subset all non water/ucl polygons
p2fri_sf <- pfri[!(pfri$POLYTYPE %in% c('WAT', 'UCL'))] %>% st_as_sf
p2fri <- pfri[!(pfri$POLYTYPE %in% c('WAT', 'UCL'))] %>% as.data.frame

# calculate msi
p2fri$msi <- p2fri$perim / sqrt(pi * p2fri$area)

# load original raster input file
ras <- rast(grm_input)

# rename bands
names(ras) <- c('p95', 'cc', 'cv')

# extract pixel values
pvals <- exact_extract(ras, p2fri_sf)

# calculate SSE
sse <- sapply(
  pvals,
  FUN = function(x) {
    # subset values based on coverage fraction
    x %<>% filter(coverage_fraction >= 0.5)
    
    p95_mean <- mean(x$p95, na.rm = T)
    cc_mean <- mean(x$cc, na.rm = T)
    cv_mean <- mean(x$cv, na.rm = T)
    
    return(c(sum((x$p95 - p95_mean) ^ 2, na.rm = T),
             sum((x$cc - cc_mean) ^ 2, na.rm = T),
             sum((x$cv - cv_mean) ^ 2, na.rm = T)))
  }
)

# transpose
sse <- t(sse)

# calculate final sums
sse <- colSums(sse)

# unlist values
pvals2 <- do.call(rbind, pvals)

# subset values based on coverage fraction
pvals2 %<>% filter(coverage_fraction >= 0.5)

# calculate global mean values
p95_mean <- mean(pvals2$p95, na.rm = T)
cc_mean <- mean(pvals2$cc, na.rm = T)
cv_mean <- mean(pvals2$cv, na.rm = T)

rm(pvals2)

# calculate SST
sst <- sapply(
  pvals,
  FUN = function(x) {
    # subset values based on coverage fraction
    x %<>% filter(coverage_fraction >= 0.5)
    
    return(c(sum((x$p95 - p95_mean) ^ 2, na.rm = T),
             sum((x$cc - cc_mean) ^ 2, na.rm = T),
             sum((x$cv - cv_mean) ^ 2, na.rm = T)))
  }
)

# transpose
sst <- t(sst)

# calculate final sums
sst <- colSums(sst)

# calculate r2 values
r2_p95 <- round(1 - sse[1] / sst[1], 3)
r2_cc <- round(1 - sse[2] / sst[2], 3)
r2_cv <- round(1 - sse[3] / sst[3], 3)
r2_all <- round((r2_p95 + r2_cc + r2_cv) / 3, 3)

# create dataframe with values wanted
ms_df <- data.frame(
  alg = 'FRI',
  min_pix = (min(p2fri$area / 400)),
  max_pix = (max(p2fri$area / 400)),
  mean_pix = (mean(p2fri$area / 400)),
  med_pix = (median(p2fri$area / 400)),
  num_poly = NROW(p2fri),
  mean_area = mean(p2fri$area),
  se_area = se(p2fri$area),
  sd_area = sd(p2fri$area),
  mean_perim = mean(p2fri$perim),
  se_perim = se(p2fri$perim),
  sd_perim = sd(p2fri$perim),
  mean_p_a = mean(p2fri$p_to_a),
  se_p_a = se(p2fri$p_to_a),
  sd_p_a = sd(p2fri$p_to_a),
  mean_msi = mean(p2fri$msi),
  se_msi = se(p2fri$msi),
  sd_msi = sd(p2fri$msi),
  r2_p95 = r2_p95,
  r2_cc = r2_cc,
  r2_cv = r2_cv,
  r2_all = r2_all
)

# round numeric columns
ms_df %<>%
  mutate_at(c('min_pix',
              'max_pix',
              'mean_pix',
              'med_pix'),
            function(x)
              round(x)) %>%
  mutate_at(c(
    'mean_area',
    'se_area',
    'sd_area',
    'mean_perim',
    'se_perim',
    'sd_perim'
  ),
  function(x)
    round(x, 2)) %>%
  mutate_at(c('mean_p_a',
              'se_p_a',
              'sd_p_a',
              'mean_msi',
              'se_msi',
              'sd_msi'),
            function(x)
              round(x, 4))

# bind df
df <- rbind(df, ms_df)

# write df as csv
write.csv(df,
          file = str_c(out_loc, '/summary_stats.csv'),
          row.names = F)

6.1.2 Results: Summary Statistics

Polygon size statistics:

###########################################
### Summary Stats and Additional Tables ###
###########################################

# Size
t1 <- as_tibble(df[, 1:6])

# change to ha (25 pixels per ha)
t1[, 2:5] <- round(t1[, 2:5] / 25, 2)

names(t1) <- c('Dataset', "Min Ha", 'Max Ha', "Mean Ha",
               "Median Ha", "Number of Polygons")
knitr::kable(t1, caption = "Table 1: Polygon Size Stats", label = NA)
Table 1: Polygon Size Stats
Dataset       Min Ha  Max Ha  Mean Ha  Median Ha  Number of Polygons
grm_10_01_05  0.04    60.00   4.71     3.40       223321
FRI           0.00    821.08  5.24     1.16       207121

The FSF is much larger than the RMF, with over twice as many polygons. The mean polygon size is around 5 Ha for both the GRM polygons and the FRI, but the median polygon size is much smaller in the FRI. This tells us that the FSF FRI contains more small polygons, but also a number of very large polygons, compared to the GRM segmentation.
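To see this skew directly, we can compare polygon size quantiles in hectares. This is a quick check reusing the p2 (GRM) and p2fri (FRI) data frames built in the summary statistics code above:

# 10th, 50th, and 90th percentile of polygon size (Ha)
quantile(p2$nbPixels / 25, probs = c(0.1, 0.5, 0.9))
quantile(p2fri$nbPixels / 25, probs = c(0.1, 0.5, 0.9))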

Area and perimeter:

# Area and Perim
t2 <- as_tibble(df[, c(1, 7:12)])

# change area to Ha
t2[,2:4] <- round(t2[,2:4] / 10000, 2)

names(t2) <- c("Dataset", "Mean Area (Ha)", "SE Area", "SD Area",
               "Mean Perimeter (m)", "SE Perimeter", "SD Perimeter")
knitr::kable(t2, caption = "Table 2: Polygon Area and Perimeter Stats", label = NA)
Table 2: Polygon Area and Perimeter Stats
Dataset       Mean Area (Ha)  SE Area  SD Area  Mean Perimeter (m)  SE Perimeter  SD Perimeter
grm_10_01_05  4.71            0.01     4.27     1155.00             1.09          517.22
FRI           5.25            0.02     10.49    1257.18             3.78          1719.51

GRM mean area and perimeter are smaller than those of the FRI, and show less variability.

Shape index:

# Mean Shape Index
t3 <- as_tibble(df[, c(1, 16:18)])
names(t3) <- c("Dataset", "Mean Shape Index", "SE Shape Index", "SD Shape Index")
knitr::kable(t3, caption = "Table 3: Polygon Shape Index Stats", label = NA)
Table 3: Polygon Shape Index Stats
Dataset       Mean Shape Index  SE Shape Index  SD Shape Index
grm_10_01_05  3.3014            0.0015          0.6915
FRI           4.1344            0.0035          1.6105

GRM polygons are more compact: their mean shape index is lower, indicating shapes closer to circular.

R2:

# R2
t4 <- as_tibble(df[,c(1, 19:22)])

# round
t4[,2:5] <- round(t4[,2:5], 2)

names(t4) <- c('Dataset', 'R2 P95', 'R2 Can Cov', 'R2 Coeff Var', 'R2 Overall')
knitr::kable(t4, caption = "Table 4: Polygon R2 Stats", label = NA)
Table 4: Polygon R2 Stats
Dataset       R2 P95  R2 Can Cov  R2 Coeff Var  R2 Overall
grm_10_01_05  0.88    0.90        0.88          0.88
FRI           0.82    0.82        0.77          0.80

GRM polygons have an overall R2 value of 0.88 compared to 0.80 from the FRI.
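For reference, these values follow the code above: for each ALS metric, R2 = 1 - SSE/SST, where SSE sums squared deviations of pixels from their polygon means and SST sums squared deviations from the global mean. The overall value averages the three metric-level R2 values, so a higher R2 means the polygons capture more of the pixel-level variance in the ALS metrics.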

6.1.3 Results: Distribution Plots

Polygon size:

# plot density GRM in ha
g1 <- ggplot(data.frame(nbPixels = p2$nbPixels / 25), aes(x = nbPixels)) +
  geom_density(fill = 'grey') +
  xlim(c(0, 500/25)) +
  ylim(c(0, 0.60)) +
  geom_vline(aes(xintercept = median(nbPixels)),
             linetype = "dashed",
             linewidth = 0.6) +
  theme_bw() +
  xlab('Number of Hectares') +
  ylab('Density') +
  ggtitle('GRM Distribution of Polygon Size') +
  theme(text = element_text(size = 25))

# plot density FRI in ha
g2 <- ggplot(data.frame(nbPixels = p2fri$nbPixels / 25), aes(x = nbPixels)) +
  geom_density(fill = 'grey') +
  xlim(c(0, 500/25)) +
  ylim(c(0, 0.60)) +
  geom_vline(aes(xintercept = median(nbPixels)),
             linetype = "dashed",
             linewidth = 0.6) +
  theme_bw() +
  xlab('Number of Hectares') +
  ylab('Density') +
  ggtitle('FRI Distribution of Polygon Size') +
  theme(text = element_text(size = 25))

# plot together
grid.arrange(g1, g2)

The density plots of polygon size give us a good indication of how the datasets differ. GRM polygons have a much higher density in the ~2-6 Ha range, whereas the FRI has a large number of very small polygons and a heavier tail of ~10-20 Ha polygons.

Shape index:

# plot shape index GRM
g1 <- ggplot(data.frame(msi = as.numeric(p2$msi)), aes(x = msi)) +
  geom_density(fill = 'grey') +
  xlim(c(0, 8)) +
  ylim(c(0, 1.5)) +
  geom_vline(aes(xintercept = median(msi)),
             linetype = "dashed",
             linewidth = 0.6) +
  theme_bw() +
  xlab('Shape Index') +
  ylab('Density') +
  ggtitle('GRM Distribution of Shape Index') +
  theme(text = element_text(size = 25))

# plot shape index FRI
g2 <- ggplot(data.frame(msi = as.numeric(p2fri$msi)), aes(x = msi)) +
  geom_density(fill = 'grey') +
  xlim(c(0, 8)) +
  ylim(c(0, 1.5)) +
  geom_vline(aes(xintercept = median(msi)),
             linetype = "dashed",
             linewidth = 0.6) +
  theme_bw() +
  xlab('Shape Index') +
  ylab('Density') +
  ggtitle('FRI Distribution of Shape Index') +
  theme(text = element_text(size = 25))

# plot together
grid.arrange(g1, g2)

We can see that the distribution of polygon shape is more compact among GRM polygons. Remember that we are using the same GRM algorithm parameters as were used in segmenting the RMF. These parameters can be adjusted to meet the needs of forested areas with different characteristics, as sketched below.
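If coarser or finer polygons are desired, the grm_otb wrapper defined above can simply be re-run with different parameters. The values below are illustrative only, not tuned recommendations; a higher threshold allows more merging and therefore produces larger polygons:

# illustrative re-run with a higher merging threshold and equal
# spectral/spatial weights; the output name encodes the parameters
grm_otb(
  otb_path = otb_dir,
  raster_in = rast_in,
  out_path = out_p,
  name = 'grm_20_05_05',
  thresh = '20',
  spec = '0.5',
  spat = '0.5'
)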

6.2 Imputation Over FRI

6.2.1 Run Algorithm

##################################
### INSTALL PACKAGES IF NEEDED ###
##################################

# install.packages(c('terra',
#                    'tidyverse',
#                    'exactextractr',
#                    'sf',
#                    'magrittr',
#                    'gridExtra',
#                    'RANN',
#                    'reshape2',
#                    'viridis',
#                    'scales',
#                    'janitor',
#                    'kableExtra',
#                    'knitr'))

#####################
### LOAD PACKAGES ###
#####################

# load packages
library(terra)
library(tidyverse)
library(exactextractr)
library(sf)
library(magrittr)
library(gridExtra)
library(RANN)
library(reshape2)
library(viridis)
library(scales)
library(janitor)
library(kableExtra)
library(knitr)

####################################
### SET CODE AND FILE PARAMETERS ###
####################################

# set file names for ALS input variables

# first set the imputation X-variables
lidar_imp <- c('agb' = 'D:/ontario_inventory/FSF/EFI/Biomass.img',
               'gmvwl' = 'D:/ontario_inventory/FSF/EFI/GMV_WL.img',
               'pcum8' = 'D:/ontario_inventory/FSF/ALS/zpcum8.img',
               'ba' = 'D:/ontario_inventory/FSF/EFI/BasalArea.img',
               'b6' = 'D:/ontario_inventory/FSF/sentinel/boa/b6_fsf_2018.tif')

# next set the data screening variables
lidar_scr <- c('p95' = 'D:/ontario_inventory/FSF/ALS/zq95.img',
               'cc' = 'D:/ontario_inventory/FSF/ALS/cov_2m.img')

# set file location of FRI polygons shape file
fri <- 'D:/ontario_inventory/FSF/FRI/FSF_opi_polygon_CSRS_NAD83_17.shp'

# set file location of GRM polygons shape file
grm <- 'C:/Users/bermane/Desktop/FSF/grm_10_01_05.shp'

# set output folder for files generated
# make sure no "/" at end of folder location!
out_dir <- 'C:/Users/bermane/Desktop/FSF'

# set file location of 2018 VLCE 2.0 landcover data
# the landcover year should match the ALS acquisition year (2018 here)
# download here:
# https://opendata.nfis.org/mapserver/nfis-change_eng.html
lc_f <- 'D:/ontario_inventory/VLCE/CA_forest_VLCE2_2018.tif'

###########################################
### EXTRACT VARIABLES INTO FRI POLYGONS ###
###########################################

# load FRI polygons
poly <- vect(fri)

# convert to df
dat_fri <- as.data.frame(poly)

# cbind centroids to dat
dat_fri <- cbind(dat_fri, centroids(poly) %>% crds)

# combine all LiDAR and aux variables to extract
lidar_vars <- c(lidar_imp, lidar_scr)

# loop through LiDAR attributes to extract values
for (i in seq_along(lidar_vars)) {
  # load LiDAR raster
  lidar_ras <- rast(lidar_vars[i])
  
  # project poly to crs of raster
  poly_ras <- project(poly, lidar_ras)
  
  # convert to sf
  poly_ras <- st_as_sf(poly_ras)
  
  #extract median values
  vec <-
    exact_extract(lidar_ras, poly_ras, 'median')
  
  # aggregate into data frame
  if(i == 1){
    vec_df <- as.data.frame(vec)
  } else{
    vec_df <- cbind(vec_df, as.data.frame(vec))
  }
}

# change column names of extracted attribute data frame
colnames(vec_df) <- names(lidar_vars)

# add LiDAR attributes to FRI polygon data frame
dat_fri <- cbind(dat_fri, vec_df)

# add 2018 age values
dat_fri$AGE2018 <- 2018 - dat_fri$YRORG

# check if main dir exists and create
if (dir.exists(out_dir) == F) {
  dir.create(out_dir)
}

# check if temp dir exists and create
if (dir.exists(file.path(out_dir, 'temp')) == F) {
  dir.create(file.path(out_dir, 'temp'))
}

# save extracted dataframe for fast rebooting
save(dat_fri, file = str_c(out_dir, '/temp/dat_fri_extr.RData'))

# load GRM segmented polygons
poly <- vect(grm)

# reproject to match FRI polygons
poly <- project(poly, vect(fri))

# convert to df
dat_grm <- as.data.frame(poly)

# cbind centroids to dat
dat_grm <- cbind(dat_grm, centroids(poly) %>% crds)

# loop through LiDAR attributes to extract values
for (i in seq_along(lidar_vars)) {
  # load LiDAR raster
  lidar_ras <- rast(lidar_vars[i])
  
  # project poly to crs of raster
  poly_ras <- project(poly, lidar_ras)
  
  # convert to sf
  poly_ras <- st_as_sf(poly_ras)
  
  #extract median values
  vec <-
    exact_extract(lidar_ras, poly_ras, 'median')
  
  # aggregate into data frame
  if(i == 1){
    vec_df <- as.data.frame(vec)
  } else{
    vec_df <- cbind(vec_df, as.data.frame(vec))
  }
}

# change column names of extracted attribute data frame
colnames(vec_df) <- names(lidar_vars)

# add LiDAR attributes to FRI polygon data frame
dat_grm <- cbind(dat_grm, vec_df)

# save extracted dataframe for fast rebooting
save(dat_grm, file = str_c(out_dir, '/temp/dat_grm_extr.RData'))

############################################
### FUNCTIONS FOR SPECIES CLASSIFICATION ###
############################################

# assign species common name -- note this list was updated with all species in the FSF FRI.
# It may need to be adjusted for other areas (a coverage check is sketched after these functions)

assign_common_name <- function(sp_abbrev) {
  sp_abbrev  <- toupper(sp_abbrev)
  
  dict <- data.frame(SB = "black spruce", 
                     LA = "eastern larch", 
                     BW = "white birch", 
                     BF = "balsam fir", 
                     CE = "cedar", 
                     SW = "white spruce", 
                     PT = "trembling aspen", 
                     PJ = "jack pine", 
                     PO = "poplar", 
                     PB = "balsam poplar", 
                     PR = "red pine", 
                     PW = "white pine", 
                     SX = "spruce", 
                     MR = "red maple", 
                     AB = "black ash", 
                     BY = "yellow birch",
                     OR = 'red oak',
                     CW = 'eastern white cedar',
                     MH = 'hard maple',
                     HE = 'eastern hemlock',
                     BD = 'basswood',
                     CB = 'black cherry',
                     BE = 'american beech',
                     AW = 'white ash',
                     PL = 'largetooth aspen',
                     AG = 'red ash',
                     OW = 'white oak',
                     IW = 'ironwood',
                     OB = 'bur oak',
                     EW = 'white elm',
                     MS = 'silver maple',
                     PS = 'scots pine',
                     OH = 'other hardwoods',
                     BG = 'grey birch',
                     AL = 'alder',
                     SR = 'red spruce',
                     BB = 'blue beech',
                     MT = 'mountain maple',
                     MB = 'black maple',
                     OC = 'other conifers',
                     SN = 'norway spruce',
                     PE = 'silver poplar',
                     HI = 'hickory',
                     AX = 'ash') %>%
    pivot_longer(everything(), names_to = "abb", values_to = "common")
  
  dict$common[match(sp_abbrev, dict$abb)]
  
}

# assign either coniferous or deciduous
assign_type <- function(sp_common) {
  sp_common  <- tolower(sp_common)
  ifelse(stringr::str_detect(sp_common, pattern = "pine|spruce|fir|cedar|larch|conifers|hemlock"), "Coniferous", "Deciduous")
}
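Before running the classification, we can confirm the dictionary covers every species code present in the FRI. A minimal sketch, assuming the species composition field is named OSPCOMP as in the FSF FRI:

# extract all two-letter species codes from the composition strings
fri_sp <- st_read(fri) %>%
  st_drop_geometry() %>%
  filter(POLYTYPE == 'FOR') %>%
  pull(OSPCOMP) %>%
  str_match_all('[A-Z]{2}') %>%
  unlist() %>%
  unique()

# any codes printed here are missing from the dictionary above
fri_sp[is.na(assign_common_name(fri_sp))]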

####################################
### CALCULATE SPECIES ATTRIBUTES ###
####################################

# load fri
poly_fri <- st_read(fri)

# separate SPCOMP string into individual columns
poly_fri_for <- poly_fri %>%
  st_drop_geometry() %>%
  filter(POLYTYPE == "FOR") %>%
  select(OPI_ID, POLYTYPE, OSPCOMP) %>% # these need to match FRI attr fields
  mutate(new_SP = str_match_all(OSPCOMP, "[A-Z]{2}[ ]+[0-9]+")) %>%
  unnest(new_SP) %>%
  mutate(new_SP = as.character(new_SP)) %>%
  separate(new_SP, into = c("SP", "PROP")) %>%
  mutate(PROP = as.numeric(PROP),
         Common = assign_common_name(SP),
         sp_type = assign_type(Common))

# calculate polygon level species groups
# percent species type
poly_dom_type <- poly_fri_for %>%
  group_by(OPI_ID) %>%
  summarize(per_conif = sum(PROP[sp_type == "Coniferous"]), 
            per_decid = sum(PROP[sp_type == "Deciduous"]))

# leading species
poly_dom_sp <- poly_fri_for %>%
  group_by(OPI_ID) %>%
  slice_max(PROP, n = 1, with_ties = FALSE)

# combine type with leading species
poly_dom_sp_group <- inner_join(poly_dom_type, poly_dom_sp, by = "OPI_ID")

# calculate functional groups
poly_dom_sp_group <- poly_dom_sp_group %>%
  mutate(SpeciesGroup1 = ifelse(PROP >= 70, Common, 
                                ifelse(PROP < 70 & per_conif >= 70, "Mixed Coniferous",
                                       ifelse(PROP < 70 & per_decid >= 70, "Mixed Deciduous", "Mixedwoods"))), 
         SpeciesGroup2 = ifelse(Common == "jack pine" & PROP >= 50 & per_conif >= 70, "Jack Pine Dominated", ifelse(
           Common == "black spruce" & PROP >= 50 & per_conif >= 70, "Black Spruce Dominated", ifelse(
             per_decid >= 70, "Hardwood", ifelse(
               per_decid >= 30 & per_decid <= 70 & per_conif >= 30 & per_conif <= 70, "Mixedwood", "Mixed Conifers"
             )))), 
         SpeciesGroup3 = ifelse(per_conif >= 70, "Softwood", 
                                ifelse(per_decid >= 70, "Hardwood", "Mixedwood"))) %>%
  mutate(across(.cols = starts_with("SpeciesGroup"), .fns = as.factor)) %>%
  select(OPI_ID, SpeciesGroup2, SpeciesGroup3) %>%
  rename(class5 = SpeciesGroup2, class3 = SpeciesGroup3)

# calculate leading and second species only
poly_fri_sp <- poly_fri %>%
  st_drop_geometry() %>%
  filter(POLYTYPE == "FOR") %>%
  select(OPI_ID, POLYTYPE, OSPCOMP) %>% # these need to match FRI attr fields
  mutate(new_SP = str_match_all(OSPCOMP, "[A-Z]{2}[ ]+[0-9]+")) %>%
  unnest_wider(new_SP, names_sep = '') %>%
  rename(new_SP = new_SP1) %>%
  mutate(new_SP = as.character(new_SP),
         new_SP2 = as.character(new_SP2)) %>%
  separate(new_SP, into = c("SP", "PROP")) %>%
  separate(new_SP2, into = c("SP2", "PROP2")) %>%
  select(OPI_ID, SP, SP2)

# join functional groups with leading and second species
poly_dom_sp_group <- left_join(poly_dom_sp_group,
                               poly_fri_sp,
                               by = 'OPI_ID') %>%
  rename(SP1 = SP)

# join to FRI extracted dataframe
dat_fri <- left_join(dat_fri,
                     poly_dom_sp_group,
                     by = 'OPI_ID')

# re-save extracted dataframe for fast rebooting
save(dat_fri, file = str_c(out_dir, '/temp/dat_fri_extr.RData'))

#####################
### LOAD FRI DATA ###
#####################

# load FRI polygons
poly <- vect(fri)

# load FRI polygon data frame
load(str_c(out_dir, '/temp/dat_fri_extr.RData'))

#############################
### DATA SCREENING PART 1 ###
#############################

# remove all non-forested polygons
dat_fri <- filter(dat_fri, POLYTYPE == 'FOR')

# create smaller polygon set only FOR polytypes
poly_fri <- poly[poly$POLYTYPE == 'FOR']

#############################
### DATA SCREENING PART 2 ###
#############################

# polygon landcover > 50% forested

# load VLCE 2.0 landcover dataset from 2018
lc <- rast(lc_f)

# project poly to crs of raster
poly_lc <- project(poly_fri, lc)

# convert to sf
poly_lcsf <- st_as_sf(poly_lc)

# extract landcover values
lc_poly <- exact_extract(lc, poly_lcsf)

# set landcover class key with single forested class
lc_key_for <- c(`0` = 'NA',
                `20` = 'Water',
                `31` = 'Snow/Ice',
                `32` = 'Rock/Rubble',
                `33` = 'Exposed/Barren Land',
                `40` = 'Bryoids',
                `50` = 'Shrubland',
                `80` = 'Wetland',
                `81` = 'Forest',
                `100` = 'Herbs',
                `210` = 'Forest',
                `220` = 'Forest',
                `230` = 'Forest')

# find pixels with forest at least 50% of pixel
# apply over list
lc_dom_for <- sapply(lc_poly, function(x){
  x$value <- recode(x$value, !!!lc_key_for)
  x <- x %>% group_by(value) %>% summarize(sum = sum(coverage_fraction))
  m <- x$value[which(x$sum == max(x$sum))]
  if((length(m) == 1) & (m == 'Forest')[1]){
    if(x$sum[x$value == m]/sum(x$sum) >= 0.5){
      return('Yes')
    }else{return('No')}
  }else{return('No')}
})

# add new columns into dat
dat_fri <- dat_fri %>% add_column(dom_for = lc_dom_for)

# subset FRI data frame based on whether polygon dominated by forest
dat_fri_scr <- dat_fri %>% filter(dom_for == 'Yes')

#############################
### DATA SCREENING PART 3 ###
#############################

# require p95 >= 5
# require cc >= 50
dat_fri_scr %<>% filter(p95 >= 5, cc >= 50)

# save extracted data frame for fast rebooting
save(dat_fri_scr, file = str_c(out_dir, '/temp/dat_fri_scr.RData'))

#####################
### LOAD GRM DATA ###
#####################

# load GRM polygon data frame
load(str_c(out_dir, '/temp/dat_grm_extr.RData'))

######################
### DATA SCREENING ###
######################

# Don't screen GRM polygons for POLYTYPE == FOR

# polygon landcover > 50% forested
# dom_for attribute already exists in GRM data from segmentation

# require p95 >= 5
# require cc >= 50

dat_grm_scr <- dat_grm %>% filter(dom_for == 'Yes',
                                  p95 >= 5,
                                  cc >= 50)

# save extracted data frame for fast rebooting
save(dat_grm_scr, file = str_c(out_dir, '/temp/dat_grm_scr.RData'))

######################################################
### FUNCTIONS TO RUN K NEAREST NEIGHBOR IMPUTATION ###
######################################################

# create mode function
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

# create rmsd function
rmsd <- function(obs, est){
  sqrt(mean((est - obs) ^ 2))
}

# create rrmsd function
rrmsd <- function(obs, est){
  sqrt(mean((est - obs) ^ 2)) / mean(obs) * 100
}

# create mae function
mae <- function(obs, est){
  mean(abs(est - obs))
}

# create mbe function
mbe <- function(obs, est){
  mean(est - obs)
}

# create rmbe function
rmbe <- function(obs, est){
  mean(est - obs) / mean(obs) * 100
}

# create knn function to output performance results
run_knn_fri <- function(dat, vars, k) {
  
  # subset data
  dat_nn <- dat %>% select(all_of(vars))
  
  # scale for nn computation
  dat_nn_scaled <- dat_nn %>% scale
  
  # run nearest neighbor
  nn <- nn2(dat_nn_scaled, dat_nn_scaled, k = k + 1)
  
  # get nn indices
  nni <- nn[[1]][, 2:(k + 1)]
  
  # add vars to tibble
  # take mean/mode if k > 1
  if(k > 1){
    for(i in seq_along(vars)){
      if(i == 1){
        nn_tab <- tibble(!!vars[i] := dat_nn[,i],
                         !!str_c(vars[i], '_nn') := apply(nni, MARGIN = 1, FUN = function(x){
                           mean(dat_nn[x, i])
                         }))
      }else{
        nn_tab %<>% mutate(!!vars[i] := dat_nn[,i],
                           !!str_c(vars[i], '_nn') := apply(nni, MARGIN = 1, FUN = function(x){
                             mean(dat_nn[x, i])
                           }))
      }
    }
    
    # add target vars to tibble
    nn_tab %<>% mutate(age = dat$AGE2018,
                       sp1 = dat$SP1,
                       sp2 = dat$SP2,
                       class5 = dat$class5,
                       class3 = dat$class3,
                       age_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         mean(dat$AGE2018[x])
                       }),
                       sp1_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$SP1[x])
                       }),
                       sp2_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$SP2[x])
                       }),
                       class5_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$class5[x])
                       }),
                       class3_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$class3[x])
                       }))
  }
  
  # take direct nn if k == 1
  if(k == 1){
    for(i in seq_along(vars)){
      if(i == 1){
        nn_tab <- tibble(!!vars[i] := dat_nn[,i],
                         !!str_c(vars[i], '_nn') := dat_nn[nn[[1]][,2],i])
      }else{
        nn_tab %<>% mutate(!!vars[i] := dat_nn[,i],
                           !!str_c(vars[i], '_nn') := dat_nn[nn[[1]][,2],i])
      }
    }
    
    # add target vars to tibble
    nn_tab %<>% mutate(age = dat$AGE2018,
                       sp1 = dat$SP1,
                       sp2 = dat$SP2,
                       class5 = dat$class5,
                       class3 = dat$class3,
                       age_nn = dat$AGE2018[nn[[1]][,2]],
                       sp1_nn = dat$SP1[nn[[1]][,2]],
                       sp2_nn = dat$SP2[nn[[1]][,2]],
                       class5_nn = dat$class5[nn[[1]][,2]],
                       class3_nn = dat$class3[nn[[1]][,2]])
  }
  
  
  # calculate fit metrics for vars
  for(i in seq_along(vars)){
    if(i == 1){
      perform_df <- tibble(variable = vars[i],
                           metric = c('rrmsd (%)', 'rmbe (%)'),
                           value = c(rrmsd(pull(nn_tab, vars[i]),
                                  pull(nn_tab, str_c(vars[i], '_nn'))),
                                  rmbe(pull(nn_tab, vars[i]),
                                  pull(nn_tab, str_c(vars[i], '_nn')))))
    }else{
      perform_df %<>% add_row(variable = vars[i],
                           metric = c('rrmsd (%)', 'rmbe (%)'),
                           value = c(rrmsd(pull(nn_tab, vars[i]),
                                  pull(nn_tab, str_c(vars[i], '_nn'))),
                                  rmbe(pull(nn_tab, vars[i]),
                                  pull(nn_tab, str_c(vars[i], '_nn')))))
    }
  }
  
  # calculate metrics for age
  perform_df %<>% add_row(variable = 'age',
                          metric = c('rmsd (yrs)', 'mbe (yrs)', 'mae (yrs)'),
                          value = c(rmsd(nn_tab$age, nn_tab$age_nn),
                                    mbe(nn_tab$age, nn_tab$age_nn),
                                    mae(nn_tab$age, nn_tab$age_nn)))
  
  # calculate SP1 accuracy
  # create df of SP1
  sp1 <- data.frame(obs = nn_tab$sp1,
                    est = nn_tab$sp1_nn)
  
  # create column of match or not
  sp1$match <- sp1$obs == sp1$est
  
  # add total percent of matching SP1 to perform_df
  perform_df %<>% add_row(variable = 'leading species',
                          metric = 'accuracy (%)',
                          value = NROW(sp1[sp1$match == T,]) /
                                   NROW(sp1) * 100)
  
  # calculate SP2 accuracy
  # create df of SP2
  sp2 <- data.frame(obs = nn_tab$sp2,
                    est = nn_tab$sp2_nn)
  
  # create column of match or not
  sp2$match <- sp2$obs == sp2$est
  
  # add total percent of matching SP2 to perform_df
  perform_df %<>% add_row(variable = 'second species', 
                        metric = 'accuracy (%)',
                        value = NROW(sp2[sp2$match == T,]) /
                                   NROW(sp2) * 100)
  
  # calculate  class3 accuracy
  # create df of class3
  class3 <- data.frame(obs = nn_tab$class3,
                       est = nn_tab$class3_nn)
  
  # create column of match or not
  class3$match <- class3$obs == class3$est
  
  # add total percent of matching class3 to perform_df
  perform_df %<>% add_row(variable = 'three func group class',
                          metric = 'accuracy (%)',
                          value = NROW(class3[class3$match == T,]) /
                                   NROW(class3) * 100)
  
  # calculate class5 accuracy
  # create df of class5
  class5 <- data.frame(obs = nn_tab$class5,
                       est = nn_tab$class5_nn)
  
  # create column of match or not
  class5$match <- class5$obs == class5$est
  
  # add total percent of matching class5 to perform_df
  perform_df %<>% add_row(variable = 'five func group class',
                        metric = 'accuracy (%)',
                        value = NROW(class5[class5$match == T,]) /
                                   NROW(class5) * 100)
  
  # return df
  return(perform_df)
}

# create knn function to output imputed vs. observed table
run_knn_fri_table <- function(dat, vars, k) {
  
  # subset data
  dat_nn <- dat %>% select(all_of(vars))
  
  # scale for nn computation
  dat_nn_scaled <- dat_nn %>% scale
  
  # run nearest neighbor
  nn <- nn2(dat_nn_scaled, dat_nn_scaled, k = k + 1)
  
  # get nn indices
  nni <- nn[[1]][, 2:(k + 1)]
  
  # add vars to tibble
  # take mean/mode if k > 1
  if(k > 1){
    for(i in seq_along(vars)){
      if(i == 1){
        nn_tab <- tibble(!!vars[i] := dat_nn[,i],
                         !!str_c(vars[i], '_nn') := apply(nni, MARGIN = 1, FUN = function(x){
                           mean(dat_nn[x, i])
                         }))
      }else{
        nn_tab %<>% mutate(!!vars[i] := dat_nn[,i],
                           !!str_c(vars[i], '_nn') := apply(nni, MARGIN = 1, FUN = function(x){
                             mean(dat_nn[x, i])
                           }))
      }
    }
    
    # add target vars to tibble
    nn_tab %<>% mutate(age = dat$AGE2018,
                       sp1 = dat$SP1,
                       sp2 = dat$SP2,
                       class5 = dat$class5,
                       class3 = dat$class3,
                       age_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         mean(dat$AGE2018[x])
                       }),
                       sp1_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$SP1[x])
                       }),
                       sp2_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$SP2[x])
                       }),
                       class5_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$class5[x])
                       }),
                       class3_nn = apply(nni, MARGIN = 1, FUN = function(x){
                         getmode(dat$class3[x])
                       }))
  }
  
  # take direct nn if k == 1
  if(k == 1){
    for(i in seq_along(vars)){
      if(i == 1){
        nn_tab <- tibble(!!vars[i] := dat_nn[,i],
                         !!str_c(vars[i], '_nn') := dat_nn[nn[[1]][,2],i])
      }else{
        nn_tab %<>% mutate(!!vars[i] := dat_nn[,i],
                           !!str_c(vars[i], '_nn') := dat_nn[nn[[1]][,2],i])
      }
    }
    
    # add target vars to tibble
    nn_tab %<>% mutate(age = dat$AGE2018,
                       sp1 = dat$SP1,
                       sp2 = dat$SP2,
                       class5 = dat$class5,
                       class3 = dat$class3,
                       age_nn = dat$AGE2018[nn[[1]][,2]],
                       sp1_nn = dat$SP1[nn[[1]][,2]],
                       sp2_nn = dat$SP2[nn[[1]][,2]],
                       class5_nn = dat$class5[nn[[1]][,2]],
                       class3_nn = dat$class3[nn[[1]][,2]])
  }
  
  # return nn table
  return(nn_tab)
}

########################################################
### RUN KNN IMPUTATION USING OPTIMAL MODEL VARIABLES ###
########################################################

# load FRI screened polygons
load(str_c(out_dir, '/temp/dat_fri_scr.RData'))

# subset only the attributes we need from the screened FRI polygons
dat_fri_scr %<>% select(OPI_ID, AGE2018, agb,
                        gmvwl, pcum8, ba,
                        b6, x, y, SP1, SP2, class3, class5)

# remove any polygons with missing values
# in imputation X-variables
dat_fri_scr <- left_join(dat_fri_scr %>% select(OPI_ID, AGE2018, agb,
                        gmvwl, pcum8, ba,
                        b6, x, y) %>% na.omit,
                        dat_fri_scr)

# create vector of X-variables for imputation
vars <- c('agb', 'gmvwl',
          'pcum8', 'ba',
          'b6', 'x', 'y')

# run_knn_fri function to get performance results
perf <- run_knn_fri(dat_fri_scr, vars, k = 5)

# run_knn_fri function to get imputed vs. observed values
nn_tab <- run_knn_fri_table(dat_fri_scr, vars, k = 5)

6.2.2 Results: Imputation Over FRI

Summary Stats:

# round values
perf %<>% mutate(value = round(value, 2))

# factor variable and metric categories to order
perf %<>% mutate(variable = factor(variable, levels = c('age', 'leading species', 
                                                        'second species', 'three func group class',
                                                        'five func group class', vars))) %>%
  mutate(metric = factor(metric, levels = c('rmsd (yrs)', 'mbe (yrs)', 'mae (yrs)', 'accuracy (%)', 'rrmsd (%)', 'rmbe (%)')))

# cast df
perf_cast <- dcast(perf, variable ~ metric)

# remove x and y
perf_cast %<>% filter(!(variable %in% c('x', 'y')))

# set NA to blank
perf_cast[is.na(perf_cast)] <- ''

# display results
knitr::kable(perf_cast, caption = "Imputation Performance of FRI Forest Stand Polygons", label = NA)
Imputation Performance of FRI Forest Stand Polygons
variable                rmsd (yrs)  mbe (yrs)  mae (yrs)  accuracy (%)  rrmsd (%)  rmbe (%)
age                     20.08       0.17       14.75
leading species                                          64.47
second species                                           31.75
three func group class                                   78.61
five func group class                                    80.78
agb                                                                    2.5        -0.02
gmvwl                                                                  3.84       -0.28
pcum8                                                                  1.24       0.07
ba                                                                     2.24       0.05
b6                                                                     2.27       0.09

The mean bias error (MBE) of age is 0.17 years, indicating the imputed estimates of age are not skewed toward younger or older values. The mean absolute error (MAE) of age is 14.75 years, which is the average absolute difference between the observed and imputed values.

Accuracy of leading species classification is 64.47%, and a much lower 31.75% for second leading species. Three and five functional group classification have respective accuracies of 78.61% and 80.78%. Functional group classification accuracies are higher than in the RMF.

Relative root mean squared difference (RRMSD) of the imputation attributes (agb, gmvwl, pcum8, ba, and b6) is below 4% for all attributes. These low values demonstrate that the imputation algorithm is finding close matches within the database of available FRI polygons.

Relative mean bias error (RMBE) of the imputation attributes is less than 1%, meaning that the nearest neighbor selections are not skewed toward positive or negative values of these attributes.

RRMSD and RMBE are not calculated for x and y because the coordinates are not on a meaningful value scale.
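These metrics can be reproduced directly from the imputed vs. observed table, as a quick usage sketch of nn_tab and the helper functions defined above:

# age metrics (years)
mbe(nn_tab$age, nn_tab$age_nn)
mae(nn_tab$age, nn_tab$age_nn)

# relative metrics (%) for one imputation X-variable, e.g. agb
rrmsd(nn_tab$agb, nn_tab$agb_nn)
rmbe(nn_tab$agb, nn_tab$agb_nn)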

We can also generate detailed confusion matrices of the imputed vs. observed species composition.

Confusion matrices:

# calculate 3 func group confusion matrix
# build accuracy table
accmat <- table("pred" = nn_tab$class3_nn, "ref" = nn_tab$class3)

# UA
ua <- diag(accmat) / rowSums(accmat) * 100

# PA
pa <- diag(accmat) / colSums(accmat) * 100

# OA
oa <- sum(diag(accmat)) / sum(accmat) * 100

# build confusion matrix with margins
accmat_ext <- addmargins(accmat)
# note: producer's accuracy (pa) is bound as the bottom row and user's
# accuracy (ua) as the right-hand column; the dimnames() call below swaps
# row and column labels, so the margins end up correctly labelled
# "Producers" (bottom row) and "Users" (right-hand column)
accmat_ext <- rbind(accmat_ext, "Users" = c(pa, NA))
accmat_ext <- cbind(accmat_ext, "Producers" = c(ua, NA, oa))
accmat_ext <- round(accmat_ext, 2)
dimnames(accmat_ext) <- list("Imputed" = colnames(accmat_ext),
                             "Observed" = rownames(accmat_ext))
class(accmat_ext) <- "table"

# display results
knitr::kable(accmat_ext %>% round, caption = "Confusion matrix of imputed vs. observed three functional group classification over FRI polygons. Rows are imputed values and columns are observed values.", label = NA)
Confusion matrix of imputed vs. observed three functional group classification over FRI polygons. Rows are imputed values and columns are observed values.
            Hardwood  Mixedwood  Softwood  Sum     Users
Hardwood    53011     5542       745       59298   89
Mixedwood   4227      14830      5210      24267   61
Softwood    1098      4781       11560     17439   66
Sum         58336     25153      17515     101004  NA
Producers   91        59         66        NA      79

# calculate 5 func group confusion matrix
# build accuracy table
accmat <- table("pred" = nn_tab$class5_nn, "ref" = nn_tab$class5)

# UA
ua <- diag(accmat) / rowSums(accmat) * 100

# PA
pa <- diag(accmat) / colSums(accmat) * 100

# OA
oa <- sum(diag(accmat)) / sum(accmat) * 100

# build confusion matrix
accmat_ext <- addmargins(accmat)
accmat_ext <- rbind(accmat_ext, "Users" = c(pa, NA))
accmat_ext <- cbind(accmat_ext, "Producers" = c(ua, NA, oa))
accmat_ext <- round(accmat_ext, 2)
dimnames(accmat_ext) <- list("Imputed" = colnames(accmat_ext),
                             "Observed" = rownames(accmat_ext))
class(accmat_ext) <- "table"

# display results
knitr::kable(accmat_ext %>% round, caption = "Confusion matrix of imputed vs. observed five functional group classification over FRI polygons. Rows are imputed values and columns are observed values.", label = NA)
Confusion matrix of imputed vs. observed five functional group classification over FRI polygons. Rows are imputed values and columns are observed values.
                        Black Spruce Dominated  Hardwood  Jack Pine Dominated  Mixed Conifers  Mixedwood  Sum     Users
Black Spruce Dominated  549                     61        3                    348             217        1178    47
Hardwood                44                      52851     17                   319             5613       58844   90
Jack Pine Dominated     20                      30        357                  128             148        683     52
Mixed Conifers          483                     316       104                  3607            2248       6758    53
Mixedwood               336                     5078      159                  3739            24229      33541   72
Sum                     1432                    58336     640                  8141            32455      101004  NA
Producers               38                      91        56                   44              75         NA      81

# calculate leading species confusion matrix
# create df of sp1
sp1 <- data.frame(obs = nn_tab$sp1 %>% as.factor,
                 est = nn_tab$sp1_nn %>% as.factor)

# make estimate levels match obs levels
levels(sp1$est) <- c(levels(sp1$est), levels(sp1$obs)[!(levels(sp1$obs) %in% levels(sp1$est))])
sp1$est <- factor(sp1$est, levels = levels(sp1$obs))

# create column of match or not
sp1$match <- sp1$obs == sp1$est

# build accuracy table
accmat <- table("pred" = sp1$est, "ref" = sp1$obs)

# UA
ua <- diag(accmat) / rowSums(accmat) * 100

# PA
pa <- diag(accmat) / colSums(accmat) * 100

# OA
oa <- sum(diag(accmat)) / sum(accmat) * 100

# build confusion matrix
accmat_ext <- addmargins(accmat)
accmat_ext <- rbind(accmat_ext, "Users" = c(pa, NA))
accmat_ext <- cbind(accmat_ext, "Producers" = c(ua, NA, oa))
accmat_ext <- round(accmat_ext, 2)
dimnames(accmat_ext) <- list("Imputed" = colnames(accmat_ext),
                             "Observed" = rownames(accmat_ext))
class(accmat_ext) <- "table"

# display results
kbl(accmat_ext %>% round, caption = "Confusion matrix of imputed vs. observed leading species over FRI polygons. Rows are imputed values and columns are observed values.", label = NA) %>%
  kable_paper() %>%
  scroll_box(width = "500px")
Confusion matrix of imputed vs. observed leading species over FRI polygons. Rows are imputed values and columns are observed values.
AB AG AW BD BE BF BW BY CB CE CW EW HE LA MH MR MS OR OW PB PJ PL PO PR PS PT PW SB SW Sum Users
AB 133 0 0 0 0 16 4 19 0 0 4 0 5 2 84 52 0 16 0 0 0 0 0 2 1 23 11 7 4 383 35
AG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN
AW 0 0 2 0 0 0 0 0 0 0 1 0 0 0 9 6 0 1 0 0 0 0 0 0 0 0 0 0 0 19 11
BD 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 1 0 0 0 0 0 0 0 0 0 5 0
BE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 2 0 0 0 0 0 0 0 0 0 0 0 7 0
BF 42 0 0 0 0 1886 258 99 0 1 125 0 238 86 206 202 0 27 0 1 15 6 3 50 60 494 310 578 443 5130 37
BW 10 0 0 0 0 197 578 53 0 0 11 0 111 4 160 86 0 35 0 1 28 8 4 15 6 424 252 42 70 2095 28
BY 19 0 0 0 1 69 44 337 0 0 6 0 52 0 454 63 0 12 0 2 0 1 2 2 2 53 12 19 44 1194 28
CB 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
CE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN
CW 8 0 0 0 0 56 10 2 0 0 69 0 19 13 12 13 0 4 0 0 0 1 0 2 0 21 16 74 21 341 20
EW 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN
HE 16 0 0 0 1 230 223 128 0 0 43 0 4177 5 446 122 2 107 1 1 2 6 2 40 6 281 609 114 121 6683 62
LA 2 0 0 0 0 39 1 0 0 0 10 0 3 33 8 3 0 1 0 0 1 0 0 2 0 13 10 41 10 177 19
MH 378 0 55 22 82 156 187 1566 11 0 9 0 613 8 34365 2281 93 2617 14 2 1 3 5 15 3 337 240 17 54 43134 80
MR 108 1 8 2 2 125 72 62 0 1 12 1 134 5 920 2086 19 739 14 1 2 4 5 9 5 187 362 22 37 4945 42
MS 0 0 0 0 1 0 0 0 0 0 0 0 0 0 18 10 5 2 1 0 0 0 0 0 0 0 0 0 0 37 14
OR 70 0 8 2 7 16 22 38 0 0 5 0 100 2 1283 828 11 2698 25 2 2 6 0 1 1 48 307 7 2 5491 49
OW 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 0 2 0 0 6 0
PB 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 3 1 0 0 7 0
PJ 1 0 0 0 0 10 33 1 0 0 2 0 6 20 7 11 0 4 0 0 567 1 0 7 2 61 263 39 7 1042 54
PL 1 0 0 0 0 4 5 2 0 0 3 0 3 0 5 3 0 6 0 0 0 11 0 1 0 4 8 2 1 59 19
PO 0 0 0 0 0 0 5 1 0 0 0 0 1 1 3 2 0 0 0 0 0 1 10 1 0 11 3 1 2 42 24
PR 1 0 0 0 0 13 6 2 0 0 1 0 14 1 11 9 0 1 0 0 6 0 3 351 29 51 67 4 17 587 60
PS 0 0 0 0 0 15 4 1 0 0 0 0 7 1 1 5 0 0 0 0 2 0 1 16 23 11 10 1 4 102 23
PT 57 0 0 0 0 500 585 112 0 0 39 0 263 15 389 272 0 76 0 7 48 36 21 96 37 2918 699 90 190 6450 45
PW 44 0 2 0 0 492 549 39 0 0 64 0 802 49 277 654 1 601 3 9 307 35 4 276 19 1107 13245 132 172 18883 70
SB 19 0 1 0 0 531 62 43 1 0 192 0 122 130 89 30 0 14 0 0 13 6 1 15 10 97 79 1357 183 2995 45
SW 9 0 0 0 0 284 83 44 0 0 33 0 75 26 40 36 0 1 0 0 2 0 5 12 6 98 63 101 271 1189 23
Sum 918 1 76 26 94 4639 2731 2550 12 2 629 1 6745 401 38799 6775 131 6966 58 27 996 125 66 913 211 6242 16569 2648 1653 101004 NA
Producers 14 0 3 0 0 41 21 13 0 0 11 0 62 8 89 31 4 39 0 0 57 9 15 38 11 47 80 51 16 NA 64

6.3 Imputation from FRI to GRM

6.3.1 Run Algorithm

# load GRM screened polygons
load(str_c(out_dir, '/temp/dat_grm_scr.RData'))

# create data frame for grm and fri metrics used in imputation
dat_grm_imp <- dat_grm_scr %>% select(id, agb,
                                      gmvwl, pcum8, ba,
                                      b6, x, y) %>% na.omit
dat_fri_imp <- dat_fri_scr %>% select(agb,
                                      gmvwl, pcum8, ba,
                                      b6, x, y) %>% na.omit

# combine and scale all values together, then separate again:
# scaling the GRM and FRI metrics with a common mean and sd ensures
# nearest neighbours are found in the same standardized feature space
dat_comb_scaled <- rbind(dat_grm_imp %>% select(-id), 
                         dat_fri_imp) %>% scale
dat_grm_scaled <- dat_comb_scaled[1:NROW(dat_grm_imp),]
dat_fri_scaled <- dat_comb_scaled[(NROW(dat_grm_imp)+1):(NROW(dat_grm_imp)+NROW(dat_fri_imp)),]

# run nearest neighbor imputation with k = 5
# nn2() comes from the RANN package
nn <- nn2(dat_fri_scaled, dat_grm_scaled, k = 5)

# get nn indices
# nn2() returns a list of nn.idx and nn.dists;
# nn[[1]] is the matrix of neighbour row indices into dat_fri_scaled
nni <- nn[[1]]

# add imputed attributes into GRM imputation data frame
# each x-variable is imputed as the mean of its 5 nearest FRI neighbours
for(i in seq_along(vars)){
  dat_grm_imp %<>% add_column(
    !!str_c(vars[i], '_imp') := apply(
      nni, 
      MARGIN = 1, 
      FUN = function(x){
        mean(dat_fri_imp[x, vars[i]])
      }
    ))
}
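
# note: each row of nni holds the dat_fri_imp row numbers of the five
# most similar FRI polygons, so each imputed value is simply their mean,
# e.g. for the first GRM polygon: mean(dat_fri_imp[nni[1, ], 'agb'])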

# create vector of target variables
tar_vars <- c('AGE2018', 'SP1', 'SP2',
              'class3', 'class5')
  
# add age and species variables to GRM data frame
# age is numeric, so the mean of the neighbours is imputed;
# species and class variables are categorical, so the mode is imputed
for(i in seq_along(tar_vars)){
  if(tar_vars[i] == 'AGE2018'){
    dat_grm_imp %<>% add_column(
      !!tar_vars[i] := apply(
        nni, 
        MARGIN = 1, 
        FUN = function(x){
          mean(dat_fri_scr[x, tar_vars[i]])
        }
      ))
  } else{
    dat_grm_imp %<>% add_column(
      !!tar_vars[i] := apply(
        nni, 
        MARGIN = 1, 
        FUN = function(x){
          getmode(dat_fri_scr[x, tar_vars[i]])
        }
      ))
  }
}
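
# note: getmode() is defined earlier in the workflow; a typical
# implementation (an assumption here, shown for reference only) is:
# getmode <- function(v) {
#   uniqv <- unique(v)
#   uniqv[which.max(tabulate(match(v, uniqv)))]
# }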

# update colnames
dat_grm_imp %<>% rename(age = AGE2018)

# add values back into main GRM data frame
# left_join() matches on all shared columns, including the polygon id
dat_grm <- left_join(dat_grm, dat_grm_imp)

6.3.2 Results: Imputation X-Variable Performance

# calculate performance across imputation x-variables
for(i in seq_along(vars)){
  if(i == 1){
    perf <- tibble(variable = vars[i],
                   metric = c('rrmsd (%)', 'rmbe (%)'),
                   value = c(rrmsd(dat_grm_imp[, vars[i]],
                                   dat_grm_imp[, str_c(vars[i], '_imp')]),
                             rmbe(dat_grm_imp[, vars[i]],
                                  dat_grm_imp[, str_c(vars[i], '_imp')])))
  } else{
    perf %<>% add_row(variable = vars[i],
                      metric = c('rrmsd (%)', 'rmbe (%)'),
                      value = c(rrmsd(dat_grm_imp[, vars[i]],
                                      dat_grm_imp[, str_c(vars[i], '_imp')]),
                                rmbe(dat_grm_imp[, vars[i]],
                                     dat_grm_imp[, str_c(vars[i], '_imp')])))
  }
}

# round to two decimal places
perf %<>% mutate(value = round(value, 2))

# factor so table displays nicely
perf %<>% mutate(variable = factor(variable, levels = vars)) %>%
  mutate(metric = factor(metric, levels = c('rrmsd (%)', 'rmbe (%)')))

# cast df from long to wide format
perf_cast <- dcast(perf, variable ~ metric)

# remove x and y
perf_cast %<>% filter(!(variable %in% c('x', 'y')))

# display results of imputation rmsd
knitr::kable(perf_cast, caption = "Imputation X-Variable Performance between FRI and GRM Forest Stand Polygons", label = NA)
Imputation X-Variable Performance between FRI and GRM Forest Stand Polygons

variable   rrmsd (%)   rmbe (%)
agb             3.31      -0.06
gmvwl           4.10      -0.72
pcum8           1.47      -0.30
ba              2.68      -0.08
b6              1.82      -0.21

The RRMSD results are all below 5%, and the RMBE values do not show a consistent bias toward negative or positive differences. These results are comparable to those found when conducting imputation over FRI polygons only.
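
For reference, rrmsd() and rmbe() were defined earlier in the document. The usual relative forms, which these helpers are assumed to follow, are sketched below (obs = observed, prd = imputed):

# assumed definitions of the relative performance measures
# rrmsd: root mean squared difference relative to the observed mean (%)
rrmsd <- function(obs, prd) {
  sqrt(mean((prd - obs) ^ 2)) / mean(obs) * 100
}

# rmbe: mean bias error relative to the observed mean (%)
rmbe <- function(obs, prd) {
  mean(prd - obs) / mean(obs) * 100
}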

6.3.3 Results: Distribution of Age

Spatial comparison:

# load GRM polygons
poly_grm <- vect(grm)

# add new data frame to polygons
values(poly_grm) <- dat_grm

# save grm polygon output
writeVector(poly_grm, str_c(out_dir, '/grm_imp.shp'), overwrite = T)

# load FRI polygons
poly_fri <- vect(fri)

# combine dat_fri with polygon attributes
dat_fri_poly <- left_join(as.data.frame(poly_fri), dat_fri)

# we should only compare the screened FRI polygons so set 
# other values to NA
dat_fri_poly$AGE2018[!(dat_fri_poly$OPI_ID %in% dat_fri_scr$OPI_ID)] <- NA
dat_fri_poly$class3[!(dat_fri_poly$OPI_ID %in% dat_fri_scr$OPI_ID)] <- NA
dat_fri_poly$class5[!(dat_fri_poly$OPI_ID %in% dat_fri_scr$OPI_ID)] <- NA

# re-input attributes into FRI polygons
values(poly_fri) <- dat_fri_poly
rm(dat_fri_poly)

# save fri polygon output with key values only in screened polygons
writeVector(poly_fri, str_c(out_dir, '/fri_scr.shp'), overwrite = T)

# create df to plot age
grm_sf <- st_as_sf(poly_grm)
fri_sf <- st_as_sf(poly_fri)

# cut dfs
grm_sf %<>% mutate(age_cut = cut(age, breaks = c(seq(0, 130, 10), 250)))
fri_sf %<>% mutate(age_cut = cut(AGE2018, breaks = c(seq(0, 130, 10), 250)))

# plot age
p1 <- ggplot(grm_sf) +
  geom_sf(mapping = aes(fill = age_cut), linewidth = 0.001) +
  coord_sf() +
  scale_fill_manual(values = viridis(14), name = 'Age', na.translate = F) +
  theme_void(base_size = 30) +
  ggtitle('Imputed Age of GRM \nSegmented Forest Stands') +
  theme(plot.title = element_text(hjust = 0.5))

p2 <- ggplot(fri_sf) +
  geom_sf(mapping = aes(fill = age_cut), linewidth = 0.001) +
  coord_sf() +
  scale_fill_manual(values = viridis(14), name = 'Age', na.translate = F) +
  theme_void(base_size = 30) +
  ggtitle('Age of FRI \nForest Stands') +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(p1, p2, ncol = 1)

Imputed age values show a similar spatial distribution to observed age values at a broad scale.

Density plots:

# density plots of age
p1 <- ggplot(dat_grm, aes(x = age)) +
  geom_density(fill = 'grey') +
  geom_vline(aes(xintercept = median(age, na.rm = T)),
             linetype = "dashed",
             linewidth = 0.6) +
  xlim(c(0,200)) +
  ylim(c(0, 0.025)) +
  theme_bw() +
  xlab('Age') +
  ylab('Density') +
  ggtitle('Imputed Age of GRM Segmented Forest Stands') +
  theme(text = element_text(size = 25),
        plot.title = element_text(size=30))

p2 <- ggplot(as.data.frame(poly_fri), aes(x = AGE2018)) +
  geom_density(fill = 'grey') +
  geom_vline(aes(xintercept = median(AGE2018, na.rm = T)),
             linetype = "dashed",
             linewidth = 0.6) +
  xlim(c(0, 200)) +
  ylim(c(0, 0.025)) +
  theme_bw() +
  xlab('Age') +
  ylab('Density') +
  ggtitle('Age of FRI Forest Stands') +
  theme(text = element_text(size = 25),
        plot.title = element_text(size=30))

grid.arrange(p1, p2, ncol = 1)

The distribution of imputed age in GRM polygons closely matches that of observed age in FRI polygons, and the median age values (dashed lines) are similar.
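
To put numbers on the medians marked above, a quick check using the objects already in memory:

# median imputed age of GRM polygons
median(dat_grm$age, na.rm = TRUE)

# median observed age of FRI polygons
median(as.data.frame(poly_fri)$AGE2018, na.rm = TRUE)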

6.3.4 Results: Distribution of Species

Spatial patterns of three functional group classification:

# plot three func group classification
p1 <- ggplot(grm_sf) +
  geom_sf(mapping = aes(fill = class3), linewidth = 0.001) +
  coord_sf() +
  scale_fill_manual(values = c('#228833', '#aa3377', '#ccbb44'), 
                    name = 'Species Class', na.translate = F) +
  theme_void(base_size = 30) +
  ggtitle('Imputed Three Functional \nGroup Classification (GRM)') +
  theme(plot.title = element_text(hjust = 0.5))

p2 <- ggplot(fri_sf) +
  geom_sf(mapping = aes(fill = class3), linewidth = 0.001) +
  coord_sf() +
  scale_fill_manual(values = c('#228833', '#aa3377', '#ccbb44'), 
                    name = 'Species Class', na.translate = F) +
  theme_void(base_size = 30) +
  ggtitle('Three Functional \nGroup Classification (FRI)') +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(p1, p2, ncol = 1)

The three functional group classes show similar broad-scale spatial patterns in the imputed GRM and observed FRI stands.

Spatial patterns of five functional group classification:

# plot five func group classification
p1 <- ggplot(grm_sf) +
  geom_sf(mapping = aes(fill = class5), linewidth = 0.001) +
  coord_sf() +
  scale_fill_manual(values = c('#ccbb44', '#228833', '#4477aa',
                      '#ee6677', '#aa3377'), 
                    name = 'Species Class', na.translate = F) +
  theme_void(base_size = 30) +
  ggtitle('Imputed Five Functional \nGroup Classification (GRM)') +
  theme(plot.title = element_text(hjust = 0.5))

p2 <- ggplot(fri_sf) +
  geom_sf(mapping = aes(fill = class5), linewidth = 0.001) +
  coord_sf() +
  scale_fill_manual(values = c('#ccbb44', '#228833', '#4477aa',
                      '#ee6677', '#aa3377'), 
                    name = 'Species Class', na.translate = F) +
  theme_void(base_size = 30) +
  ggtitle('Five Functional \nGroup Classification (FRI)') +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(p1, p2, ncol = 1)
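
The five functional group classes likewise show broadly consistent spatial patterns between the imputed GRM and FRI stands.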

Overall distribution of three functional group classification:

# create data frame for GRM 3 classes
dat_grm_c3 <- dat_grm %>% 
  tabyl(class3) %>%  
  filter(!is.na(class3)) %>% 
  arrange(desc(class3)) %>%
  mutate(prop = n / sum(.$n)*100) %>%
  mutate(ypos = cumsum(prop) - 0.5*prop) %>%
  mutate(lbl = round(prop))

# create data frame for FRI 3 classes
dat_fri_c3 <- poly_fri %>% as.data.frame %>%
  tabyl(class3) %>%  
  filter(!is.na(class3)) %>% 
  arrange(desc(class3)) %>%
  mutate(prop = n / sum(.$n)*100) %>%
  mutate(ypos = cumsum(prop) - 0.5*prop) %>%
  mutate(lbl = round(prop))

# plot
p1 <- ggplot(dat_grm_c3, aes(x = "", y = prop, fill = class3)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  theme_void() +
  geom_text(aes(y = ypos, label = str_c(lbl, "%")), size = 15) +
  theme(legend.title = element_text(size = 30),
        legend.text = element_text(size = 20),
        legend.key.width = unit(2, 'cm'),
        plot.title = element_text(size=30)) +
  scale_fill_manual(values = c('#228833', '#aa3377', '#ccbb44')) +
  labs(fill = "") +
  ggtitle("Imputed Three Functional \nGroup Classification (GRM)")

p2 <- ggplot(dat_fri_c3, aes(x = "", y = prop, fill = class3)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  theme_void() +
  geom_text(aes(y = ypos, label = str_c(lbl, "%")), size = 15) +
  theme(legend.title = element_text(size = 30),
        legend.text = element_text(size = 20),
        legend.key.width = unit(2, 'cm'),
        plot.title = element_text(size=30)) +
  scale_fill_manual(values = c('#228833', '#aa3377', '#ccbb44')) +
  labs(fill = "") +
  ggtitle("Three Functional Group \nClassification (FRI)")

grid.arrange(p1, p2, ncol = 2)

The hardwood proportion is identical between the two products; the GRM data have slightly more softwood and slightly less mixedwood.
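
The pie labels can also be compared directly as a small table; a minimal sketch using the data frames built above:

# side-by-side class proportions (%), GRM vs. FRI
full_join(dat_grm_c3 %>% select(class3, grm_pct = prop),
          dat_fri_c3 %>% select(class3, fri_pct = prop),
          by = 'class3') %>%
  mutate(across(ends_with('pct'), ~ round(.x, 1)))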

Overall distribution of five functional group classification:

# distribution of 5 species classes
# create data frame for GRM 5 classes
dat_grm_c5 <- dat_grm %>% 
  tabyl(class5) %>%  
  filter(!is.na(class5)) %>% 
  arrange(desc(class5)) %>%
  mutate(prop = n / sum(.$n)*100) %>%
  mutate(ypos = cumsum(prop) - 0.5*prop) %>%
  mutate(lbl = round(prop))

# create data frame for FRI 5 classes
dat_fri_c5 <- poly_fri %>% as.data.frame %>%
  tabyl(class5) %>%  
  filter(!is.na(class5)) %>% 
  arrange(desc(class5)) %>%
  mutate(prop = n / sum(.$n)*100) %>%
  mutate(ypos = cumsum(prop) - 0.5*prop) %>%
  mutate(lbl = round(prop))

# plot
p1 <- ggplot(dat_grm_c5, aes(x = "", y = prop, fill = class5)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  theme_void() +
  geom_text(aes(y = ypos, label = str_c(lbl, "%")), size = 15) +
  theme(legend.title = element_text(size = 30),
        legend.text = element_text(size = 20),
        legend.key.width = unit(2, 'cm'),
        plot.title = element_text(size=30)) +
  scale_fill_manual(values = c('#ccbb44', '#228833', '#4477aa',
                                        '#ee6677', '#aa3377')) +
  labs(fill = "") +
  ggtitle("Imputed Five Functional \nGroup Classification (GRM)")

p2 <- ggplot(dat_fri_c5, aes(x = "", y = prop, fill = class5)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  theme_void() +
  geom_text(aes(y = ypos, label = str_c(lbl, "%")), size = 15) +
  theme(legend.title = element_text(size = 30),
        legend.text = element_text(size = 20),
        legend.key.width = unit(2, 'cm'),
        plot.title = element_text(size=30)) +
  scale_fill_manual(values = c('#ccbb44', '#228833', '#4477aa',
                                        '#ee6677', '#aa3377')) +
  labs(fill = "") +
  ggtitle("Five Functional Group \nClassification (FRI)")

grid.arrange(p1, p2, ncol = 2)

The distributions are again very similar. Note that the RMF five-class species grouping does not transfer well to the FSF, given the negligible amounts of black spruce and jack pine in this forest.
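
That last point can be checked directly from the screened FRI attributes; a minimal sketch, assuming SP1 holds the observed leading species codes used in the confusion matrix above:

# share of screened FRI polygons led by black spruce (SB) or jack pine (PJ)
dat_fri_scr %>%
  tabyl(SP1) %>%
  filter(SP1 %in% c('SB', 'PJ'))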