swiss-territorial-data-lab · cleherny · Aug 22, 2023 · Aug 14, 2023 · Aug 15, 2023 · Aug 22, 2023
diff --git a/config/config_flai.yaml b/config/config_flai.yaml
@@ -1,6 +1,7 @@
 assess_flai.py:  
-  working_dir: C:\Users\gwena\Documents\STDL\2_En_cours\occupation-toitures\02_Data
+  working_dir: .
   method: one-to-many       # one-to-one or one-to-many
-  tiles: initial/LiDAR/Emprises tuiles Lidar 2019.shp
-  gt: processed\manual_GT\merged_occupied_surface.shp
-  detection: processed\tests\flai\2488500_1116500.shp
+  tiles: input/lidar_intensity/LiDAR/2019/Tuiles/Emprises tuiles Lidar 2019.shp
+  gt: input/GT/merged_occupied_surface.shp
+  detection: input/flai/2488500_1116500.shp
+  epsg: 2056
diff --git a/scripts/assess_results/assess_flai.py b/scripts/assess_results/assess_flai.py
@@ -9,7 +9,8 @@
 #      Copyright (c) 2020 Republic and Canton of Geneva
 #
 
-import os, sys
+import os
+import sys
 from loguru import logger
 from tqdm import tqdm
 from yaml import load, FullLoader
@@ -19,9 +20,9 @@
 
 sys.path.insert(1, 'scripts')
 import functions.fct_metrics as metrics
-import functions.fct_misc as fct_misc
+import functions.fct_misc as misc
 
-logger=fct_misc.format_logger(logger)
+logger = misc.format_logger(logger)
 
 # Argument and parameter specification
 logger.info(f"Using config_flai.yaml as config file.")
@@ -36,14 +37,15 @@
 
 GT = cfg['gt']
 DETECTION = cfg['detection']
+EPSG = cfg['epsg']
 
-TILE_NAME=os.path.basename(DETECTION).split('.')[0]
-TILES=cfg['tiles']
+TILE_NAME = os.path.basename(DETECTION).split('.')[0]
+TILES = cfg['tiles']
 
 os.chdir(WORKING_DIR)
 
-OUTPUT_DIR='final/flai_metrics'
-fct_misc.ensure_dir_exists(OUTPUT_DIR)
+OUTPUT_DIR = 'final/flai_metrics'
+misc.ensure_dir_exists(OUTPUT_DIR)
 
 written_files = {}
 
@@ -53,11 +55,15 @@
 gdf_detec = gpd.read_file(DETECTION)
 gdf_detec['ID_DET'] = gdf_detec['FID']
 gdf_detec = gdf_detec.rename(columns={"area": "area_DET"})
+gdf_detec = gdf_detec.to_crs(EPSG)
 logger.info(f"Read detection file: {gdf_detec.shape[0]} shapes")
 
 gdf_gt = gpd.read_file(GT)
 gdf_gt['ID_GT'] = gdf_gt['fid']
 gdf_gt = gdf_gt.rename(columns={"area": "area_GT"})
+gdf_gt = gdf_gt.to_crs(EPSG)
+
+misc.test_crs(gdf_detec.crs, gdf_gt.crs)
 
 if gdf_gt['ID_GT'].isnull().values.any():
     logger.error('Some labels have a null identifier.')
@@ -67,110 +73,112 @@
     sys.exit(1)
 
 if TILES:
-    tiles=gpd.read_file(TILES)
-    tile=tiles[tiles['fme_basena']==TILE_NAME]
-    gdf_gt=gdf_gt.overlay(tile)
+    tiles = gpd.read_file(TILES)
+    tile = tiles[tiles['fme_basena'] == TILE_NAME]
+    gdf_gt = gdf_gt.overlay(tile)
 
-nbr_labels=gdf_gt.shape[0]
+nbr_labels = gdf_gt.shape[0]
 logger.info(f"Read GT file: {nbr_labels} shapes")
 
 
 logger.info(f"Metrics computation")
-if METHOD=='one-to-one':
+if METHOD == 'one-to-one':
     logger.info('Using the one-to-one method.')
-elif METHOD=='one-to-many':
+elif METHOD == 'one-to-many':
     logger.info('Using one-to-many method.')
 else:
-    logger.warning('Unknown method, defaulting to one-to-one.')
+    logger.warning('Unknown method, the default value is one-to-one.')
 
-best_f1=0
-for threshold in tqdm([i/100 for i in range(10 ,100, 5)], desc='Search for the best threshold on the IoU'):
+best_f1 = 0
+for threshold in tqdm([i / 100 for i in range(10, 100, 5)], desc='Search for the best threshold on the IoU'):
 
     tp_gdf_loop, fp_gdf_loop, fn_gdf_loop = metrics.get_fractional_sets(gdf_detec, gdf_gt, iou_threshold=threshold, method=METHOD)
 
     # Compute metrics
     precision, recall, f1 = metrics.get_metrics(tp_gdf_loop, fp_gdf_loop, fn_gdf_loop)
 
-    if f1 > best_f1 or threshold==0:
-        tp_gdf=tp_gdf_loop
-        fp_gdf=fp_gdf_loop
-        fn_gdf=fn_gdf_loop
+    if f1 > best_f1 or threshold == 0:
+        tp_gdf = tp_gdf_loop
+        fp_gdf = fp_gdf_loop
+        fn_gdf = fn_gdf_loop
 
-        best_precision=precision
-        best_recall=recall
-        best_f1=f1
+        best_precision = precision
+        best_recall = recall
+        best_f1 = f1
 
-        best_threshold=threshold
+        best_threshold = threshold
 
-logger.info(f'The best threshold for the IoU is {best_threshold} in regard to the F1 score.')
+logger.info(f'The best threshold for the IoU is {best_threshold} for the F1 score.')
 
 
 TP = tp_gdf.shape[0]
 FP = fp_gdf.shape[0]
 FN = fn_gdf.shape[0]
 
-if METHOD=='one-to-many':
-    tp_with_duplicates=tp_gdf.copy()
-    dissolved_tp_gdf=tp_with_duplicates.dissolve(by=['ID_DET'], as_index=False)
+if METHOD == 'one-to-many':
+    tp_with_duplicates = tp_gdf.copy()
+    dissolved_tp_gdf = tp_with_duplicates.dissolve(by=['ID_DET'], as_index=False)
 
-    geom1 = dissolved_tp_gdf.geometry.values.tolist()
-    geom2 = dissolved_tp_gdf['geom_GT'].values.tolist()
+    geom_DET = dissolved_tp_gdf.geometry.values.tolist()
+    geom_GT = dissolved_tp_gdf['geom_GT'].values.tolist()
     iou = []
-    for (i, ii) in zip(geom1, geom2):
+    for (i, ii) in zip(geom_DET, geom_GT):
         iou.append(metrics.intersection_over_union(i, ii))
     dissolved_tp_gdf['IOU'] = iou
 
-    tp_gdf=dissolved_tp_gdf.copy()
+    tp_gdf = dissolved_tp_gdf.copy()
 
-    logger.info(f'{tp_with_duplicates.shape[0]-tp_gdf.shape[0]} labels are under a shared predictions with at least one other label.')
+    logger.info(f'{tp_with_duplicates.shape[0]-tp_gdf.shape[0]} labels share predictions with at least one other label.')
 
 logger.info(f"   TP = {TP}, FP = {FP}, FN = {FN}")
 logger.info(f"   precision = {best_precision:.2f}, recall = {best_recall:.2f}, f1 = {best_f1:.2f}")
 logger.info(f" - Compute mean Jaccard index")
-if TP!=0:
+
+if TP != 0:
     iou_average = tp_gdf['IOU'].mean()
     logger.info(f"   IOU average = {iou_average:.2f}")
 
 
 nbr_tagged_labels = TP + FN
-filename=os.path.join(OUTPUT_DIR, 'problematic_objects.gpkg')
+filename = os.path.join(OUTPUT_DIR, 'problematic_objects.gpkg')
+
 if os.path.exists(filename):
     os.remove(filename)
 if nbr_labels != nbr_tagged_labels:
     logger.error(f'There are {nbr_labels} labels in input and {nbr_tagged_labels} labels in output.')
     logger.info(f'The list of the problematic labels in exported to {filename}.')
 
     if nbr_labels > nbr_tagged_labels:
-        tagged_labels=tp_gdf['ID_GT'].unique().tolist() + fn_gdf['ID_GT'].unique().tolist()
+        tagged_labels = tp_gdf['ID_GT'].unique().tolist() + fn_gdf['ID_GT'].unique().tolist()
 
-        untagged_gt_gdf=gdf_gt[~gdf_gt['ID_GT'].isin(tagged_labels)]
+        untagged_gt_gdf = gdf_gt[~gdf_gt['ID_GT'].isin(tagged_labels)]
         untagged_gt_gdf.drop(columns=['geom_GT', 'OBSTACLE'], inplace=True)
 
-        layer_name='missing_label_tags'
+        layer_name = 'missing_label_tags'
         untagged_gt_gdf.to_file(filename, layer=layer_name, index=False)
 
     elif nbr_labels < nbr_tagged_labels:
-        all_tagged_labels_gdf=pd.concat([tp_gdf, fn_gdf])
+        all_tagged_labels_gdf = pd.concat([tp_gdf, fn_gdf])
 
-        duplicated_id_gt=all_tagged_labels_gdf.loc[all_tagged_labels_gdf.duplicated(subset=['ID_GT']), 'ID_GT'].unique().tolist()
-        duplicated_labels=all_tagged_labels_gdf[all_tagged_labels_gdf['ID_GT'].isin(duplicated_id_gt)]
+        duplicated_id_gt = all_tagged_labels_gdf.loc[all_tagged_labels_gdf.duplicated(subset=['ID_GT']), 'ID_GT'].unique().tolist()
+        duplicated_labels = all_tagged_labels_gdf[all_tagged_labels_gdf['ID_GT'].isin(duplicated_id_gt)]
         duplicated_labels.drop(columns=['geom_GT', 'OBSTACLE', 'geom_DET', 'index_right', 'fid', 'FID', 'fme_basena'], inplace=True)
 
-        layer_name='duplicated_label_tags'
+        layer_name = 'duplicated_label_tags'
         duplicated_labels.to_file(filename, layer=layer_name, index=False)
 
-    written_files[filename]=layer_name
+    written_files[filename] = layer_name
 
 
 # Set the final dataframe with tagged prediction
 logger.info(f"Set the final dataframe")
 
 tagged_preds_gdf = pd.concat([tp_gdf, fp_gdf, fn_gdf])
-tagged_preds_gdf = tagged_preds_gdf.drop(['index_right', 'geom_GT', 'FID', 'fid', 'OBSTACLE', 'geom_DET'], axis = 1)
+tagged_preds_gdf = tagged_preds_gdf.drop(['index_right', 'geom_GT', 'FID', 'fid', 'geom_DET'], axis = 1)
 
 feature_path = os.path.join(OUTPUT_DIR, f'tagged_predictions.gpkg')
 tagged_preds_gdf.to_file(feature_path, driver='GPKG', index=False, layer=TILE_NAME + '_tags')
-written_files[feature_path]=TILE_NAME + '_tags'
+written_files[feature_path] = TILE_NAME + '_tags'
 
 print()
 logger.success("The following files were written. Let's check them out!")

diff --git a/scripts/functions/fct_metrics.py b/scripts/functions/fct_metrics.py
@@ -2,15 +2,18 @@
 import pandas as pd
 
 
-def intersection_over_union(pol1_xy, pol2_xy):
-    # Define each polygon
-    polygon1_shape = pol1_xy
-    polygon2_shape = pol2_xy
+def intersection_over_union(polygon1_xy, polygon2_xy):
+    '''
+    Compute IoU value for a pair of shape.
 
-    # print(polygon1_shape, polygon2_shape)
+    - polygon1_xy, polygon2_xy: shape geometry
+    return: IoU of the 2 provided geometries
+    '''
+
     # Calculate intersection and union, and tne IOU
-    polygon_intersection = polygon1_shape.intersection(polygon2_shape).area
-    polygon_union = polygon1_shape.area + polygon2_shape.area - polygon_intersection
+    polygon_intersection = polygon1_xy.intersection(polygon2_xy).area
+    polygon_union = polygon1_xy.area + polygon2_xy.area - polygon_intersection
+
     return round(polygon_intersection / polygon_union, 3)
 
 
@@ -19,7 +22,7 @@ def apply_iou_threshold_one_to_one(tp_gdf_ini, threshold=0):
     Apply the IoU threshold on the TP detection to only keep the ones with sufficient intersection over union.
     Each detection can only correspond to one label.
 
-    - tp_gdf_ini: geodataframe of the potiential true positive detection
+    - tp_gdf_ini: geodataframe of the potential true positive detection
     - threshold: threshold to apply on the IoU
     return: the tp geodataframe and the geodataframe of the fp due to a low IoU
     '''
@@ -47,20 +50,20 @@ def apply_iou_threshold_one_to_many(tp_gdf_ini, threshold=0):
     '''
 
     # Compare the global IOU of the detection based on all the matching labels
-    sum_detections_gdf=tp_gdf_ini.groupby(['ID_DET'])['IOU'].sum().reset_index()
-    true_detections_gdf=sum_detections_gdf[sum_detections_gdf['IOU']>threshold]
-    true_detections_index=true_detections_gdf['ID_DET'].unique().tolist()
+    sum_detections_gdf = tp_gdf_ini.groupby(['ID_DET'])['IOU'].sum().reset_index()
+    true_detections_gdf = sum_detections_gdf[sum_detections_gdf['IOU'] > threshold]
+    true_detections_index = true_detections_gdf['ID_DET'].unique().tolist()
 
     # Check that the label is at least 25% under the prediction.
-    tp_gdf_ini['label_in_pred']=round(tp_gdf_ini['geom_GT'].intersection(tp_gdf_ini['geom_DET']).area/tp_gdf_ini['geom_GT'].area, 3)
-    tp_gdf_temp=tp_gdf_ini[(tp_gdf_ini['ID_DET'].isin(true_detections_index)) & (tp_gdf_ini['label_in_pred'] > 0.25)]
+    tp_gdf_ini['label_in_pred'] = round(tp_gdf_ini['geom_GT'].intersection(tp_gdf_ini['geom_DET']).area / tp_gdf_ini['geom_GT'].area, 3)
+    tp_gdf_temp = tp_gdf_ini[(tp_gdf_ini['ID_DET'].isin(true_detections_index)) & (tp_gdf_ini['label_in_pred'] > 0.25)]
 
     # For each label, only keep the pred with the best IOU.
     sorted_tp_gdf_temp = tp_gdf_temp.sort_values(by='IOU')
-    tp_gdf=sorted_tp_gdf_temp.drop_duplicates(['ID_GT'], keep='last', ignore_index=True)
-    id_det_tp=tp_gdf['ID_DET'].unique().tolist()
+    tp_gdf = sorted_tp_gdf_temp.drop_duplicates(['ID_GT'], keep='last', ignore_index=True)
+    id_det_tp = tp_gdf['ID_DET'].unique().tolist()
 
-    fp_gdf_temp=tp_gdf_ini[~tp_gdf_ini['ID_DET'].isin(id_det_tp)]
+    fp_gdf_temp = tp_gdf_ini[~tp_gdf_ini['ID_DET'].isin(id_det_tp)]
     fp_gdf_temp = fp_gdf_temp.groupby(['ID_DET'], group_keys=False).apply(lambda g:g[g.IOU == g.IOU.max()])
 
     return tp_gdf, fp_gdf_temp
@@ -98,14 +101,14 @@ def get_fractional_sets(preds_gdf, labels_gdf, iou_threshold=0.1, method='one-to
     tp_gdf_temp = left_join[left_join.ID_GT.notnull()].copy()
 
     # IOU computation between GT geometry and Detection geometry
-    geom1 = tp_gdf_temp['geom_DET'].to_numpy()
-    geom2 = tp_gdf_temp['geom_GT'].to_numpy()
+    geom_DET = tp_gdf_temp['geom_DET'].to_numpy()
+    geom_GT = tp_gdf_temp['geom_GT'].to_numpy()
     iou = []
-    for (i, ii) in zip(geom1, geom2):
+    for (i, ii) in zip(geom_DET, geom_GT):
         iou.append(intersection_over_union(i, ii))
     tp_gdf_temp['IOU'] = iou
 
-    if method=='one-to-many':
+    if method == 'one-to-many':
         tp_gdf, fp_gdf_temp = apply_iou_threshold_one_to_many(tp_gdf_temp, iou_threshold)
     else:
         tp_gdf, fp_gdf_temp = apply_iou_threshold_one_to_one(tp_gdf_temp, iou_threshold)
@@ -118,13 +121,13 @@ def get_fractional_sets(preds_gdf, labels_gdf, iou_threshold=0.1, method='one-to
     # FALSE NEGATIVES -> objects that have been missed by the algorithm
     right_join = gpd.sjoin(labels_gdf, preds_gdf, how='left', predicate='intersects', lsuffix='left', rsuffix='right')
 
-    id_gt_tp=tp_gdf['ID_GT'].unique().tolist()
-    suppressed_tp=tp_gdf_temp[~tp_gdf_temp['ID_GT'].isin(id_gt_tp)]
+    id_gt_tp = tp_gdf['ID_GT'].unique().tolist()
+    suppressed_tp = tp_gdf_temp[~tp_gdf_temp['ID_GT'].isin(id_gt_tp)]
     id_gt_filter = suppressed_tp['ID_GT'].unique().tolist()
 
-    fn_too_low_hit_gdf = right_join[right_join['ID_GT'].isin(id_gt_filter)]
-    fn_no_hit_gdf = right_join[right_join.ID_DET.isna()].copy()
-    fn_gdf = pd.concat([fn_no_hit_gdf, fn_too_low_hit_gdf])
+    fn_low_overlap_gdf = right_join[right_join['ID_GT'].isin(id_gt_filter)]
+    fn_no_overlap_gdf = right_join[right_join.ID_DET.isna()].copy()
+    fn_gdf = pd.concat([fn_no_overlap_gdf, fn_low_overlap_gdf])
 
     fn_gdf.drop_duplicates(subset=['ID_GT'], inplace=True)
 
@@ -148,6 +151,6 @@ def get_metrics(tp_gdf, fp_gdf, fn_gdf):
 
     precision = TP / (TP + FP)
     recall    = TP / (TP + FN)
-    f1 = 2*precision*recall/(precision+recall)
+    f1 = 2*precision*recall / (precision+recall)
 
     return precision, recall, f1
diff --git a/scripts/functions/fct_misc.py b/scripts/functions/fct_misc.py
@@ -23,21 +23,22 @@ def format_logger(logger):
             level="ERROR")
     return logger
 
-def test_crs(crs1, crs2 = "EPSG:2056"):
+def test_crs(crs1, crs2="EPSG:2056"):
     '''
     Take the crs of two dataframes and compare them. If they are not the same, stop the script.
     '''
     if isinstance(crs1, gpd.GeoDataFrame):
-        crs1=crs1.crs
+        crs1 = crs1.crs
     if isinstance(crs2, gpd.GeoDataFrame):
-        crs2=crs2.crs
+        crs2 = crs2.crs
 
     try:
         assert(crs1 == crs2), f"CRS mismatch between the two files ({crs1} vs {crs2})."
     except Exception as e:
         print(e)
         sys.exit(1)
 
+
 def ensure_dir_exists(dirpath):
     '''
     Test if a directory exists. If not, make it.