Update pretreatment.py

Code refactoring to solve deadlock issue and improve code organization
2022-01-29 10:41:49 -03:00
parent b6feac01e7
commit 5f0f4ad3e3
1 changed files with 125 additions and 186 deletions
@@ -1,201 +1,140 @@
-# modified from https://github.com/AbnerHqC/GaitSet/blob/master/pretreatment.py
+# This source is based on https://github.com/AbnerHqC/GaitSet/blob/master/pretreatment.py
-
+import argparse
 import logging
 import multiprocessing as mp
 import os
 import pickle
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
 from typing import Tuple
 import cv2
 import numpy as np
-from warnings import warn
+from tqdm import tqdm
 from time import sleep
 import argparse
 import pickle
 from multiprocessing import Pool
 from multiprocessing import TimeoutError as MP_TimeoutError
 START = "START"
 FINISH = "FINISH"
 WARNING = "WARNING"
 FAIL = "FAIL"
-def boolean_string(s):
+def imgs2pickle(img_groups: Tuple, output_path: Path, img_size: int = 64, verbose: bool = False) -> None:
-    if s.upper() not in {'FALSE', 'TRUE'}:
+    """Reads a group of images and saves the data in pickle format.
-        raise ValueError('Not a valid boolean string')
+
-    return s.upper() == 'TRUE'
+    Args:
        img_groups (Tuple): Tuple of (sid, seq, view) and list of image paths.
        output_path (Path): Output path.
        img_size (int, optional): Image resizing size. Defaults to 64.
        verbose (bool, optional): Display debug info. Defaults to False.
    """    
    sinfo = img_groups[0]
    img_paths = img_groups[1]
    to_pickle = []
    for img_file in sorted(img_paths):
        if verbose:
            logging.debug(f'Reading sid {sinfo[0]}, seq {sinfo[1]}, view {sinfo[2]} from {img_file}')
        img = cv2.imread(str(img_file), cv2.IMREAD_GRAYSCALE)
        if img.sum() <= 10000:
            if verbose:
                logging.debug(f'Image sum: {img.sum()}')
            logging.warning(f'{img_file} has no data.')
            continue
        # Get the upper and lower points
        y_sum = img.sum(axis=1)
        y_top = (y_sum != 0).argmax(axis=0)
        y_btm = (y_sum != 0).cumsum(axis=0).argmax(axis=0)
        img = img[y_top: y_btm + 1, :]
        # As the height of a person is larger than the width,
        # use the height to calculate resize ratio.
        ratio = img.shape[1] // img.shape[0]
        img = cv2.resize(img, (img_size * ratio, img_size), interpolation=cv2.INTER_CUBIC)
        # Get the median of the x-axis and take it as the person's x-center.
        x_csum = img.sum(axis=0).cumsum()
        x_center = None
        for idx, csum in enumerate(x_csum):
            if csum > img.sum() / 2:
                x_center = idx
                break
        if not x_center:
            logging.warning(f'{img_file} has no center.')
            continue
        # Get the left and right points
        half_width = img_size // 2
        left = x_center - half_width
        right = x_center + half_width
        if left <= 0 or right >= img.shape[1]:
            left += half_width
            right += half_width
            _ = np.zeros((img.shape[0], half_width))
            img = np.concatenate([_, img, _], axis=1)
        to_pickle.append(img[:, left: right].astype('uint8'))
    if to_pickle:
        to_pickle = np.asarray(to_pickle)
        dst_path = os.path.join(output_path, *sinfo)
        os.makedirs(dst_path, exist_ok=True)
        pkl_path = os.path.join(dst_path, f'{sinfo[2]}.pkl')
        if verbose:
            logging.debug(f'Saving {pkl_path}...')
        pickle.dump(to_pickle, open(pkl_path, 'wb'))   
    if len(to_pickle) < 5:
        logging.warning(f'{sinfo} has less than 5 valid data.')
    logging.info(f'Saved {len(to_pickle)} valid frames to {pkl_path}.')
-parser = argparse.ArgumentParser(description='Test')
+def pretreat(input_path: Path, output_path: Path, img_size: int = 64, workers: int = 4, verbose: bool = False) -> None:
-parser.add_argument('--input_path', default='', type=str,
+    """Reads a dataset and saves the data in pickle format.
                    help='Root path of raw dataset.')
 parser.add_argument('--output_path', default='', type=str,
                    help='Root path for output.')
 parser.add_argument('--log_file', default='./pretreatment.log', type=str,
                    help='Log file path. Default: ./pretreatment.log')
 parser.add_argument('--log', default=False, type=boolean_string,
                    help='If set as True, all logs will be saved. '
                         'Otherwise, only warnings and errors will be saved.'
                         'Default: False')
 parser.add_argument('--worker_num', default=1, type=int,
                    help='How many subprocesses to use for data pretreatment. '
                         'Default: 1')
 parser.add_argument('--img_size', default=64, type=int,
                    help='image size')
 opt = parser.parse_args()
-INPUT_PATH = opt.input_path
+    Args:
-OUTPUT_PATH = opt.output_path
+        input_path (Path): Dataset root path.
-IF_LOG = opt.log
+        output_path (Path): Output path.
-LOG_PATH = opt.log_file
+        img_size (int, optional): Image resizing size. Defaults to 64.
-WORKERS = opt.worker_num
+        workers (int, optional): Number of thread workers. Defaults to 4.
        verbose (bool, optional): Display debug info. Defaults to False.
    """
    img_groups = defaultdict(list)
    logging.info(f'Listing {input_path}')
    total_files = 0
    for img_path in input_path.rglob('*.png'):
        if verbose:
            logging.debug(f'Adding {img_path}')
        *_, sid, seq, view, _ = img_path.as_posix().split(os.sep)
        img_groups[(sid, seq, view)].append(img_path)
        total_files += 1
-T_H = opt.img_size
+    logging.info(f'Total files listed: {total_files}')
 T_W = opt.img_size
-def log2str(pid, comment, logs):
+    progress = tqdm(total=len(img_groups), desc='Pretreating', unit='folder')
    str_log = ''
    if type(logs) is str:
        logs = [logs]
    for log in logs:
        str_log += "# JOB %d : --%s-- %s\n" % (
            pid, comment, log)
    return str_log
-def log_print(pid, comment, logs):
+    with mp.Pool(workers) as pool:
-    str_log = log2str(pid, comment, logs)
+        logging.info(f'Start pretreating {input_path}')
-    if comment in [WARNING, FAIL]:
+        for _ in pool.imap_unordered(partial(imgs2pickle, output_path=output_path, img_size=img_size, verbose=verbose), img_groups.items()):
-        with open(LOG_PATH, 'a') as log_f:
+            progress.update(1)
-            log_f.write(str_log)
+    logging.info('Done')
    if comment in [START, FINISH]:
        if pid % 500 != 0:
            return
    print(str_log, end='')
 def cut_img(img, seq_info, frame_name, pid):
    # A silhouette contains too little white pixels
    # might be not valid for identification.
    if img.sum() <= 10000:
        message = 'seq:%s, frame:%s, no data, %d.' % (
            '-'.join(seq_info), frame_name, img.sum())
        warn(message)
        log_print(pid, WARNING, message)
        return None
    # Get the top and bottom point
    y = img.sum(axis=1)
    y_top = (y != 0).argmax(axis=0)
    y_btm = (y != 0).cumsum(axis=0).argmax(axis=0)
    img = img[y_top:y_btm + 1, :]
    # As the height of a person is larger than the width,
    # use the height to calculate resize ratio.
    _r = img.shape[1] / img.shape[0]
    _t_w = int(T_H * _r)
    img = cv2.resize(img, (_t_w, T_H), interpolation=cv2.INTER_CUBIC)
    # Get the median of x axis and regard it as the x center of the person.
    sum_point = img.sum()
    sum_column = img.sum(axis=0).cumsum()
    x_center = -1
    for i in range(sum_column.size):
        if sum_column[i] > sum_point / 2:
            x_center = i
            break
    if x_center < 0:
        message = 'seq:%s, frame:%s, no center.' % (
            '-'.join(seq_info), frame_name)
        warn(message)
        log_print(pid, WARNING, message)
        return None
    h_T_W = int(T_W / 2)
    left = x_center - h_T_W
    right = x_center + h_T_W
    if left <= 0 or right >= img.shape[1]:
        left += h_T_W
        right += h_T_W
        _ = np.zeros((img.shape[0], h_T_W))
        img = np.concatenate([_, img, _], axis=1)
    img = img[:, left:right]
    return img.astype('uint8')
-def cut_pickle(seq_info, pid):
+if __name__ == '__main__':
-    seq_name = '-'.join(seq_info)
+    parser = argparse.ArgumentParser(description='OpenGait dataset pretreatment module.')
-    log_print(pid, START, seq_name)
+    parser.add_argument('-r', '--root_path', default='', type=str, help='Root path of raw dataset.')
-    seq_path = os.path.join(INPUT_PATH, *seq_info)
+    parser.add_argument('-o', '--output_path', default='', type=str, help='Output path of pickled dataset.')
-    out_dir = os.path.join(OUTPUT_PATH, *seq_info)
+    parser.add_argument('-l', '--log_file', default='./pretreatment.log', type=str, help='Log file path. Default: ./pretreatment.log')
-    frame_list = os.listdir(seq_path)
+    parser.add_argument('-n', '--n_workers', default=4, type=int, help='Number of thread workers. Default: 4')
-    frame_list.sort()
+    parser.add_argument('-i', '--img_size', default=64, type=int, help='Image resizing size. Default 64')
-    count_frame = 0
+    parser.add_argument('-v', '--verbose', default=False, action='store_true', help='Display debug info.')
-    all_imgs = []
+    args = parser.parse_args()
    view = seq_info[-1]
    for _frame_name in frame_list:
        frame_path = os.path.join(seq_path, _frame_name)
        img = cv2.imread(frame_path)[:, :, 0]
        img = cut_img(img, seq_info, _frame_name, pid)
        if img is not None:
            # Save the cut img
            all_imgs.append(img)
            count_frame += 1
-    all_imgs = np.asarray(all_imgs)
+    logging.basicConfig(level=logging.INFO, filename=args.log_file, filemode='w', format='[%(asctime)s - %(levelname)s]: %(message)s')
-    if count_frame > 0:
+    if args.verbose:
-        os.makedirs(out_dir)
+        logging.getLogger().setLevel(logging.DEBUG)
-        all_imgs_pkl = os.path.join(out_dir, '{}.pkl'.format(view))
+        logging.info('Verbose mode is on.')
-        pickle.dump(all_imgs, open(all_imgs_pkl, 'wb'))    
+        for k, v in args.__dict__.items():
            logging.debug(f'{k}: {v}')
-    # Warn if the sequence contains less than 5 frames
+    pretreat(Path(args.root_path), Path(args.output_path), args.n_workers, args.img_size, args.verbose)
    if count_frame < 5:
        message = 'seq:%s, less than 5 valid data.' % (
            '-'.join(seq_info))
        warn(message)
        log_print(pid, WARNING, message)
    log_print(pid, FINISH,
              'Contain %d valid frames. Saved to %s.'
              % (count_frame, out_dir))
 pool = Pool(WORKERS)
 results = list()
 pid = 0
 print('Pretreatment Start.\n'
      'Input path: %s\n'
      'Output path: %s\n'
      'Log file: %s\n'
      'Worker num: %d' % (
          INPUT_PATH, OUTPUT_PATH, LOG_PATH, WORKERS))
 id_list = os.listdir(INPUT_PATH)
 id_list.sort()
 # Walk the input path
 for _id in id_list:
    seq_type = os.listdir(os.path.join(INPUT_PATH, _id))
    seq_type.sort()
    for _seq_type in seq_type:
        view = os.listdir(os.path.join(INPUT_PATH, _id, _seq_type))
        view.sort()
        for _view in view:
            seq_info = [_id, _seq_type, _view]
            out_dir = os.path.join(OUTPUT_PATH, *seq_info)
            # os.makedirs(out_dir)
            results.append(
                pool.apply_async(
                    cut_pickle,
                    args=(seq_info, pid)))
            sleep(0.02)
            pid += 1
 pool.close()
 unfinish = 1
 while unfinish > 0:
    unfinish = 0
    for i, res in enumerate(results):
        try:
            res.get(timeout=0.1)
        except Exception as e:
            if type(e) == MP_TimeoutError:
                unfinish += 1
                continue
            else:
                print('\n\n\nERROR OCCUR: PID ##%d##, ERRORTYPE: %s\n\n\n',
                      i, type(e))
                raise e
 pool.join()