ADD online demo for LoFTR.

Jiaming Sun authored and committed 4 years ago
parent 63818b0287
commit 8a6eabeaa3
Changed files:

1. .gitignore (6 lines changed)
2. README.md (68 lines changed)
3. demo/demo_loftr.py (240 lines changed)
4. demo/run_demo.sh (34 lines changed)

.gitignore

@@ -1,4 +1,5 @@
.vscode/
__pycache__/
*.pyc
*.DS_Store
*.swp
@@ -9,4 +10,7 @@ tmp.*
logs/
weights/
dump/
-src/loftr/utils/superglue.py
+demo/*.mp4
+demo/demo_images/
+src/loftr/utils/superglue.py
+demo/utils.py

README.md

@@ -15,12 +15,12 @@ In the meanwhile, discussions about the paper are welcomed in the [discussion pa
- [x] Inference code and pretrained models (DS and OT) (2021-4-7)
- [x] Code for reproducing the test-set results (2021-4-7)
-- [ ] Webcam demo to reproduce the result shown in the GIF above (expected 2021-4-13)
+- [x] Webcam demo to reproduce the result shown in the GIF above (2021-4-13)
- [ ] Training code and training data preparation (expected 2021-6-10)
## Installation
```shell
-# For full pytorch-lightning trainer features
+# For full pytorch-lightning trainer features (recommended)
conda env create -f environment.yaml
conda activate loftr
@@ -33,7 +33,8 @@ We provide the [download link](https://drive.google.com/drive/folders/1DOcOPZb3-
- the megadepth-1500-testset (~600MB).
- 4 pretrained models of indoor-ds, indoor-ot, outdoor-ds and outdoor-ot (each ~45MB).
-By now, the LoFTR-DS model is ready to go!
+By now, the environment is all set and the LoFTR-DS model is ready to go!
If you want to run LoFTR-OT, some extra steps are needed:
<details>
<summary>[Requirements for LoFTR-OT]</summary>
@@ -71,7 +72,55 @@ By now, the LoFTR-DS model is ready to go!
</details>
-An example is in the `notebooks/demo_single_pair.ipynb`.
+An example is given in `notebooks/demo_single_pair.ipynb`.
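
For a quick try outside Jupyter, the sketch below mirrors the single-pair matching flow used by the notebook and by the new `demo/demo_loftr.py`: build `LoFTR` with `default_cfg`, load a checkpoint, and read the `mkpts0_f`/`mkpts1_f`/`mconf` outputs. The image paths and the 640x480 resize are illustrative assumptions, not files shipped with the repo.

```python
# Minimal single-pair matching sketch (run from the repo root; image paths are hypothetical).
import cv2
import torch

from src.loftr import LoFTR, default_cfg

# Load the indoor dual-softmax model (assumes the checkpoint was downloaded to weights/).
matcher = LoFTR(config=default_cfg)
matcher.load_state_dict(torch.load("weights/indoor_ds.ckpt")['state_dict'])
matcher = matcher.eval().cuda()

def load_gray(path, size=(640, 480)):
    """Read an image as grayscale, resize, and convert to a 1x1xHxW float tensor in [0, 1]."""
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, size)
    return torch.from_numpy(img)[None, None].float().cuda() / 255.

batch = {'image0': load_gray("assets/img0.png"),   # hypothetical example images
         'image1': load_gray("assets/img1.png")}

with torch.no_grad():
    matcher(batch)  # LoFTR writes its outputs back into the input dict.

mkpts0 = batch['mkpts0_f'].cpu().numpy()  # matched keypoints in image0 (N x 2)
mkpts1 = batch['mkpts1_f'].cpu().numpy()  # matched keypoints in image1 (N x 2)
mconf = batch['mconf'].cpu().numpy()      # per-match confidence
print(f"{len(mkpts0)} matches")
```

The online demo below wraps this same matching call in a webcam/video loop.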
### Online demo
Run the online demo with a webcam to reproduce the result shown in the GIF above.
```bash
cd demo
./run_demo.sh
```
<details>
<summary>[run_demo.sh]</summary>
```bash
#!/bin/bash
set -e
# set -x
if [ ! -f utils.py ]; then
    echo "Downloading utils.py from the SuperGlue repo."
    echo "We cannot provide this file directly due to its strict licence."
    wget https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/master/models/utils.py
fi
# Use webcam 0 as input source.
input=0
# or use a pre-recorded video given the path.
# input=/home/sunjiaming/Downloads/scannet_test/$scene_name.mp4
# Toggle indoor/outdoor model here.
model_ckpt=../weights/indoor_ds.ckpt
# model_ckpt=../weights/outdoor_ds.ckpt
# Optionally assign the GPU ID.
# export CUDA_VISIBLE_DEVICES=0
echo "Running LoFTR demo.."
eval "$(conda shell.zsh hook)"
conda activate loftr
python demo_loftr.py --weight $model_ckpt --input $input
# To save the input video and output match visualizations.
# python demo_loftr.py --weight $model_ckpt --input $input --save_video --save_input
# Running on remote GPU servers with no GUI.
# Save images first.
# python demo_loftr.py --weight $model_ckpt --input $input --no_display --output_dir="./demo_images/"
# Then convert them to a video.
# ffmpeg -framerate 15 -pattern_type glob -i '*.png' -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
```
</details>
### Reproduce the testing results with pytorch-lightning
@@ -84,14 +133,12 @@ bash ./scripts/reproduce_test/indoor_ds.sh
python test.py configs/data/scannet_test_1500.py configs/loftr/loftr_ds.py --ckpt_path weights/indoor_ds.ckpt --profiler_name inference --gpus=1 --accelerator="ddp"
```
-For visualizing the dump results, please refer to `notebooks/visualize_dump_results.ipynb`.
+For visualizing the results, please refer to `notebooks/visualize_dump_results.ipynb`.
<br/>
## Citation
If you find this code useful for your research, please use the following BibTeX entry.
@@ -100,16 +147,11 @@ If you find this code useful for your research, please use the following BibTeX
@article{sun2021loftr,
title={{LoFTR}: Detector-Free Local Feature Matching with Transformers},
author={Sun, Jiaming and Shen, Zehong and Wang, Yuang and Bao, Hujun and Zhou, Xiaowei},
-journal={CVPR},
+journal={{CVPR}},
year={2021}
}
```
<!-- ## Acknowledgment
This repo is built based on the Mask R-CNN implementation from [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark), and we also use the pretrained Stereo R-CNN weight from [here](https://drive.google.com/file/d/1rZ5AsMms7-oO-VfoNTAmBFOr8O2L0-xt/view?usp=sharing) for initialization. -->
## Copyright
This work is affiliated with ZJU-SenseTime Joint Lab of 3D Vision, and its intellectual property belongs to SenseTime Group Ltd.

demo/demo_loftr.py (new file)

@@ -0,0 +1,240 @@
front_matter = """
------------------------------------------------------------------------
Online demo for [LoFTR](https://zju3dv.github.io/loftr/).
This demo is heavily inspired by [SuperGlue](https://github.com/magicleap/SuperGluePretrainedNetwork/).
We thank the authors for their excellent work.
------------------------------------------------------------------------
"""

import os
import argparse
from pathlib import Path

import cv2
import torch
import numpy as np
import matplotlib.cm as cm

os.sys.path.append("../")  # Add the project directory to the import path.
from src.loftr import LoFTR, default_cfg
from src.config.default import get_cfg_defaults

try:
    from demo.utils import (AverageTimer, VideoStreamer,
                            make_matching_plot_fast, make_matching_plot, frame2tensor)
except ImportError:
    raise ImportError("This demo requires utils.py from SuperGlue, please use run_demo.sh to start this script.")

torch.set_grad_enabled(False)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='LoFTR online demo',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--weight', type=str, help="Path to the checkpoint.")
    parser.add_argument(
        '--input', type=str, default='0',
        help='ID of a USB webcam, URL of an IP camera, '
             'or path to an image directory or movie file')
    parser.add_argument(
        '--output_dir', type=str, default=None,
        help='Directory where to write output frames (If None, no output)')
    parser.add_argument(
        '--image_glob', type=str, nargs='+', default=['*.png', '*.jpg', '*.jpeg'],
        help='Glob if a directory of images is specified')
    parser.add_argument(
        '--skip', type=int, default=1,
        help='Images to skip if input is a movie or directory')
    parser.add_argument(
        '--max_length', type=int, default=1000000,
        help='Maximum length if input is a movie or directory')
    parser.add_argument(
        '--resize', type=int, nargs='+', default=[640, 480],
        help='Resize the input image before running inference. If two numbers, '
             'resize to the exact dimensions, if one number, resize the max '
             'dimension, if -1, do not resize')
    parser.add_argument(
        '--no_display', action='store_true',
        help='Do not display images to screen. Useful if running remotely')
    parser.add_argument(
        '--save_video', action='store_true',
        help='Save output (with match visualizations) to a video.')
    parser.add_argument(
        '--save_input', action='store_true',
        help='Save the input images to a video (for gathering repeatable input source).')
    parser.add_argument(
        '--skip_frames', type=int, default=1,
        help="Skip frames from webcam input.")
    parser.add_argument(
        '--top_k', type=int, default=2000, help="The max vis_range (please refer to the code).")
    parser.add_argument(
        '--bottom_k', type=int, default=0, help="The min vis_range (please refer to the code).")
    opt = parser.parse_args()
    print(front_matter)
    parser.print_help()

    if len(opt.resize) == 2 and opt.resize[1] == -1:
        opt.resize = opt.resize[0:1]
    if len(opt.resize) == 2:
        print('Will resize to {}x{} (WxH)'.format(
            opt.resize[0], opt.resize[1]))
    elif len(opt.resize) == 1 and opt.resize[0] > 0:
        print('Will resize max dimension to {}'.format(opt.resize[0]))
    elif len(opt.resize) == 1:
        print('Will not resize images')
    else:
        raise ValueError('Cannot specify more than two integers for --resize')
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        raise RuntimeError("GPU is required to run this demo.")

    # Initialize LoFTR
    matcher = LoFTR(config=default_cfg)
    matcher.load_state_dict(torch.load(opt.weight)['state_dict'])
    matcher = matcher.eval().to(device=device)

    # Configure I/O
    if opt.save_video:
        print('Writing video to loftr-matches.mp4...')
        writer = cv2.VideoWriter('loftr-matches.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 15, (640*2 + 10, 480))
    if opt.save_input:
        print('Writing video to demo-input.mp4...')
        input_writer = cv2.VideoWriter('demo-input.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 15, (640, 480))

    vs = VideoStreamer(opt.input, opt.resize, opt.skip,
                       opt.image_glob, opt.max_length)
    frame, ret = vs.next_frame()
    assert ret, 'Error when reading the first frame (try different --input?)'

    frame_id = 0
    last_image_id = 0
    frame_tensor = frame2tensor(frame, device)
    last_data = {'image0': frame_tensor}
    last_frame = frame

    if opt.output_dir is not None:
        print('==> Will write outputs to {}'.format(opt.output_dir))
        Path(opt.output_dir).mkdir(exist_ok=True)

    # Create a window to display the demo.
    if not opt.no_display:
        window_name = 'LoFTR Matches'
        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
        cv2.resizeWindow(window_name, (640*2, 480))
    else:
        print('Skipping visualization, will not show a GUI.')

    # Print the keyboard help menu.
    print('==> Keyboard control:\n'
          '\tn: select the current frame as the reference image (left)\n'
          '\td/f: move the range of the matches (ranked by confidence) to visualize\n'
          '\tc/v: increase/decrease the length of the visualization range (i.e., total number of matches) to show\n'
          '\tq: quit')

    timer = AverageTimer()
    vis_range = [opt.bottom_k, opt.top_k]
    while True:
        frame_id += 1
        frame, ret = vs.next_frame()
        if frame_id % opt.skip_frames != 0:
            # print("Skipping frame.")
            continue
        if opt.save_input:
            inp = np.stack([frame]*3, -1)
            inp_rgb = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
            input_writer.write(inp_rgb)
        if not ret:
            print('Finished demo_loftr.py')
            break
        timer.update('data')
        stem0, stem1 = last_image_id, vs.i - 1

        frame_tensor = frame2tensor(frame, device)
        last_data = {**last_data, 'image1': frame_tensor}
        matcher(last_data)  # LoFTR writes the matching results back into last_data.

        total_n_matches = len(last_data['mkpts0_f'])
        mkpts0 = last_data['mkpts0_f'].cpu().numpy()[vis_range[0]:vis_range[1]]
        mkpts1 = last_data['mkpts1_f'].cpu().numpy()[vis_range[0]:vis_range[1]]
        mconf = last_data['mconf'].cpu().numpy()[vis_range[0]:vis_range[1]]

        # Normalize confidence (defaults guard against an empty match set).
        conf_vis_min = 0.
        conf_min, conf_max = 0., 1.
        if len(mconf) > 0:
            conf_min = mconf.min()
            conf_max = mconf.max()
            mconf = (mconf - conf_vis_min) / (conf_max - conf_vis_min + 1e-5)

        timer.update('forward')

        alpha = 0
        color = cm.jet(mconf, alpha=alpha)
        text = [
            'LoFTR',
            '# Matches (showing/total): {}/{}'.format(len(mkpts0), total_n_matches),
        ]
        small_text = [
            f'Showing matches from {vis_range[0]}:{vis_range[1]}',
            f'Confidence Range: {conf_min:.2f}:{conf_max:.2f}',
            'Image Pair: {:06}:{:06}'.format(stem0, stem1),
        ]
        out = make_matching_plot_fast(
            last_frame, frame, mkpts0, mkpts1, mkpts0, mkpts1, color, text,
            path=None, show_keypoints=False, small_text=small_text)

        # Save high quality png, optionally with dynamic alpha support (not released yet).
        # save_path = 'demo_vid/{:06}'.format(frame_id)
        # make_matching_plot(
        #     last_frame, frame, mkpts0, mkpts1, mkpts0, mkpts1, color, text,
        #     path=save_path, show_keypoints=opt.show_keypoints, small_text=small_text)

        if not opt.no_display:
            if opt.save_video:
                writer.write(out)
            cv2.imshow('LoFTR Matches', out)
            key = chr(cv2.waitKey(1) & 0xFF)
            if key == 'q':
                if opt.save_video:
                    writer.release()
                if opt.save_input:
                    input_writer.release()
                vs.cleanup()
                print('Exiting...')
                break
            elif key == 'n':  # Select the current frame as the new reference image.
                last_data['image0'] = frame_tensor
                last_frame = frame
                last_image_id = (vs.i - 1)
                frame_id_left = frame_id
            elif key in ['d', 'f']:  # Shift the visualized match range by 200.
                if key == 'd':
                    if vis_range[0] >= 0:
                        vis_range[0] -= 200
                        vis_range[1] -= 200
                if key == 'f':
                    vis_range[0] += 200
                    vis_range[1] += 200
                print(f'\nChanged the vis_range to {vis_range[0]}:{vis_range[1]}')
            elif key in ['c', 'v']:  # Shrink/grow the visualized match range by 50.
                if key == 'c':
                    vis_range[1] -= 50
                if key == 'v':
                    vis_range[1] += 50
                print(f'\nChanged the vis_range[1] to {vis_range[1]}')
        elif opt.output_dir is not None:
            stem = 'matches_{:06}_{:06}'.format(stem0, stem1)
            out_file = str(Path(opt.output_dir, stem + '.png'))
            print('\nWriting image to {}'.format(out_file))
            cv2.imwrite(out_file, out)
        else:
            raise ValueError("output_dir is required when no display is given.")

        timer.update('viz')
        timer.print()

    cv2.destroyAllWindows()
    vs.cleanup()

demo/run_demo.sh (new file)

@@ -0,0 +1,34 @@
#!/bin/bash
set -e
# set -x
if [ ! -f utils.py ]; then
    echo "Downloading utils.py from the SuperGlue repo."
    echo "We cannot provide this file directly due to its strict licence."
    wget https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/master/models/utils.py
fi
# Use webcam 0 as input source.
input=0
# or use a pre-recorded video given the path.
# input=/home/sunjiaming/Downloads/scannet_test/$scene_name.mp4
# Toggle indoor/outdoor model here.
model_ckpt=../weights/indoor_ds.ckpt
# model_ckpt=../weights/outdoor_ds.ckpt
# Optionally assign the GPU ID.
# export CUDA_VISIBLE_DEVICES=0
echo "Running LoFTR demo.."
eval "$(conda shell.zsh hook)"
conda activate loftr
python demo_loftr.py --weight $model_ckpt --input $input
# To save the input video and output match visualizations.
# python demo_loftr.py --weight $model_ckpt --input $input --save_video --save_input
# Running on remote GPU servers with no GUI.
# Save images first.
# python demo_loftr.py --weight $model_ckpt --input $input --no_display --output_dir="./demo_images/"
# Then convert them to a video.
# ffmpeg -framerate 15 -pattern_type glob -i '*.png' -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4