DeepSORT Algorithm For Object Tracking
Shashank V Raghavan
Artificial Intelligence | Autonomous Systems | Resident Robot Geek | Quantum Computing | Product and Program Management
DeepSORT (Deep Simple Online and Realtime Tracking) is an advanced object tracking algorithm that builds upon the original SORT (Simple Online and Realtime Tracking) by incorporating deep learning for more robust performance, especially in complex environments with occlusions and similar-looking objects.
Background: SORT Recap
SORT uses:
A Kalman filter to predict each object's position in the next frame from its current state.
The Hungarian algorithm to associate new detections with existing tracks, with bounding-box overlap (IoU) as the matching cost (a minimal IoU sketch follows this list).
Limitations of SORT:
Motion and overlap alone cannot tell apart objects that look similar or cross paths, which causes frequent identity switches.
A track that is occluded for even a few frames is dropped and reborn with a new ID, because SORT keeps no appearance memory.
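For reference, here is a minimal IoU helper of the kind SORT's cost matrix is built from (the function name and the [x1, y1, x2, y2] box layout are assumptions for illustration):

import numpy as np

def iou(box_a, box_b):
    # Boxes are [x1, y1, x2, y2]. Compute the intersection rectangle.
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    # Intersection over union, guarded against division by zero.
    return inter / (area_a + area_b - inter + 1e-9)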
Key Components of DeepSORT:
Detection: Requires external object detectors like YOLO, Faster R-CNN, or SSD to provide bounding boxes for objects in each frame.
Motion Model (Kalman Filter): Predicts the object's position in the next frame based on its current motion (velocity, position, etc.).
Data Association (Hungarian Algorithm): Matches current detections with existing tracked objects using a cost matrix.
Appearance Descriptor (Deep Learning): A CNN trained on a re-identification dataset maps each detected crop to a fixed-length embedding, so crops of the same object stay close in feature space even across occlusions.
Combined Cost Metric: The association cost blends motion and appearance, weighting the Mahalanobis distance from the Kalman prediction against the cosine distance between appearance embeddings (a small sketch follows this list).
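A minimal sketch of that blended cost (the helper names and the fixed weight lam are illustrative; the DeepSORT paper additionally gates candidate matches with a chi-square threshold on the Mahalanobis distance):

import numpy as np

def cosine_distance(a, b):
    # 1 - cosine similarity between two appearance embeddings.
    a = a / (np.linalg.norm(a) + 1e-9)
    b = b / (np.linalg.norm(b) + 1e-9)
    return 1.0 - float(np.dot(a, b))

def combined_cost(mahalanobis_dist, appearance_dist, lam=0.5):
    # Weighted blend of motion and appearance distance; lam is tunable.
    return lam * mahalanobis_dist + (1.0 - lam) * appearance_dist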
How DeepSORT Works
Detection (Input): An external detector (YOLO, Faster R-CNN, SSD, etc.) supplies bounding boxes for the objects in each frame.
Motion Model (Prediction): Kalman Filter predicts the next position of each tracked object based on its previous state (position, velocity, etc.).
Appearance Descriptor (Deep Learning): A re-identification CNN converts each detected crop into an embedding vector used to measure appearance similarity.
Data Association (Matching): Detections are matched to predicted tracks by minimizing the combined motion-appearance cost, typically with the Hungarian algorithm.
Track Management: Unmatched detections spawn tentative tracks, tracks are confirmed after several consecutive hits, and tracks that stay unmatched for too many frames are deleted (a lifecycle sketch follows this list).
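A minimal sketch of that lifecycle, assuming the conventional n_init and max_age parameter names (the specific values here are illustrative):

class TrackState:
    TENTATIVE, CONFIRMED, DELETED = range(3)

class ManagedTrack:
    def __init__(self, n_init=3, max_age=30):
        self.state = TrackState.TENTATIVE
        self.hits = 0
        self.time_since_update = 0
        self.n_init = n_init
        self.max_age = max_age

    def mark_hit(self):
        # Called when a detection is matched to this track.
        self.hits += 1
        self.time_since_update = 0
        if self.state == TrackState.TENTATIVE and self.hits >= self.n_init:
            self.state = TrackState.CONFIRMED

    def mark_missed(self):
        # Called when no detection matched this track in the current frame.
        self.time_since_update += 1
        if self.state == TrackState.TENTATIVE or self.time_since_update > self.max_age:
            self.state = TrackState.DELETED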
Why DeepSORT is Effective
The appearance embedding lets the tracker re-identify an object after an occlusion instead of minting a new ID, the Kalman filter keeps tracks coherent between detections, and the pipeline stays fast enough for real-time use because detection and feature extraction each run only once per frame.
Below is an implementation of DeepSORT-style tracking using YOLOv5 as the object detector. This setup detects and tracks multiple objects in a video stream.
pip install torch torchvision torchaudio
pip install opencv-python
pip install numpy
pip install filterpy
pip install scipy
pip install yolov5
import cv2
import torch
import numpy as np
from filterpy.kalman import KalmanFilter
from scipy.spatial import distance
from yolov5 import YOLOv5
# Load YOLOv5 model
model = YOLOv5("yolov5s.pt") # Use 'yolov5s' for speed, 'yolov5m' or 'yolov5l' for better accuracy
# Kalman Filter tracker: one instance per tracked object
class Tracker:
    def __init__(self, bbox, feature, tracker_id):
        # 7-dim state (4 observed box values + 3 velocity terms), 4-dim measurement.
        # This follows SORT's convention, in which the 4th state component
        # carries no velocity term.
        self.kalman = KalmanFilter(dim_x=7, dim_z=4)
        # Constant-velocity state transition matrix
        self.kalman.F = np.array([
            [1, 0, 0, 0, 1, 0, 0],
            [0, 1, 0, 0, 0, 1, 0],
            [0, 0, 1, 0, 0, 0, 1],
            [0, 0, 0, 1, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0, 1, 0],
            [0, 0, 0, 0, 0, 0, 1]
        ])
        # Measurement matrix: only the 4 box values are observed
        self.kalman.H = np.array([
            [1, 0, 0, 0, 0, 0, 0],
            [0, 1, 0, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 1, 0, 0, 0]
        ])
        # Higher measurement noise on the last two box values, and high
        # initial uncertainty on the unobserved velocity components
        self.kalman.R[2:, 2:] *= 10.
        self.kalman.P[4:, 4:] *= 1000.
        self.kalman.P *= 10.
        self.kalman.Q[-1, -1] *= 0.01
        self.kalman.Q[4:, 4:] *= 0.01
        self.kalman.x[:4] = np.asarray(bbox, dtype=float).reshape((4, 1))
        self.feature = feature
        self.tracker_id = tracker_id
        self.hits = 1          # number of successful updates
        self.no_losses = 0     # consecutive frames without a match

    def predict(self):
        self.kalman.predict()

    def update(self, bbox, feature):
        self.kalman.update(np.asarray(bbox, dtype=float))
        self.feature = feature
        self.hits += 1
        self.no_losses = 0
# Feature extraction using a simple color histogram
# (a placeholder for DeepSORT's CNN-based appearance descriptor)
def get_features(image, bbox):
    x1, y1, x2, y2 = map(int, bbox)
    crop = image[max(y1, 0):y2, max(x1, 0):x2]
    if crop.size == 0:
        # Degenerate box (e.g., outside the frame): return a zero vector
        return np.zeros(512, dtype=np.float32)  # 8*8*8 histogram bins
    hist = cv2.calcHist([crop], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    return cv2.normalize(hist, hist).flatten()
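# Alternative: a deep appearance descriptor closer to real DeepSORT.
# This is a hedged sketch: it uses an ImageNet-pretrained ResNet-18 from
# torchvision (>= 0.13 for the weights API) as a generic embedding network,
# whereas DeepSORT proper uses a small CNN trained on a person
# re-identification dataset.
import torchvision.models as models
import torchvision.transforms as T

_resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
_embedder = torch.nn.Sequential(*list(_resnet.children())[:-1]).eval()
_preprocess = T.Compose([
    T.ToPILImage(),
    T.Resize((128, 64)),   # typical re-ID input aspect ratio
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def get_features_cnn(image, bbox):
    x1, y1, x2, y2 = map(int, bbox)
    crop = image[max(y1, 0):y2, max(x1, 0):x2]
    if crop.size == 0:
        return np.zeros(512, dtype=np.float32)
    rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)  # OpenCV crops are BGR
    with torch.no_grad():
        emb = _embedder(_preprocess(rgb).unsqueeze(0)).flatten().numpy()
    # L2-normalize so cosine distance behaves well
    return emb / (np.linalg.norm(emb) + 1e-9)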
# Greedy data association using cosine distance between appearance features.
# (DeepSORT proper solves the assignment with the Hungarian algorithm and a
# combined motion + appearance cost; see the alternative below.)
def associate_detections(tracks, detections, features):
    if len(tracks) == 0:
        # No tracks yet: every detection is unmatched
        return [], [], list(range(len(detections)))
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32)
    for i, track in enumerate(tracks):
        for j, feature in enumerate(features):
            cost_matrix[i, j] = distance.cosine(track.feature, feature)
    matched_indices = []
    # Repeatedly take the cheapest remaining pair below the threshold
    while cost_matrix.size > 0 and cost_matrix.min() < 0.5:  # Threshold
        i, j = np.unravel_index(cost_matrix.argmin(), cost_matrix.shape)
        matched_indices.append((i, j))
        cost_matrix[i, :] = np.inf  # mask out the matched track...
        cost_matrix[:, j] = np.inf  # ...and the matched detection
    unmatched_tracks = list(set(range(len(tracks))) - {i for i, _ in matched_indices})
    unmatched_detections = list(set(range(len(detections))) - {j for _, j in matched_indices})
    return matched_indices, unmatched_tracks, unmatched_detections
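# Alternative association: the Hungarian algorithm, as used in SORT/DeepSORT.
# A hedged sketch built on scipy.optimize.linear_sum_assignment; unlike the
# greedy loop above, it solves the assignment optimally.
from scipy.optimize import linear_sum_assignment

def associate_hungarian(tracks, detections, features, max_cost=0.5):
    if len(tracks) == 0:
        return [], [], list(range(len(detections)))
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32)
    for i, track in enumerate(tracks):
        for j, feature in enumerate(features):
            cost_matrix[i, j] = distance.cosine(track.feature, feature)
    row_idx, col_idx = linear_sum_assignment(cost_matrix)
    # Keep only assignments whose cost passes the threshold
    matches = [(i, j) for i, j in zip(row_idx, col_idx) if cost_matrix[i, j] < max_cost]
    unmatched_tracks = list(set(range(len(tracks))) - {i for i, _ in matches})
    unmatched_detections = list(set(range(len(detections))) - {j for _, j in matches})
    return matches, unmatched_tracks, unmatched_detections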
# Video capture
cap = cv2.VideoCapture("input_video.mp4")
trackers = []
tracker_id = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # YOLOv5 detection: each row is [x1, y1, x2, y2, confidence, class]
    results = model.predict(frame)
    detections = results.xyxy[0].cpu().numpy()

    # Extract an appearance feature for every detection
    features = [get_features(frame, det[:4]) for det in detections]

    # Predict new locations for all existing tracks
    for tracker in trackers:
        tracker.predict()

    # Associate detections to existing tracks
    matches, unmatched_tracks, unmatched_detections = associate_detections(trackers, detections, features)

    # Update matched trackers with their new measurements
    for track_idx, det_idx in matches:
        bbox = detections[det_idx][:4]
        feature = features[det_idx]
        trackers[track_idx].update(bbox, feature)

    # Count a miss for every track that found no detection this frame
    for track_idx in unmatched_tracks:
        trackers[track_idx].no_losses += 1

    # Create new trackers for unmatched detections
    for det_idx in unmatched_detections:
        bbox = detections[det_idx][:4]
        feature = features[det_idx]
        trackers.append(Tracker(bbox, feature, tracker_id))
        tracker_id += 1

    # Remove trackers lost for too many consecutive frames
    trackers = [t for t in trackers if t.no_losses < 5]

    # Draw bounding boxes and track IDs
    for tracker in trackers:
        x1, y1, x2, y2 = map(int, tracker.kalman.x[:4].flatten())
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f'ID: {tracker.tracker_id}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    cv2.imshow("DeepSORT Tracking", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
How This Works:
Each frame follows the same pipeline: YOLOv5 detects objects, an appearance feature is extracted from every detected crop, the Kalman filter projects each existing track forward, detections are associated with tracks by appearance distance, matched tracks are corrected with their new measurements, unmatched detections spawn new tracks, and tracks that miss five consecutive frames are dropped. The result is a stable integer ID drawn on every object for as long as it remains trackable.
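If you want the full DeepSORT stack without hand-rolling it, a maintained third-party package such as deep-sort-realtime bundles the Kalman filter, the matching cascade, and a pretrained re-ID embedder. A minimal sketch, assuming that package's API; the parameter value and the detections_ltwh variable (boxes converted from the YOLOv5 xyxy output above) are illustrative:

pip install deep-sort-realtime

from deep_sort_realtime.deepsort_tracker import DeepSort

tracker = DeepSort(max_age=30)  # frames a lost track is kept alive

# detections_ltwh (hypothetical): list of ([left, top, width, height],
# confidence, class) tuples; passing the frame lets the package compute
# appearance embeddings internally.
tracks = tracker.update_tracks(detections_ltwh, frame=frame)
for track in tracks:
    if not track.is_confirmed():
        continue
    x1, y1, x2, y2 = map(int, track.to_ltrb())
    print(track.track_id, (x1, y1, x2, y2))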