Module heyvi.recognition

import os
import sys
import random
import torch
import vipy
import vipy.data.meva
import shutil
import numpy as np
from vipy.util import remkdir, filetail, readlist, tolist, filepath
from datetime import datetime
from heyvi.model.yolov3.network import Darknet
import vipy.activity
import itertools
import copy
from torch import nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
import torch.utils.data
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision import transforms
import pytorch_lightning as pl
import json
import math
import heyvi.label
import heyvi.model.ResNets_3D_PyTorch.resnet

try:
    import scipy.special
except ImportError:
    pass 


class ActivityRecognition(object):
    def __init__(self, pretrained=True):
        self.net = None
        self._class_to_index = {}
        self._index_to_class = None
        self._num_frames = 0

    def class_to_index(self, c=None):
        return self._class_to_index if c is None else self._class_to_index[c]
    
    def index_to_class(self, index=None):
        if self._index_to_class is None:
            self._index_to_class = {v:k for (k,v) in self.class_to_index().items()}  # cache
        return self._index_to_class if index is None else self._index_to_class[index]
    
    def classlist(self):
        return [k for (k,v) in sorted(list(self.class_to_index().items()), key=lambda x: x[0])]  # sorted by class name (coincides with index order for the PIP label sets)

    def num_classes(self):
        return len(set(self.index_to_class().keys()))

    def fromindex(self, k):
        index_to_class = self.index_to_class()
        assert k in index_to_class, "Invalid class index '%s'" % (str(k))
        return index_to_class[k]

    def label_confidence(self, video=None, tensor=None, threshold=None):
        raise NotImplementedError()
        logits = self.__call__(video, tensor)
        conf = [[(self.index_to_class(j), s[j]) for j in i[::-1] if threshold is None or s[j]>threshold] for (s,i) in zip(logits, np.argsort(logits, axis=1))]
        return conf if len(logits) > 1 else conf[0]

    def activity(self, video, threshold=None):
        (c,s) = zip(*self.label_confidence(video=video, threshold=None))
        return vipy.activity.Activity(startframe=0, endframe=self._num_frames, category=c[0], actorid=video.actorid(), confidence=s[0]) if (threshold is None or s[0]>threshold) else None
            
    def top1(self, video=None, tensor=None, threshold=None):
        raise NotImplementedError()
        return self.topk(k=1, video=video, tensor=tensor, threshold=threshold)

    def topk(self, k, video=None, tensor=None, threshold=None):
        raise NotImplementedError()
        logits = self.__call__(video, tensor)
        topk = [[self.index_to_class(j) for j in i[-k:][::-1] if threshold is None or s[j] >= threshold] for (s,i) in zip(logits, np.argsort(logits, axis=1))]
        return topk if len(topk) > 1 else topk[0]

    def temporal_support(self):
        return self._num_frames

    def totensor(self, training=False):
        raise NotImplementedError()

    def binary_vector(self, categories):
        y = np.zeros(len(self.classlist())).astype(np.float32)
        for c in tolist(categories):
            y[self.class_to_index(c)] = 1
        return torch.from_numpy(y).type(torch.FloatTensor)
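
# A minimal usage sketch (hedged) of the label/index helpers above.  The base class
# starts with an empty mapping; a subclass such as PIP_370k (below) populates
# self._class_to_index in its constructor, so the mapping helpers are usable even when
# no network weights are loaded (pretrained=False, modelfile=None).
#
#   net = PIP_370k(pretrained=False)              # label mappings only, no weights
#   k = net.class_to_index('person_sits_down')    # class name -> integer index
#   net.index_to_class(k)                         # integer index -> 'person_sits_down'
#   net.classlist()                               # list of all class names
#   y = net.binary_vector(['person_sits_down'])   # multi-hot FloatTensor of length num_classes()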
        
    
    
class PIP_250k(pl.LightningModule, ActivityRecognition):
    """Activity recognition using people in public - 250k stabilized"""
    
    def __init__(self, pretrained=True, deterministic=False, modelfile=None, mlbl=False, mlfl=True, unitnorm=False, bgbce=False):

        # FIXME: remove dependencies here
        from heyvi.model.pyvideoresearch.bases.resnet50_3d import ResNet503D, ResNet3D, Bottleneck3D
        import heyvi.model.ResNets_3D_PyTorch.resnet
        
        super().__init__()
        self._input_size = 112
        self._num_frames = 16        
        self._mean = [0.485, 0.456, 0.406]
        self._std = [0.229, 0.224, 0.225]
        self._mlfl = mlfl
        self._mlbl = mlbl
        self._bgbce = bgbce
        self._unitnorm = unitnorm

        if deterministic:
            np.random.seed(42)

        self._class_to_weight = {'car_drops_off_person': 1.4162811344926518, 'car_picks_up_person': 1.4103618337303332, 'car_reverses': 1.0847976470131024, 'car_starts': 1.0145749063037774, 'car_stops': 0.6659236295324015, 'car_turns_left': 2.942269221156227, 'car_turns_right': 1.1077783089040996, 'hand_interacts_with_person_highfive': 2.793646013249904, 'person': 0.4492053391155403, 'person_abandons_object': 1.0944029463871692, 'person_carries_heavy_object': 0.5848339202761978, 'person_closes_car_door': 0.8616907697519004, 'person_closes_car_trunk': 1.468393359799126, 'person_closes_facility_door': 0.8927495923340439, 'person_embraces_person': 0.6072654081071569, 'person_enters_car': 1.3259274145537951, 'person_enters_scene_through_structure': 0.6928103470838287, 'person_exits_car': 1.6366577285051707, 'person_exits_scene_through_structure': 0.8368692178634396, 'person_holds_hand': 1.2378881634203558, 'person_interacts_with_laptop': 1.6276031281396193, 'person_loads_car': 2.170167410167583, 'person_opens_car_door': 0.7601817241565009, 'person_opens_car_trunk': 1.7255285914206204, 'person_opens_facility_door': 0.9167411017455822, 'person_picks_up_object_from_floor': 1.123251610875369, 'person_picks_up_object_from_table': 3.5979689180114205, 'person_purchases_from_cashier': 7.144918373837205, 'person_purchases_from_machine': 5.920886403645001, 'person_puts_down_object_on_floor': 0.7295795950752353, 'person_puts_down_object_on_shelf': 9.247614426653692, 'person_puts_down_object_on_table': 1.9884672074906158, 'person_reads_document': 0.7940480628992879, 'person_rides_bicycle': 2.662661823600623, 'person_shakes_hand': 0.7819547332927879, 'person_sits_down': 0.8375202893491961, 'person_stands_up': 1.0285510019795079, 'person_steals_object_from_person': 1.0673909796893626, 'person_talks_on_phone': 0.3031855242664589, 'person_talks_to_person': 0.334895684562076, 'person_texts_on_phone': 0.713951043919232, 'person_transfers_object_to_car': 3.2832615561297605, 'person_transfers_object_to_person': 0.9633429807282274, 'person_unloads_car': 1.1051597100801462, 'vehicle': 1.1953172363332243}
        self._class_to_weight['person_puts_down_object_on_shelf'] = 1.0   # run 5

        self._class_to_index = {'car_drops_off_person': 0, 'car_picks_up_person': 1, 'car_reverses': 2, 'car_starts': 3, 'car_stops': 4, 'car_turns_left': 5, 'car_turns_right': 6, 'hand_interacts_with_person_highfive': 7, 'person': 8, 'person_abandons_object': 9, 'person_carries_heavy_object': 10, 'person_closes_car_door': 11, 'person_closes_car_trunk': 12, 'person_closes_facility_door': 13, 'person_embraces_person': 14, 'person_enters_car': 15, 'person_enters_scene_through_structure': 16, 'person_exits_car': 17, 'person_exits_scene_through_structure': 18, 'person_holds_hand': 19, 'person_interacts_with_laptop': 20, 'person_loads_car': 21, 'person_opens_car_door': 22, 'person_opens_car_trunk': 23, 'person_opens_facility_door': 24, 'person_picks_up_object_from_floor': 25, 'person_picks_up_object_from_table': 26, 'person_purchases_from_cashier': 27, 'person_purchases_from_machine': 28, 'person_puts_down_object_on_floor': 29, 'person_puts_down_object_on_shelf': 30, 'person_puts_down_object_on_table': 31, 'person_reads_document': 32, 'person_rides_bicycle': 33, 'person_shakes_hand': 34, 'person_sits_down': 35, 'person_stands_up': 36, 'person_steals_object_from_person': 37, 'person_talks_on_phone': 38, 'person_talks_to_person': 39, 'person_texts_on_phone': 40, 'person_transfers_object_to_car': 41, 'person_transfers_object_to_person': 42, 'person_unloads_car': 43, 'vehicle': 44}

        self._verb_to_noun = {k:set(['car','vehicle','motorcycle','bus','truck']) if (k.startswith('car') or k.startswith('motorcycle') or k.startswith('vehicle')) else set(['person']) for k in self.classlist()}        
        self._class_to_shortlabel = heyvi.label.pip_to_shortlabel  # FIXME: remove dependency here

        if pretrained:
            self._load_pretrained()
            self.net.fc = nn.Linear(self.net.fc.in_features, self.num_classes())
        elif modelfile is not None:
            self._load_trained(modelfile)
        
    def category(self, x):
        yh = self.forward(x if x.ndim == 5 else torch.unsqueeze(x, 0))
        return [self.index_to_class(int(k)) for (c,k) in zip(*torch.max(yh, dim=1))]

    def category_confidence(self, x):
        yh = self.forward(x if x.ndim == 5 else torch.unsqueeze(x, 0))
        return [(self.index_to_class(int(k)), float(c)) for (c,k) in zip(*torch.max(yh, dim=1))]

    def topk(self, x_logits, k):
        yh = x_logits.detach().cpu().numpy()
        topk = [[(self.index_to_class(j), s[j]) for j in i[-k:][::-1]] for (s,i) in zip(yh, np.argsort(yh, axis=1))]
        return topk

    def topk_probability(self, x_logits, k):
        yh = x_logits.detach().cpu().numpy()
        yh_prob = F.softmax(x_logits, dim=1).detach().cpu().numpy()
        topk = [[(self.index_to_class(j), c[j], p[j]) for j in i[-k:][::-1]] for (c,p,i) in zip(yh, yh_prob, np.argsort(yh, axis=1))]
        return topk
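
    # A hedged sketch of the return structure of topk() and topk_probability() above,
    # for a batch of logits x_logits with shape (batch_size, num_classes); class names
    # and scores are illustrative only:
    #
    #   net.topk(x_logits, k=3)
    #   # -> one list per batch element, highest logit first:
    #   #    [[('person_sits_down', 7.1), ('person_stands_up', 5.3), ('person_walks', 2.0)], ...]
    #
    #   net.topk_probability(x_logits, k=3)
    #   # -> same ordering, but each tuple is (class, logit, softmax probability)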
        
    # ---- <LIGHTNING>
    def forward(self, x):
        return self.net(x)  # lightning handles device placement

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
        return optimizer

    def training_step(self, batch, batch_nb, logging=True, valstep=False):
        (x,Y) = batch  
        y_hat = self.forward(x)
        y_hat_softmax = F.softmax(y_hat, dim=1)

        (loss, n_valid, y_validation) = (0, 0, [])
        #C = torch.tensor([self._index_to_training_weight[v] for (k,v) in sorted(self._class_to_index.items(), key=lambda x: x[1])], device=y_hat.device)  # inverse class frequency        
        C = torch.tensor([v for (k,v) in sorted(self._index_to_training_weight.items(), key=lambda x: x[0])], device=y_hat.device)  # inverse class frequency        
        for (yh, yhs, labelstr) in zip(y_hat, y_hat_softmax, Y):
            labels = json.loads(labelstr)
            if labels is None:
                continue  # skip me
            lbllist = [l for lbl in labels for l in set(lbl)]  # list of multi-labels within clip (unpack from JSON to use default collate_fn)
            lbllist = [l for l in lbllist if l in self._class_to_index]  # only allowable classes
            lbl_frequency = vipy.util.countby(lbllist, lambda x: x)  # frequency within clip
            lbl_weight = {k:v/float(len(lbllist)) for (k,v) in lbl_frequency.items()}  # multi-label likelihood within clip, sums to one            
            for (y,w) in lbl_weight.items():
                if valstep:
                    # Pick all labels normalized (https://papers.nips.cc/paper/2019/file/da647c549dde572c2c5edc4f5bef039c-Paper.pdf)
                    loss += float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                elif self._mlfl and not self._bgbce:
                    # Pick all labels normalized, with multi-label focal loss
                    loss += torch.min(torch.tensor(1.0, device=y_hat.device), ((w-yhs[self._class_to_index[y]])/w)**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                elif self._mlfl and self._bgbce:
                    if self._index_to_class[self._class_to_index[y]].endswith('_moves'):  # background class (FIXME)
                        loss += float(self._index_to_training_weight[self._class_to_index[y]])*float(w)*F.binary_cross_entropy_with_logits(torch.unsqueeze(yh, dim=0), torch.zeros_like(torch.unsqueeze(yh, dim=0))) # background regularization (all zeros)
                    else:
                        loss += torch.min(torch.tensor(1.0, device=y_hat.device), ((w-yhs[self._class_to_index[y]])/w)**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                elif self._mlbl:
                    # Pick all labels normalized with multi-label background loss
                    #j_bg_person = self._class_to_index['person'] if 'person' in self._class_to_index else self._class_to_index['person_walks']  # FIXME: does not generalize
                    #j_bg_vehicle = self._class_to_index['vehicle'] if 'vehicle' in self._class_to_index else self._class_to_index['car_moves']  # FIXME: does not generalize
                    #j = j_bg_person if (y.startswith('person') or y.startswith('hand')) else j_bg_vehicle
                    #loss += ((1-torch.sqrt(yhs[j]*yhs[self._class_to_index[y]]))**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                    raise NotImplementedError('Deprecated')
                else:
                    # Pick all labels normalized: https://papers.nips.cc/paper/2019/file/da647c549dde572c2c5edc4f5bef039c-Paper.pdf
                    loss += float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)

            n_valid += 1
            if len(lbllist) > 0:
                y_validation.append( (yh, self._class_to_index[max(lbllist, key=lbllist.count)]) )  # most frequent label in clip
        loss = loss / float(max(1, n_valid))  # batch reduction: mean

        if logging:
            self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss if not valstep else {'loss': loss, 'logit': torch.stack([x for (x,c) in y_validation]), 'classindex': [c for (x,c) in y_validation]}
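
    # A worked sketch (hedged) of the per-clip label weighting in training_step() above.
    # Each collated label is a JSON string of per-frame label lists, e.g.
    #
    #   labelstr = '[["person_walks"], ["person_walks"], ["person_walks", "person_talks_on_phone"], ["person_talks_on_phone"]]'
    #
    # which unpacks to three 'person_walks' and two 'person_talks_on_phone' frame labels,
    # so lbl_weight = {'person_walks': 0.6, 'person_talks_on_phone': 0.4}.  Each label's
    # cross entropy term is then scaled by its within-clip weight w, and the weights sum to one.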

    def validation_step(self, batch, batch_nb):
        loss = self.training_step(batch, batch_nb, logging=False, valstep=True)['loss']
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        return {'val_loss': loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        self.log('val_loss', avg_loss, on_epoch=True, prog_bar=False, logger=True)
        self.log('avg_val_loss', avg_loss, on_epoch=True, prog_bar=True, logger=True)                
        return {'val_loss': avg_loss, 'avg_val_loss': avg_loss}                         
    #---- </LIGHTNING>
    
    @classmethod
    def from_checkpoint(cls, checkpointpath):
        return cls().load_from_checkpoint(checkpointpath)  # lightning
            
    def _load_trained(self, ckptfile):
        self.net = heyvi.model.ResNets_3D_PyTorch.resnet.generate_model(50, n_classes=self.num_classes(), unitnorm=self._unitnorm)
        t = torch.split(self.net.conv1.weight.data, dim=1, split_size_or_sections=1)
        self.net.conv1.weight.data = torch.cat( (*t, t[-1]), dim=1).contiguous()
        self.net.conv1.in_channels = 4  # inflate RGB -> RGBA
        self.load_state_dict(torch.load(ckptfile)['state_dict'])  # FIXME
        self.eval()
        return self
        
    def _load_pretrained(self):

        pthfile = vipy.downloader.downloadif('https://dl.dropboxusercontent.com/s/t3xge6lrfqpklr0/r3d50_kms_200ep.pth',
                                                vipy.util.tocache('r3d50_KMS_200ep.pth'),  # set VIPY_CACHE env 
                                                sha1='39ea626355308d8f75307cab047a8d75862c3261')
        
        net = heyvi.model.ResNets_3D_PyTorch.resnet.generate_model(50, n_classes=1139, unitnorm=self._unitnorm)
        pretrain = torch.load(pthfile, map_location='cpu')
        net.load_state_dict(pretrain['state_dict'])

        # Inflate RGB -> RGBA         
        t = torch.split(net.conv1.weight.data, dim=1, split_size_or_sections=1)
        net.conv1.weight.data = torch.cat( (*t, t[-1]), dim=1).contiguous()
        net.conv1.in_channels = 4

        self.net = net

        return self
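
    # A hedged note on the RGB -> RGBA inflation above: the conv1 kernel of shape
    # (out_channels, 3, kT, kH, kW) is split into three single-channel slices and the last
    # slice is appended again, giving a (out_channels, 4, kT, kH, kW) kernel that accepts
    # the extra foreground/background mask channel produced by _totensor():
    #
    #   t = torch.split(net.conv1.weight.data, dim=1, split_size_or_sections=1)  # three (out,1,...) slices
    #   net.conv1.weight.data = torch.cat((*t, t[-1]), dim=1).contiguous()       # four input channels
    #   net.conv1.in_channels = 4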

    @staticmethod
    def _totensor(v, training, validation, input_size, num_frames, mean, std, noflip=None, show=False, doflip=False):
        assert isinstance(v, vipy.video.Scene), "Invalid input"
        
        try:
            v = v.download() if (not v.isloaded() and not v.hasfilename() and v.hasurl()) else v  # fetch it if necessary, but do not do this during training!        
            if training or validation:
                random.seed()  # force randomness after fork() 
                (ai,aj) = (v.primary_activity().startframe(), v.primary_activity().endframe())  # activity (start,end)
                (ti,tj) = (v.actor().startframe(), v.actor().endframe())  # track (start,end) 
                startframe = random.randint(max(0, ti-(num_frames//2)), max(1, tj-(num_frames//2)))  # random startframe that contains track
                endframe = min((startframe+num_frames), aj)  # endframe truncated to be end of activity
                (startframe, endframe) = (startframe, endframe) if (startframe < endframe) else (max(0, aj-num_frames), aj)  # fallback
                assert endframe - startframe <= num_frames
                vc = v.clone().clip(startframe, endframe)    # may fail for some short clips
                vc = vc.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)   
                vc = vc.fliplr() if (doflip or (random.random() > 0.5)) and (noflip is None or vc.category() not in noflip) else vc
            else:
                vc = v.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)  # TESTING: this may introduce a preview()
                vc = vc.fliplr() if doflip and (noflip is None or vc.category() not in noflip) else vc
                
            if show:
                vc.clone().resize(512,512).show(timestamp=True)
                vc.clone().binarymask().frame(0).rgb().show(figure='binary mask: frame 0')
                
            vc = vc.load(shape=(input_size, input_size, 3)).normalize(mean=mean, std=std, scale=1.0/255.0)  # [0,255] -> [0,1], triggers load() with known shape
            (t,lbl) = vc.torch(startframe=0, length=num_frames, boundary='cyclic', order='cdhw', withlabel=training or validation, nonelabel=True)  # (c=3)x(d=num_frames)x(H=input_size)x(W=input_size), reuses vc._array
            t = torch.cat((t, vc.asfloatmask(fg=0.5, bg=-0.5).torch(startframe=0, length=num_frames, boundary='cyclic', order='cdhw')), dim=0)  # (c=4) x (d=num_frames) x (H=input_size) x (W=input_size), copy
            
        except Exception as e:
            if training or validation:
                #print('ERROR: %s' % (str(v)))
                t = torch.zeros(4, num_frames, input_size, input_size)  # skip me
                lbl = None
            else:
                print('WARNING: discarding tensor for video "%s" with exception "%s"' % (str(v), str(e)))
                t = torch.zeros(4, num_frames, input_size, input_size)  # skip me (should never get here)
            
        if training or validation:
            return (t, json.dumps(lbl))  # json to use default collate_fn
        else:
            return t

    def totensor(self, v=None, training=False, validation=False, show=False, doflip=False):
        """Return captured lambda function if v=None, else return tensor"""    
        assert v is None or isinstance(v, vipy.video.Scene), "Invalid input"
        f = (lambda v, num_frames=self._num_frames, input_size=self._input_size, mean=self._mean, std=self._std, training=training, validation=validation, show=show:
             PIP_250k._totensor(v, training, validation, input_size, num_frames, mean, std, noflip=['car_turns_left', 'car_turns_right'], show=show, doflip=doflip))
        return f(v) if v is not None else f
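
# A hedged usage sketch of PIP_250k.totensor() above: with v=None it returns a captured
# preprocessing function (e.g. for dataset mapping), otherwise it returns the
# (4 x 16 x 112 x 112) input tensor directly.  'v' is assumed to be a vipy.video.Scene
# clip with a primary actor track, and the checkpoint path is a placeholder.
#
#   net = PIP_250k(pretrained=False, modelfile='/path/to/checkpoint.ckpt')
#   t = net.totensor(v)                  # (4,16,112,112) tensor for one clip
#   net.category_confidence(t)           # -> [(category, confidence)]
#
#   f = net.totensor(training=True)      # captured lambda
#   (t, lbl) = f(v)                      # (tensor, JSON-encoded label list)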
    


class PIP_370k(PIP_250k, pl.LightningModule, ActivityRecognition):

    def __init__(self, pretrained=True, deterministic=False, modelfile=None, mlbl=False, mlfl=True, unitnorm=False, bgbce=False):
        pl.LightningModule.__init__(self)
        ActivityRecognition.__init__(self)  

        self._input_size = 112
        self._num_frames = 16        
        self._mean = [0.485, 0.456, 0.406]
        self._std = [0.229, 0.224, 0.225]
        self._mlfl = mlfl
        self._mlbl = mlbl
        self._calibrated = False
        self._calibrated_constant = None  # -1.5
        self._bgbce = bgbce
        self._unitnorm = unitnorm

        if deterministic:
            np.random.seed(42)
        
        # Generated using vipy.dataset.Dataset(...).multilabel_inverse_frequency_weight()
        self._class_to_training_weight = {'car_drops_off_person': 0.7858124882763793, 'car_moves': 0.18439798528529147, 'car_picks_up_person': 0.7380666753394193, 'car_reverses': 0.5753369570213479, 'car_starts': 0.47486292483745757, 'car_stops': 0.44244800737774037, 'car_turns_left': 0.7697107319736983, 'car_turns_right': 0.5412936796835607, 'hand_interacts_with_person': 0.2794031245117859, 'person_abandons_package': 1.0789960714517162, 'person_carries_heavy_object': 0.5032333530901552, 'person_closes_car_door': 0.46460114438995603, 'person_closes_car_trunk': 0.6824201392305784, 'person_closes_facility_door': 0.38990434394080076, 'person_embraces_person': 0.6457437695527715, 'person_enters_car': 0.6934926810021877, 'person_enters_scene_through_structure': 0.2586965095740063, 'person_exits_car': 0.6766386632434479, 'person_exits_scene_through_structure': 0.33054895987676847, 'person_interacts_with_laptop': 0.6720176496986436, 'person_loads_car': 0.6880555743488312, 'person_opens_car_door': 0.4069868136393968, 'person_opens_car_trunk': 0.6911966903970317, 'person_opens_facility_door': 0.3018924474724252, 'person_picks_up_object': 0.4298381074082487, 'person_purchases_from_cashier': 5.479834409621331, 'person_purchases_from_machine': 5.31528236654537, 'person_puts_down_object': 0.2804690906037155, 'person_reads_document': 0.5476186269530937, 'person_rides_bicycle': 1.6090962879286763, 'person_sits_down': 0.4750148103149501, 'person_stands_up': 0.5022364750834624, 'person_steals_object': 0.910991409921711, 'person_talks_on_phone': 0.15771902851484076, 'person_talks_to_person': 0.21362675034201736, 'person_texts_on_phone': 0.3328378404741194, 'person_transfers_object_to_car': 2.964890512157848, 'person_transfers_object_to_person': 0.6481292773603928, 'person_unloads_car': 0.515379337544623, 'person_walks': 6.341278284010202}
        self._class_to_weight = self._class_to_training_weight  # backwards compatibility

        # Generated using vipy.dataset.Dataset(...).class_to_index()
        self._class_to_index = {'car_drops_off_person': 0, 'car_moves': 1, 'car_picks_up_person': 2, 'car_reverses': 3, 'car_starts': 4, 'car_stops': 5, 'car_turns_left': 6, 'car_turns_right': 7, 'hand_interacts_with_person': 8, 'person_abandons_package': 9, 'person_carries_heavy_object': 10, 'person_closes_car_door': 11, 'person_closes_car_trunk': 12, 'person_closes_facility_door': 13, 'person_embraces_person': 14, 'person_enters_car': 15, 'person_enters_scene_through_structure': 16, 'person_exits_car': 17, 'person_exits_scene_through_structure': 18, 'person_interacts_with_laptop': 19, 'person_loads_car': 20, 'person_opens_car_door': 21, 'person_opens_car_trunk': 22, 'person_opens_facility_door': 23, 'person_picks_up_object': 24, 'person_purchases_from_cashier': 25, 'person_purchases_from_machine': 26, 'person_puts_down_object': 27, 'person_reads_document': 28, 'person_rides_bicycle': 29, 'person_sits_down': 30, 'person_stands_up': 31, 'person_steals_object': 32, 'person_talks_on_phone': 33, 'person_talks_to_person': 34, 'person_texts_on_phone': 35, 'person_transfers_object_to_car': 36, 'person_transfers_object_to_person': 37, 'person_unloads_car': 38, 'person_walks': 39}
        
        self._index_to_training_weight = {self._class_to_index[k]:v for (k,v) in self._class_to_weight.items()}

        self._verb_to_noun = {k:set(['car','vehicle','motorcycle','bus','truck']) if (k.startswith('car') or k.startswith('motorcycle') or k.startswith('vehicle')) else set(['person']) for k in self.classlist()}        
        self._class_to_shortlabel = heyvi.label.pip_to_shortlabel
        self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )

        if pretrained:
            self._load_pretrained()
            self.net.fc = nn.Linear(self.net.fc.in_features, self.num_classes())
        elif modelfile is not None:
            self._load_trained(modelfile)

    def topk(self, x, k=None):
        """Return the top-k classes for a 3 second activity proposal along with framewise ground truth"""        
        yh = self.forward(x if x.ndim == 5 else x.unsqueeze(0)).detach().cpu().numpy()
        k = k if k is not None else self.num_classes()
        return [ [self.index_to_class(int(j)) for j in i[-k:][::-1]] for (s,i) in zip(yh, np.argsort(yh, axis=1))]
            
    @staticmethod
    def _totensor(v, training, validation, input_size, num_frames, mean, std, noflip=None, show=False, doflip=False, stride_jitter=3, asjson=False, classname='heyvi.recognition.PIP_370k'):
        assert isinstance(v, vipy.video.Scene), "Invalid input"
        
        try:
            v = v.download() if (not v.isloaded() and v.hasurl() and not v.hasfilename()) else v  # fetch it if necessary, but do not do this during training!        
            if training or validation:
                vc = v.clone()  # these frames are shared across all tracks
                random.seed()  # force randomness after fork() 
                (clipstart, clipend) = vc.cliprange()  # clip (start, end) relative to video 
                (clipstart, clipend) = (clipstart if clipstart is not None else 0,   
                                        clipend if clipend is not None else int(np.floor(v.duration_in_frames_of_videofile() * (vc.framerate() / v.framerate_of_videofile()))))  # (yuck)
                # WARNINGS: 
                # - There exist videos with tracks outside the image rectangle due to the padding in stabilization.  
                # - There exist MEVA videos that have no tracks at the beginning and end of the padded clip since the annotations only exist for the activity
                # - There exist MEVA videos with activities that are longer than the tracks, if so, keep the interval of the activity that contains the track
                # - There exist MEVA videos with multiple objects, need to include only primary actor
                
                # - turning activities may be outside the frame (filter these)
                # - turning activities may turn into the stabilized black area.  Is this avoidable?
                # - all of the training activities should be centered on the activity.  See if not.
                
                if (clipend - clipstart) > (num_frames + stride_jitter):
                    a = vc.primary_activity().clone().padto(num_frames/float(vc.framerate()))  # for context only, may be past end of clip now!
                    (ai, aj) = (a.startframe(), a.endframe())  # activity (start,end) relative to (clipstart, clipend)
                    (ai, aj) = (max(ai, vc.actor().startframe()), min(aj, vc.actor().endframe()))  # clip activity to when actor is present
                    startframe = random.randint(ai, aj-num_frames-1) if aj-num_frames-1 > ai else ai
                    startframe = max(0, startframe + random.randint(-stride_jitter, stride_jitter))   # +/- 3 frames jitter for activity stride
                    endframe = min(clipend-clipstart-1, startframe + num_frames)  # new end cannot be past duration of clip
                    if (endframe > startframe) and ((endframe - startframe) < (clipend - clipstart)):
                        vc = vc.clip(startframe, endframe)
                    else: 
                        raise ValueError('invalid clip for "%s"' % str(v))
                vc = vc.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)   
                vc = vc.fliplr() if (doflip or (random.random() > 0.5)) and (noflip is None or vc.category() not in noflip) else vc
            else:
                vc = v.clone(shallow=True)  # shallow clone, trackcrop() will result in new array and v will be preserved
                vc = vc.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)  # This may introduce a preview()
                vc = vc.fliplr() if doflip and (noflip is None or vc.category() not in noflip) else vc
                
            if show:
                vc.clone().resize(512,512).show(timestamp=True)
                vc.clone().binarymask().frame(0).gain(255).rgb().show(figure='binary mask: frame 0')
                
            vc = vc.load(shape=(input_size, input_size, 3)).normalize(mean=mean, std=std, scale=1.0/255.0)  # [0,255] -> [0,1], triggers load() with known shape
            (t,lbl) = vc.torch(startframe=0, length=num_frames, boundary='repeat', order='cdhw', withlabel=training or validation, nonelabel=True)  # (c=3)x(d=num_frames)x(H=input_size)x(W=input_size), reuses vc._array
            t = torch.cat((t, vc.asfloatmask(fg=0.5, bg=-0.5).torch(startframe=0, length=num_frames, boundary='repeat', order='cdhw')), dim=0)  # (c=4) x (d=num_frames) x (H=input_size) x (W=input_size), copy

        except Exception as e:
            if training or validation:
                print('[heyvi.recognition.%s._totensor][SKIPPING]: video="%s", exception="%s"' % (classname, str(vc), str(e)))
                (t, lbl) = (torch.zeros(4, num_frames, input_size, input_size), None)  # must always return conformal tensor (label=None means it will be ignored)
            else:
                print('[heyvi.recognition.%s._totensor][ERROR]: discarding tensor for video "%s" with exception "%s"' % (classname, str(vc), str(e)))
                #t = torch.zeros(4, num_frames, input_size, input_size)  # skip me (should never get here)
                raise

        if training or validation:
            return (t, json.dumps(lbl) if not asjson else lbl)  # json to use default torch collate_fn
        else:
            return t

    def totensor(self, v=None, training=False, validation=False, show=False, doflip=False, asjson=False):
        """Return captured lambda function if v=None, else return tensor"""
        assert v is None or isinstance(v, vipy.video.Scene), "Invalid input"
        f = (lambda v, num_frames=self._num_frames, input_size=self._input_size, mean=self._mean, std=self._std, training=training, validation=validation, show=show, classname=self.__class__.__name__:
             PIP_370k._totensor(v, training, validation, input_size, num_frames, mean, std, noflip=['car_turns_left', 'car_turns_right', 'vehicle_turns_left', 'vehicle_turns_right', 'motorcycle_turns_left', 'motorcycle_turns_right'], show=show, doflip=doflip, asjson=asjson, classname=classname))
        return f(v) if v is not None else f
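
    # A hedged sketch of how the captured transform above can feed a torch DataLoader;
    # 'dataset' and the TransformedDataset wrapper applying f to each vipy.video.Scene are
    # hypothetical.  Labels are returned as JSON strings so the default collate_fn can
    # batch them (they are decoded again with json.loads inside training_step).
    #
    #   net = PIP_370k(pretrained=True)
    #   f = net.totensor(training=True)
    #   loader = torch.utils.data.DataLoader(TransformedDataset(dataset, f),
    #                                        batch_size=32, num_workers=8, shuffle=True)
    #   trainer = pl.Trainer(gpus=1, max_epochs=10)   # args vary by pytorch_lightning version
    #   trainer.fit(net, loader)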


class CAP(PIP_370k, pl.LightningModule, ActivityRecognition):
    def __init__(self, modelfile=None, deterministic=False, pretrained=None, mlbl=None, mlfl=True, calibrated_constant=None, calibrated=False, unitnorm=False, bgbce=False):
        pl.LightningModule.__init__(self)
        ActivityRecognition.__init__(self)  

        self._input_size = 112
        self._num_frames = 16        
        self._mean = [0.485, 0.456, 0.406]
        self._std = [0.229, 0.224, 0.225]
        self._mlfl = True
        self._mlbl = False
        self._calibrated_constant = calibrated_constant
        self._calibrated = calibrated
        self._unitnorm = unitnorm
        self._bgbce = bgbce

        if deterministic:
            np.random.seed(42)

        version = 5  
        
        if version == 1:
            print('[heyvi.recognition.CAP]: version == 1')  # cap_l2norm_e23s96095.ckpt and earlier

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            # - WARNING: under-represented classes are truncated at a maximum weight of one
            # - python 3.7 can use importlib.resources
            self._class_to_training_weight = {k:float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_training_weight.csv'))}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility
            
            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class.csv'))}
            self._index_to_training_weight = {self._class_to_index[k]:v for (k,v) in self._class_to_weight.items()}
            
            # Generated using vipy.dataset.Dataset.class_to_shortlabel()
            self._class_to_shortlabel = dict(vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_shortlabel.csv')))
            self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )

        elif version == 2:
            print('[heyvi.recognition.CAP]: version==2')

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            # - WARNING: under-represented classes are truncated at a maximum weight of one
            self._index_to_training_weight = {int(k):float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'coarse_index_to_training_weight.csv'))}

            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'coarse_class_to_index.csv'))}
            self._index_to_class = {int(k):v for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'coarse_index_to_class.csv'))}

            # Derived
            self._class_to_training_weight = {k:self._index_to_training_weight[v] for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility

        elif version == 3:
            print('[heyvi.recognition.CAP]: version==3')

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            self._index_to_training_weight = {int(k):float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_index_to_training_weight.csv'))}

            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_class_to_index.csv'))}
            self._index_to_class = {int(k):v for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_index_to_class.csv'))}

            # Derived
            self._class_to_training_weight = {k:self._index_to_training_weight[v] for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility

        elif version == 4:
            print('[heyvi.recognition.CAP]: version==4')

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            self._index_to_training_weight = {int(k):float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'joint_background_index_to_training_weight.csv'))}

            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_class_to_index.csv'))}
            self._index_to_class = {int(k):v for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_index_to_class.csv'))}

            # Derived
            self._class_to_training_weight = {k:self._index_to_training_weight[v] for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility
            
        elif version == 5:
            print('[heyvi.recognition.CAP]: version == 5') 

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            # - WARNING: under-represented classes are truncated at a maximum weight of one
            # - python 3.7 can use importlib.resources
            self._index_to_training_weight = {k:float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'index_to_meva_training_weight.csv'))}
            
            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class.csv'))}
            self._class_to_training_weight = {k:self._index_to_training_weight[v] if v in self._index_to_training_weight else 0 for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility            

            # Generated using vipy.dataset.Dataset.class_to_shortlabel()
            self._class_to_shortlabel = dict(vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_shortlabel.csv')))
            self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )

        else:
            raise ValueError('invalid version "%s"' % str(version))
            
        # Generated using vipy.dataset.Dataset.class_to_shortlabel()
        self._class_to_shortlabel = dict(vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_shortlabel.csv')))
        self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )


        # Calibration state: trained at validation epoch end
        if self._calibrated:
            self.register_buffer('_calibration_multiclass', torch.zeros(1,1))
            self.register_buffer('_calibration_binary', torch.zeros(3,self.num_classes()))

        if modelfile is not None:
            self._load_trained(modelfile)
        else:
            self._load_pretrained()
            self.net.fc = nn.Linear(self.net.fc.in_features, self.num_classes())

    
    #---- <LIGHTNING>
    def forward(self, x):
        return self.net(x)  

    def validation_step(self, batch, batch_nb):
        s = self.training_step(batch, batch_nb, logging=False, valstep=True)
        self.log('val_loss', s['loss'], on_step=True, on_epoch=True, prog_bar=False, logger=True)
        return {'val_loss': s['loss'], 'logit': s['logit'], 'classindex':s['classindex']}

    def validation_step_end(self, outputs):
        return {'val_loss': outputs['val_loss'],  
                'logit': outputs['logit'],  # accumulated across GPUs (single machine multi-GPU only)
                'classindex': outputs['classindex']}

    def validation_epoch_end(self, outputs):        
        avg_loss = torch.cat([x['val_loss'].flatten() for x in outputs]).mean()
        self.log('val_loss', avg_loss, on_epoch=True, prog_bar=False, logger=True)
        self.log('avg_val_loss', avg_loss, on_epoch=True, prog_bar=True, logger=True)  # for checkpointing

        # Calibration: will be saved as registered buffer in checkpoint for calibration
        if self._calibrated and self.trainer.is_global_zero:
            from netcal.scaling import LogisticCalibration, TemperatureScaling
            logits = torch.stack([x for output in outputs for x in output['logit']]).detach().cpu().numpy()
            ground_truth = torch.cat([x.flatten() for output in outputs for x in output['classindex']]).flatten().detach().cpu().numpy()
            multiclass = TemperatureScaling()
            multiclass.fit(F.softmax(torch.from_numpy(logits), dim=1).cpu().numpy(), ground_truth)
            binary = {k:(LogisticCalibration(), float(np.mean(logits[:,k]))) for k in sorted(self.class_to_index().values())}
            for (k,(b,m)) in binary.items():
                (binary_confidences, binary_ground_truth) = (torch.sigmoid(torch.from_numpy(logits[:,k]-m).flatten()).cpu().numpy(), np.array([1 if y==k else 0 for y in ground_truth]))
                if np.any(binary_ground_truth):
                    b.fit(binary_confidences, binary_ground_truth)
                else:
                    binary[k] = (None,0)  # no samples for calibration, prediction will always be zero

            with torch.no_grad():
                calibration_multiclass = torch.tensor(float(multiclass._sites['weights']['values']))
                calibration_binary = torch.stack((torch.tensor([float(b._sites['weights']['values']) if b is not None else 0 for (k,(b,m)) in binary.items()]), 
                                                  torch.tensor([float(b._sites['bias']['values']) if b is not None else 0 for (k,(b,m)) in binary.items()]), 
                                                  torch.tensor([float(m) if b is not None else 0 for (k,(b,m)) in binary.items()])))
            
                self._calibration_multiclass[:] = calibration_multiclass.to(device=torch.device('cuda:0'))
                self._calibration_binary[:] = calibration_binary.to(device=torch.device('cuda:0'))
    #---- </LIGHTNING>
        
    def totensor(self, v=None, training=False, validation=False, show=False, doflip=False, asjson=False):
        """Return captured lambda function if v=None, else return tensor"""    
        assert v is None or isinstance(v, vipy.video.Scene), "Invalid input"
        f = (lambda v, num_frames=self._num_frames, input_size=self._input_size, mean=self._mean, std=self._std, training=training, validation=validation, show=show, classname=self.__class__.__name__, doflip=doflip, asjson=asjson:
             PIP_370k._totensor(v, training, validation, input_size, num_frames, mean, std, noflip=['car_turns_left', 'car_turns_right', 'vehicle_turns_left', 'vehicle_turns_right', 'motorcycle_turns_left', 'motorcycle_turns_right'], show=show, doflip=doflip, asjson=asjson, classname=classname))
        return f(v) if v is not None else f

    def calibration(self, x_logits):
        assert torch.is_tensor(self._calibration_multiclass) and self._calibration_multiclass.shape == (1,1)
        assert torch.is_tensor(self._calibration_binary) and self._calibration_binary.shape == (3, self.num_classes())
        (n, T, (w,b,o), eps) = (self.num_classes(), self._calibration_multiclass, self._calibration_binary, np.finfo(np.float64).eps)  # (TemperatureScaling, PlattScaling=(weight, bias, offset))

        lr = torch.from_numpy(np.multiply(1.0/(1+np.exp(-(np.multiply(np.array(w*0 + 1).reshape(1,n).astype(np.float64), scipy.special.logit(np.clip(torch.sigmoid(x_logits-o.view(1,n)).detach().cpu().numpy().astype(np.float64), eps, 1-eps))))+np.array(b).reshape(1,n).astype(np.float64))), (np.array(o)!=0).reshape(1,n)).astype(np.float32))
        sm = F.softmax(torch.log(torch.clamp(F.softmax(x_logits, dim=1), eps, 1-eps)) / T, dim=1)
        
        #sm = F.softmax(x_logits / T, dim=1)  # temperature only
        #lr = torch.multiply(torch.sigmoid(x_logits-o.view(1,n)+b.view(1,n)), (o!=0).view(1,n))   # bias only
        return torch.multiply(sm, lr)
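
# A hedged summary of CAP.calibration() above, per class k with logit x_k:
#
#   temperature scaling (multiclass):  sm_k = softmax( log(softmax(x))_k / T )
#   Platt scaling (one-vs-rest):       lr_k = sigmoid( logit(sigmoid(x_k - o_k)) + b_k )   (weight fixed to one)
#                                      lr_k = 0 for classes with no calibration samples (o_k == 0)
#   calibrated score:                  p_k  = sm_k * lr_k
#
# where (T, b, o) are the temperature, per-class bias and per-class logit offset fit in
# validation_epoch_end() and stored in the registered buffers _calibration_multiclass
# and _calibration_binary.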

        
class ActivityTracker(PIP_370k):
    """Video Activity detection.
        
    Args (__call__):
        vi [generator of `vipy.video.Scene`]:  The input video to be updated in place with detections; this is the generator output by heyvi.detection.MultiscaleVideoTracker.__call__
        activityiou [float]: The minimum temporal iou for activity assignment
        mirror [bool]:  If true, encode using the mean of a video encoding and the mirrored video encoding.  This is slower as it requires 2x GPU forward passes
        minprob [float]: The minimum probability for new activity detection
        trackconf [float]: The minimum object detection confidence for new tracks
        maxdets [int]:  The maximum number of allowable detections per frame.  If there are more detections than maxdets in a frame, sort them by confidence and keep only the top maxdets
        avgdets [int]:  The number of allowable detections per frame if throttled
        buffered [bool]:  If True, buffer streams.  This is useful for activity detection on live streams.
        finalized [bool, int]:  If False, do not finalize(); if True, finalize() only at the end; if an int, finalize() every int frames.  This is useful for streaming activity detection on unbounded inputs.
        
    Returns:
        The input video is updated in place.
    
    """    
    def __init__(self, stride=3, activities=None, gpus=None, batchsize=None, mlbl=False, mlfl=True, modelfile=None):
        assert modelfile is not None, "Contact <info@visym.com> for access to non-public model files"

        super().__init__(pretrained=False, modelfile=modelfile, mlbl=mlbl, mlfl=mlfl)
        self._stride = stride
        self._allowable_activities = {k:v for (k,v) in [(a,a) if not isinstance(a, tuple) else a for a in activities]} if activities is not None else {k:k for k in self.classlist()}
        self._verb_to_noun = {k:set(['car','vehicle','motorcycle','bus','truck']) if (k.startswith('car') or k.startswith('motorcycle') or k.startswith('vehicle')) else set(['person']) for k in self.classlist()+list(self._allowable_activities.values())+list(self._allowable_activities.keys())}  # HACK: fixme
        self._batchsize_per_gpu = batchsize
        self._gpus = gpus

        if gpus is not None:
            assert torch.cuda.is_available()
            assert batchsize is not None
            self._devices = ['cuda:%d' % k for k in gpus]
            self._gpus = [copy.deepcopy(self.net).to(d, non_blocking=False) for d in self._devices]  
            for m in self._gpus:
                m.eval()
        torch.set_grad_enabled(False)

        self._logit_pooling = None  # cache
        
    def temporal_stride(self, s=None):
        if s is not None:
            self._stride = s
            return self
        else:
            return self._stride

    def forward(self, x):
        """Overload forward for multi-gpu batch.  Don't use torch DataParallel!"""
        if self._gpus is None:
            return super().forward(x)  # cpu
        else:
            x_forward = None            
            for b in x.pin_memory().split(self._batchsize_per_gpu*len(self._gpus)):  # pinned copy
                n_todevice = np.sum(np.array([1 if k<len(b) else 0 for k in range(int(len(self._devices)*np.ceil(len(b)/len(self._devices))))]).reshape(-1, len(self._devices)), axis=0).tolist()
                todevice = [t.to(d, non_blocking=True) for (t,d) in zip(b.split(n_todevice), self._devices) if len(t)>0]   # async device copy
                ondevice = [m(t) for (m,t) in zip(self._gpus, todevice)]   # async
                fromdevice = torch.cat([t.cpu() for t in ondevice], dim=0)
                x_forward = fromdevice if x_forward is None else torch.cat((x_forward, fromdevice), dim=0)
                del ondevice, todevice, fromdevice, b  # force garbage collection of GPU memory
            del x  # force garbage collection
            return x_forward
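
    # A worked sketch (hedged) of the batch split in forward() above, with two GPUs and
    # batchsize=4 per GPU: an input of 13 clips is split into pinned sub-batches of at
    # most 8; the final sub-batch of 5 yields n_todevice = [3, 2], so cuda:0 receives
    # three clips and cuda:1 receives two, each forwarded asynchronously and concatenated
    # back on the CPU.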

    def lrt(self, x_logits, lrt_threshold=None):
        """top-k with likelihood ratio test with background null hypothesis"""
        j_bg_person = self._class_to_index['person'] if 'person' in self._class_to_index else self._class_to_index['person_walks']  # FIXME
        j_bg_vehicle = self._class_to_index['vehicle'] if 'vehicle' in self._class_to_index else self._class_to_index['car_moves']  # FIXME

        yh = x_logits.detach().cpu().numpy()
        yh_softmax = F.softmax(x_logits, dim=1).detach().cpu()
        p_null = np.maximum(yh[:, j_bg_person], yh[:, j_bg_vehicle]).reshape(yh.shape[0], 1)
        lr = yh - p_null   # ~= log likelihood ratio
        f_logistic = lambda x,b,s=1.0: float(1.0 / (1.0 + np.exp(-s*(x + b))))
        return [sorted([(self.index_to_class(j), float(s[j]), float(t[j]), f_logistic(s[j], 1.0)*f_logistic(t[j], 0.0), float(sm[j])) for j in range(len(s)) if (lrt_threshold is None or t[j] >= lrt_threshold)], key=lambda x: x[3], reverse=True) for (s,t,sm) in zip(yh, lr, yh_softmax)]
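
    # A hedged note on the return structure of lrt() above: for each clip, a list of
    # (class, logit, logit margin over background, fused logistic score, softmax) tuples
    # sorted by the fused score, keeping only classes whose margin meets lrt_threshold.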

    def logit_pooling(self, x_logits, classmap):
        if self._logit_pooling is None:
            assert isinstance(classmap, dict)
            index_to_class = {k:c for (k,c) in enumerate(set([k if v is None else v for (k,v) in classmap.items()]))}  # target class index
            class_to_index = {v:k for (k,v) in index_to_class.items()}  # target class to index
            class_to_group = [[self.class_to_index(c[0]) for c in v if c[0] in self.class_to_index()] for (k,v) in sorted(vipy.util.groupbyasdict([(k,k) if v is None else (k,v) for (k,v) in classmap.items()], lambda x: x[1]).items(), key=lambda x: class_to_index[x[0]])]  # target class to source index
            self._logit_pooling = {'index_to_class': index_to_class, 'class_to_group': class_to_group}
        (index_to_class, class_to_group) = (self._logit_pooling['index_to_class'], self._logit_pooling['class_to_group'])  # cached
        yh = torch.tensor([max(r[j]) if len(r[j])>0 else (min(r)) for (i,r) in enumerate(x_logits) for j in class_to_group]).reshape(len(x_logits), len(index_to_class)).detach()
        yh_softmax = F.softmax(yh, dim=1).detach().cpu()
        return [[(index_to_class[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh.cpu(), yh_softmax)]
        
    def softmax(self, x_logits):
        """Return a list of lists [(class_label, float(softmax), float(logit) ... ] for all classes and batches"""
        yh = x_logits.detach().cpu().numpy()
        yh_softmax = F.softmax(x_logits, dim=1).detach().cpu()
        d = self.index_to_class()
        if not self._calibrated:
            return [[(d[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh, yh_softmax)]
        else:
            yh_softmax = self.calibration(x_logits)
            return [[(d[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh, yh_softmax)]            

    def finalize(self, vo, trackconf=None, activityconf=None, startframe=None, endframe=None, mintracklen=None):
        """In place filtering of video to finalize"""
        assert isinstance(vo, vipy.video.Scene)

        tofinalize = set([ai for (ai,a) in vo.activities().items() if (endframe is None or a.endframe() <= endframe) and (startframe is None or a.endframe() >= startframe)])
        tofinalize = tofinalize.union([ti for (ti,t) in vo.tracks().items() if ((endframe is None or t.endframe() <= endframe) and (startframe is None or t.endframe() >= startframe)) or any([ti == vo.activities(id=ai).actorid() for ai in tofinalize])])

        # Bad tracks:  Remove low confidence or too short non-moving tracks, and associated activities
        # - will throw a 'vo referenced before assignment' exception if one loop did not succeed
        if trackconf is not None:
            vo.trackfilter(lambda t: t.id() not in tofinalize or len(t)>=(mintracklen if mintracklen is not None else vo.framerate()) and (t.confidence() >= trackconf or t.startbox().iou(t.endbox()) == 0)).activityfilter(lambda a: a.id() not in tofinalize or a.actorid() in vo.tracks())  
        
        # Activity probability:  noun_probability*verb probability
        nounconf = {k:t.confidence(samples=8) for (k,t) in vo.tracks().items() if t.id() in tofinalize}   # 
        vo.activitymap(lambda a: a.confidence(nounconf[a.actorid()]*a.confidence()) if a.id() in tofinalize else a)
        
        # Missing objects:  Significantly reduce confidence of complex classes (yuck)
        vo.activitymap(lambda a: a.confidence(0.01*a.confidence()) if (a.id() in tofinalize and a.category() in ['person_purchases']) else a) 
        
        # Vehicle turns:  High confidence vehicle turns must be a minimum angle
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      (a.category() in ['vehicle_turns_left', 'vehicle_turns_right']) and
                                                                      (abs(vo.track(a.actorid()).bearing_change(a.startframe(), a.endframe(), dt=vo.framerate(), samples=5)) < (np.pi/16))) else a) 
        
        # Vehicle turns:  U-turn can only be distinguished from left/right turn at the end of a track by looking at the turn angle
        vo.activitymap(lambda a: a.category('vehicle_makes_u_turn').shortlabel('u turn') if (a.id() in tofinalize and
                                                                                             (a.category() in ['vehicle_turns_left', 'vehicle_turns_right']) and
                                                                                             (abs(vo.track(a.actorid()).bearing_change(a.startframe(), a.endframe(), dt=vo.framerate(), samples=5)) > (np.pi-(np.pi/2)))) else a)
        
        # Background activities:  Use logistic confidence on logit due to lack of background class "person stands", otherwise every standing person is using a phone
        if self._calibrated_constant is not None:            
            f_logistic = lambda x,b,s=1.0: float(1.0 / (1.0 + np.exp(-s*(x + b))))
            vo.activitymap(lambda a: a.confidence(a.confidence()*f_logistic(a.attributes['logit'], self._calibrated_constant)) if a.id() in tofinalize else a)  
            
        # Complex activities: remove steal/abandon and replace with picks up / puts down
        vo.activityfilter(lambda a: a.category() not in ['person_steals_object', 'person_abandons_package'])
        newlist = [vo.add(vipy.activity.Activity(startframe=a.startframe(), endframe=a.endframe(), category='person_steals_object', shortlabel='steals', confidence=0.5*a.confidence(), framerate=vo.framerate(), actorid=a.actorid(), attributes={'pip':'person_picks_up_object'}))
                   for a in vo.activitylist() if a.category() == 'person_picks_up_object']
        newlist = [vo.add(vipy.activity.Activity(startframe=a.startframe(), endframe=a.endframe(), category='person_abandons_package', shortlabel='abandons', confidence=0.5*a.confidence(), framerate=vo.framerate(), actorid=a.actorid(), attributes={'pip':'person_puts_down_object'}))
                   for a in vo.activitylist() if a.category() == 'person_puts_down_object']
            
        # Vehicle/person interaction: 'vehicle_drops_off_person'/'vehicle_picks_up_person'  must be followed by car driving away/pulling up, must be accompanied by person track start/end
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      a.category() == 'vehicle_drops_off_person' and
                                                                      (not vo.track(a.actorid()).ismoving(a.middleframe(), a.endframe()+10*vo.framerate()) or
                                                                       not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), t.startframe(), t.startframe()+1) > 0 for t in vo.tracks().values()]))) else a)
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      a.category() == 'vehicle_picks_up_person' and
                                                                      (not vo.track(a.actorid()).ismoving(a.startframe()-10*vo.framerate(), a.middleframe()) or
                                                                       not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), t.endframe()-1, t.endframe()) > 0 for t in vo.tracks().values()]))) else a)
        
        # Person/Bicycle track: riding must be accompanied by an associated moving bicycle track
        vo.activityfilter(lambda a: a.id() not in tofinalize or a.category() != 'person_rides_bicycle')
        bikelist = [vo.add(vipy.activity.Activity(startframe=t.startframe(), endframe=t.endframe(), category='person_rides_bicycle', shortlabel='rides', confidence=t.confidence(samples=8), framerate=vo.framerate(), actorid=t.id(), attributes={'pip':'person_rides_bicycle'}))
                    for (tk,t) in vo.tracks().items() if (t.id() in tofinalize and t.category() == 'bicycle' and t.ismoving())]
        
        # Person/Vehicle track: person/vehicle interaction must be accompanied by an associated stopped vehicle track
        dstbox = {k:vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for (k,a) in vo.activities().items() if (a.id() in tofinalize and a.category().startswith('person') and ('vehicle' in a.category() or 'trunk' in a.category()))}  # precompute            
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      (a.category().startswith('person') and ('vehicle' in a.category() or 'trunk' in a.category())) and
                                                                      not any([t.category() == 'vehicle' and 
                                                                               t.during(a.startframe()) and
                                                                               not t.ismoving(a.startframe(), a.endframe()) and
                                                                               t[a.startframe()].hasintersection(dstbox[a._id])
                                                                               for t in vo.tracks().values()])) else a)
        
        # Vehicle/Person track: vehicle/person interaction must be accompanied by an associated person track
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and (a.category().startswith('vehicle') and ('person' in a.category())) and not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), a._startframe, a._endframe) > 0 for t in vo.tracks().values()])) else a)
        
        # Person track: enter/exit scene cannot be at the image boundary
        boundary = vo.framebox().dilate(0.9)
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and a.category() == 'person_enters_scene_through_structure' and vo.track(a.actorid())[max(a.startframe(), vo.track(a.actorid()).startframe())].cover(boundary) < 1) else a)
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and a.category() == 'person_exits_scene_through_structure' and vo.track(a.actorid())[min(a.endframe(), vo.track(a.actorid()).endframe())].cover(boundary) < 1) else a)
        
        # Activity union:  Temporal gaps less than support should be merged into one activity detection for a single track
        # Activity union:  "Brief" breaks (<5 seconds) of confident activities should be merged into one activity detection for a single track
        briefmerge = set(['person_reads_document', 'person_interacts_with_laptop', 'person_talks_to_person', 'person_purchases', 'person_steals_object', 'person_talks_on_phone', 'person_texts_on_phone', 'person_rides_bicycle', 'person_carries_heavy_object', 'person', 'person_walks', 'vehicle', 'car_moves'])  
        merged = set([])
        mergeable_dets = [a for a in vo.activities().values() if a.id() in tofinalize and a.confidence() > 0.2]  # only mergeable detections
        mergeable_dets.sort(key=lambda a: a.startframe())  # in-place
        for a in mergeable_dets:
            for o in mergeable_dets:
                if ((o._startframe >= a._startframe) and (a._id != o._id) and (o._actorid == a._actorid) and (o._label == a._label) and (o._id not in merged) and (a._id not in merged) and
                    ((a.temporal_distance(o) <= self.temporal_support() or (a.category() in briefmerge and a.temporal_distance(o) < 5*vo.framerate())))):
                    a.union(o)  # in-place update
                    merged.add(o.id())
        vo.activityfilter(lambda a: a.id() not in tofinalize or a.id() not in merged)

        # Group activity: Must be accompanied by a friend with the same activity detection
        categories = ['person_embraces_person', 'hand_interacts_with_person', 'person_talks_to_person', 'person_transfers_object']           
        dstbox = {k:vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for (k,a) in vo.activities().items() if a.id() in tofinalize and a.category() in categories}  # precompute
        srcbox = {k:bb.clone().maxsquare().dilate(1.2) for (k,bb) in dstbox.items()}                            
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      a._label in categories and
                                                                      not any([(af._label == a._label and
                                                                                af._id != a._id and
                                                                                af._actorid != a._actorid and 
                                                                                srcbox[a._id].hasintersection(dstbox[af._id]))
                                                                               for af in vo.activities().values() if af.during_interval(a._startframe, a._endframe, inclusive=True)])) else a)
        
        # Activity group suppression:  Group activities may have at most one activity detection of this type per group in a spatial region surrounding the actor
        tosuppress = set(['hand_interacts_with_person', 'person_embraces_person', 'person_transfers_object', 'person_steals_object', 'person_purchases', 'person_talks_to_person'])
        suppressed = set([])
        activitybox = {a.id():vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for a in vo.activities().values() if a.id() in tofinalize and a.category() in tosuppress}
        activitybox = {k:bb.dilate(1.2).maxsquare() if bb is not None else bb for (k,bb) in activitybox.items()}
        candidates = [a for a in vo.activities().values() if a.id() in tofinalize]
        for a in sorted(candidates, key=lambda a: a.confidence(), reverse=True):  # decreasing confidence
            if a.category() in tosuppress:
                for o in candidates:  # other activities
                    if (o._actorid != a._actorid and  # different tracks
                        o._label == a._label and  # same category
                        o.confidence() <= a.confidence() and   # lower confidence
                        o._id not in suppressed and  # not already suppressed
                        o.during_interval(a.startframe(), a.endframe()) and # overlaps temporally by at least one frame
                        (activitybox[a._id] is not None and activitybox[o._id] is not None) and   # has valid tracks
                        activitybox[a._id].hasintersection(activitybox[o._id]) and  # has coarse overlap 
                        vo.track(a.actorid()).clone().maxsquare().dilate(1.2).segment_maxiou(vo.track(o.actorid()), a.startframe(), a.endframe()) > 0):  # has fine overlap "close by"
                        suppressed.add(o.id())  # greedy non-maximum suppression of lower confidence activity detection
        vo.activityfilter(lambda a: a.id() not in tofinalize or a.id() not in suppressed)

        # Activity duration
        vo.activitymap(lambda a: a.padto(5) if a.id() in tofinalize and a.category() in ['person_talks_to_person', 'person_interacts_with_laptop', 'person_reads_document', 'person_purchases'] else a)   
        vo.activitymap(lambda a: a.duration(2, centered=False) if a.id() in tofinalize and a.category() in ['person_opens_vehicle_door', 'person_closes_vehicle_door'] else a)
        vo.activitymap(lambda a: a.duration(2, centered=True) if a.id() in tofinalize and a.category() in ['person_enters_scene_through_structure', 'person_exits_scene_through_structure'] else a)
        vo.activitymap(lambda a: a.startframe(0) if a.id() in tofinalize and a.startframe() < 0 else a)

        # Activity confidence
        if activityconf is not None:
            vo.activityfilter(lambda a: a.id() not in tofinalize or a.confidence() >= activityconf)
    
        return vo

        
    def __call__(self, vi, activityiou=0.1, mirror=False, minprob=0.04, trackconf=0.2, maxdets=105, avgdets=70, throttle=True, buffered=True, finalized=True, mintracklen=None):
        (n,m,dt) = (self.temporal_support(), self.temporal_stride(), 1)  
        aa = self._allowable_activities  # dictionary mapping of allowable classified activities to output names        
        f_encode = self.totensor(training=False, validation=False, show=False, doflip=False)  # video -> tensor CxNxHxW
        f_mirror = lambda t: (t, torch.from_numpy(np.copy(np.flip(np.asarray(t), axis=3))))  # CxNxHxW -> CxNxHx(-W), np.flip is much faster than torch.flip, faster than encode mirror=True, np.flip returns a view which must be copied
        f_totensor = lambda v: (f_encode(v.clone(sharedarray=True) if mirror else v),) if (not mirror or v.actor().category() != 'person') else f_mirror(f_encode(v.clone(sharedarray=True)))  # do not mirror vehicle activities
        f_totensorlist = lambda V: [t for v in V for t in f_totensor(v)]        
        def f_reduce(T,V):
            j = sum([v.actor().category() == 'person' for v in V])  # person mirrored, vehicle not mirrored
            (tm, t) = torch.split(T, (2*j, len(T)-2*j), dim=0)  # assumes sorted order, person first, only person/vehicle
            return torch.cat((torch.mean(tm.view(-1, 2, tm.shape[1]), dim=1), t), dim=0) if j>0 else T  # mean over mirror augmentation

        try:
            with torch.no_grad():                                
                vp = next(vi)  # peek in generator to create clip
                vi = itertools.chain([vp], vi)  # unpeek
                sw = vipy.util.Stopwatch() if throttle else None  # real-time framerate estimate
                framerate = vp.framerate()
                for (k, (vo,vc)) in enumerate(zip(vi, vp.stream(buffered=buffered).clip(n, m, continuous=True, activities=False, delay=dt))):
                    videotracks = [] if vc is None else [vt for vt in vc.trackfilter(lambda t: len(t)>=4 and (t.category() == 'person' or (t.category() == 'vehicle' and vo.track(t.id()).ismoving(k-10*n+dt, k+dt)))).tracksplit()]  # vehicle moved recently?
                    if throttle:
                        videotracks.sort(key=lambda v: v.actor().confidence(last=1))  # in-place                                            
                        numdets = (maxdets if ((avgdets is None) or (sw.duration()<=60) or ((sw.duration()>60) and ((k/sw.duration())/vp.framerate())>0.8)) else
                                   (avgdets if ((k/sw.duration())/vp.framerate())>0.67 else int(avgdets//2)))   # real-time throttle schedule
                        videotracks = videotracks[-numdets:] if (numdets is not None and len(videotracks)>numdets) else videotracks   # select only the most confident for detection                
                    videotracks.sort(key=lambda v: v.actor().category())  # in-place, for grouping mirrored encoding: person<vehicle

                    if len(videotracks)>0 and (k+dt > n): 
                        logits = self.forward(torch.stack(f_totensorlist(videotracks))) # augmented logits in track index order, copy
                        logits = f_reduce(logits, videotracks) if mirror else logits  # reduced logits in track index order
                        (actorid, actorcategory) = ([t.actorid() for t in videotracks], [t.actor().category() for t in videotracks])
                        dets = [vipy.activity.Activity(category=aa[category], shortlabel=self._class_to_shortlabel[category], startframe=k-n+dt, endframe=k+dt, confidence=sm, framerate=framerate, actorid=actorid[j], attributes={'pip':category, 'logit':float(logit)})
                                for (j, category_sm_logit) in enumerate(self.softmax(logits))  # (classname, softmax, logit), unsorted
                                for (category, sm, logit) in category_sm_logit
                                if ((category in aa) and   # requested activities only
                                    (actorcategory[j] in self._verb_to_noun[category]) and   # noun matching with category renaming dictionary
                                    sm>=minprob)]   # minimum probability for new activity detection
                        vo.assign(k+dt, dets, activityiou=activityiou, activitymerge=False, activitynms=True)   # assign new activity detections by non-maximum suppression (merge happens at the end)
                        del logits, dets, videotracks  # torch garbage collection

                    if not isinstance(finalized, bool) and k > 0 and k%finalized == 0:
                        self.finalize(vo, trackconf=trackconf, startframe=k-finalized-5, endframe=k-5, mintracklen=mintracklen)  
                        
                    yield vo

        except Exception as e:                
            raise

        finally:
            if not (finalized is False):
                self.finalize(vo, trackconf=trackconf, mintracklen=mintracklen) if finalized == True else self.finalize(vo, trackconf=trackconf, startframe=(k//finalized)*finalized-4, endframe=k, mintracklen=mintracklen)


class ActivityTrackerCap(ActivityTracker, CAP):
    def __init__(self, stride=3, activities=None, gpus=None, batchsize=None, calibrated=False, modelfile=None, calibrated_constant=None, unitnorm=False):
        ActivityTracker. __init__(self, stride=stride, activities=activities, gpus=gpus, batchsize=batchsize, mlbl=False, mlfl=True, modelfile=modelfile)
        CAP.__init__(self, modelfile=modelfile, deterministic=False, pretrained=None, mlbl=None, mlfl=True, calibrated_constant=calibrated_constant, calibrated=calibrated, unitnorm=unitnorm)
        # FIXME: there is an issue with multiple inheritance and multi-gpu with default parameters here (unitnorm, mlfl), requires hardcoding currently 

Classes

class ActivityRecognition (pretrained=True)
Expand source code Browse git
class ActivityRecognition(object):
    def __init__(self, pretrained=True):
        self.net =  None
        self._class_to_index = {}
        self._index_to_class = None
        self._num_frames = 0

    def class_to_index(self, c=None):
        return self._class_to_index if c is None else self._class_to_index[c]
    
    def index_to_class(self, index=None):
        if self._index_to_class is None:
            self._index_to_class = {v:k for (k,v) in self.class_to_index().items()}  # cache
        return self._index_to_class if index is None else self._index_to_class[index]
    
    def classlist(self):
        return [k for (k,v) in sorted(list(self.class_to_index().items()), key=lambda x: x[0])]  # sorted in index order

    def num_classes(self):
        return len(set(self.index_to_class().keys()))

    def fromindex(self, k):
        index_to_class = self.index_to_class()
        assert k in index_to_class, "Invalid class index '%s'" % (str(k))
        return index_to_class[k]

    def label_confidence(self, video=None, tensor=None, threshold=None):
        raise
        logits = self.__call__(video, tensor)
        conf = [[(self.index_to_class(j), s[j]) for j in i[::-1] if threshold is None or s[j]>threshold] for (s,i) in zip(logits, np.argsort(logits, axis=1))]
        return conf if len(logits) > 1 else conf[0]

    def activity(self, video, threshold=None):
        (c,s) = zip(*self.label_confidence(video=video, threshold=None))
        return vipy.activity.Activity(startframe=0, endframe=self._num_frames, category=c[0], actorid=video.actorid(), confidence=s[0]) if (threshold is None or s[0]>threshold) else None
            
    def top1(self, video=None, tensor=None, threshold=None):
        raise
        return self.topk(k=1, video=video, tensor=tensor, threshold=threshold)

    def topk(self, k, video=None, tensor=None, threshold=None):
        raise
        logits = self.__call__(video, tensor)
        topk = [[self.index_to_class(j) for j in i[-k:][::-1] if threshold is None or s[j] >= threshold] for (s,i) in zip(logits, np.argsort(logits, axis=1))]
        return topk if len(topk) > 1 else topk[0]

    def temporal_support(self):
        return self._num_frames

    def totensor(self, training=False):
        raise

    def binary_vector(self, categories):
        y = np.zeros(len(self.classlist())).astype(np.float32)
        for c in tolist(categories):
            y[self.class_to_index(c)] = 1
        return torch.from_numpy(y).type(torch.FloatTensor)

Subclasses

Methods

def activity(self, video, threshold=None)
Expand source code Browse git
def activity(self, video, threshold=None):
    (c,s) = zip(*self.label_confidence(video=video, threshold=None))
    return vipy.activity.Activity(startframe=0, endframe=self._num_frames, category=c[0], actorid=video.actorid(), confidence=s[0]) if (threshold is None or s[0]>threshold) else None
def binary_vector(self, categories)
Expand source code Browse git
def binary_vector(self, categories):
    y = np.zeros(len(self.classlist())).astype(np.float32)
    for c in tolist(categories):
        y[self.class_to_index(c)] = 1
    return torch.from_numpy(y).type(torch.FloatTensor)
def class_to_index(self, c=None)
Expand source code Browse git
def class_to_index(self, c=None):
    return self._class_to_index if c is None else self._class_to_index[c]
def classlist(self)
Expand source code Browse git
def classlist(self):
    return [k for (k,v) in sorted(list(self.class_to_index().items()), key=lambda x: x[0])]  # sorted in index order
def fromindex(self, k)
Expand source code Browse git
def fromindex(self, k):
    index_to_class = self.index_to_class()
    assert k in index_to_class, "Invalid class index '%s'" % (str(k))
    return index_to_class[k]
def index_to_class(self, index=None)
Expand source code Browse git
def index_to_class(self, index=None):
    if self._index_to_class is None:
        self._index_to_class = {v:k for (k,v) in self.class_to_index().items()}  # cache
    return self._index_to_class if index is None else self._index_to_class[index]
def label_confidence(self, video=None, tensor=None, threshold=None)
Expand source code Browse git
def label_confidence(self, video=None, tensor=None, threshold=None):
    raise
    logits = self.__call__(video, tensor)
    conf = [[(self.index_to_class(j), s[j]) for j in i[::-1] if threshold is None or s[j]>threshold] for (s,i) in zip(logits, np.argsort(logits, axis=1))]
    return conf if len(logits) > 1 else conf[0]
def num_classes(self)
Expand source code Browse git
def num_classes(self):
    return len(set(self.index_to_class().keys()))
def temporal_support(self)
Expand source code Browse git
def temporal_support(self):
    return self._num_frames
def top1(self, video=None, tensor=None, threshold=None)
Expand source code Browse git
def top1(self, video=None, tensor=None, threshold=None):
    raise
    return self.topk(k=1, video=video, tensor=tensor, threshold=threshold)
def topk(self, k, video=None, tensor=None, threshold=None)
Expand source code Browse git
def topk(self, k, video=None, tensor=None, threshold=None):
    raise
    logits = self.__call__(video, tensor)
    topk = [[self.index_to_class(j) for j in i[-k:][::-1] if threshold is None or s[j] >= threshold] for (s,i) in zip(logits, np.argsort(logits, axis=1))]
    return topk if len(topk) > 1 else topk[0]
def totensor(self, training=False)
Expand source code Browse git
def totensor(self, training=False):
    raise
class ActivityTracker (stride=3, activities=None, gpus=None, batchsize=None, mlbl=False, mlfl=True, modelfile=None)

Video Activity detection.

Args (__call__):

  • vi [generator of vipy.video.Scene]: The input video to be updated in place with detections. This is a generator which is output from heyvi.detection.MultiscaleVideoTracker.__call__
  • activityiou [float]: The minimum temporal iou for activity assignment
  • mirror [bool]: If true, encode using the mean of a video encoding and the mirrored video encoding. This is slower as it requires 2x GPU forward passes
  • minprob [float]: The minimum probability for new activity detection
  • trackconf [float]: The minimum object detection confidence for new tracks
  • maxdets [int]: The maximum number of allowable detections per frame. If there are more detections per frame than maxdets, sort them by confidence and use only the top maxdets
  • avgdets [int]: The number of allowable detections per frame if throttled
  • buffered [bool]: If true, then buffer streams. This is useful for activity detection on live streams.
  • finalized [bool, int]: If False, then do not finalize(); if True, finalize() only at the end; if int, then finalize every int frames. This is useful for streaming activity detection on unbounded inputs.

Returns

The input video is updated in place.
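
A minimal usage sketch follows, assuming access to a non-public model file. The construction and call signature of heyvi.detection.MultiscaleVideoTracker shown here are illustrative assumptions rather than a documented interface; only the ActivityTracker arguments are taken from the signature above.

import vipy
import heyvi.detection
import heyvi.recognition

# Hypothetical setup: the video path, GPU index, batch size and model path are placeholders
v = vipy.video.Scene(filename='/path/to/video.mp4')
tracker = heyvi.detection.MultiscaleVideoTracker(gpus=[0])   # assumed constructor arguments
detector = heyvi.recognition.ActivityTracker(stride=3, gpus=[0], batchsize=64, modelfile='/path/to/model.ckpt')

vi = tracker(v)   # assumed call signature; yields tracked vipy.video.Scene objects frame by frame
for vo in detector(vi, activityiou=0.1, mirror=False, minprob=0.04, trackconf=0.2, finalized=True):
    pass   # the input video is updated in place with activity detections; vo is the running result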

Expand source code Browse git
class ActivityTracker(PIP_370k):
    """Video Activity detection.
        
    Args (__call__):
        vi [generator of `vipy.video.Scene`]:  The input video to be updated in place with detections.  This is a generator which is output from heyvi.detection.MultiscaleVideoTracker.__call__
        activityiou [float]: The minimum temporal iou for activity assignment
        mirror [bool]:  If true, encode using the mean of a video encoding and the mirrored video encoding.  This is slower as it requires 2x GPU forward passes
        minprob [float]: The minimum probability for new activity detection
        trackconf [float]: The minimum object detection confidence for new tracks
        maxdets [int]:  The maximum number of allowable detections per frame.  If there are more detections per frame than maxdets, sort them by confidence and use only the top maxdets
        avgdets [int]:  The number of allowable detections per frame if throttled
        buffered [bool]:  If true, then buffer streams.  This is useful for activity detection on live streams.            
        finalized [bool, int]:  If False then do not finalize(), If True finalize() only at the end, If int, then finalize every int frames.  This is useful for streaming activity detection on unbounded inputs. 
        
    Returns:
        The input video is updated in place.
    
    """    
    def __init__(self, stride=3, activities=None, gpus=None, batchsize=None, mlbl=False, mlfl=True, modelfile=None):
        assert modelfile is not None, "Contact <info@visym.com> for access to non-public model files"

        super().__init__(pretrained=False, modelfile=modelfile, mlbl=mlbl, mlfl=mlfl)
        self._stride = stride
        self._allowable_activities = {k:v for (k,v) in [(a,a) if not isinstance(a, tuple) else a for a in activities]} if activities is not None else {k:k for k in self.classlist()}
        self._verb_to_noun = {k:set(['car','vehicle','motorcycle','bus','truck']) if (k.startswith('car') or k.startswith('motorcycle') or k.startswith('vehicle')) else set(['person']) for k in self.classlist()+list(self._allowable_activities.values())+list(self._allowable_activities.keys())}  # HACK: fixme
        self._batchsize_per_gpu = batchsize
        self._gpus = gpus

        if gpus is not None:
            assert torch.cuda.is_available()
            assert batchsize is not None
            self._devices = ['cuda:%d' % k for k in gpus]
            self._gpus = [copy.deepcopy(self.net).to(d, non_blocking=False) for d in self._devices]  
            for m in self._gpus:
                m.eval()
        torch.set_grad_enabled(False)

        self._logit_pooling = None  # cache
        
    def temporal_stride(self, s=None):
        if s is not None:
            self._stride = s
            return self
        else:
            return self._stride

    def forward(self, x):
        """Overload forward for multi-gpu batch.  Don't use torch DataParallel!"""
        if self._gpus is None:
            return super().forward(x)  # cpu
        else:
            x_forward = None            
            for b in x.pin_memory().split(self._batchsize_per_gpu*len(self._gpus)):  # pinned copy
                n_todevice = np.sum(np.array([1 if k<len(b) else 0 for k in range(int(len(self._devices)*np.ceil(len(b)/len(self._devices))))]).reshape(-1, len(self._devices)), axis=0).tolist()
                todevice = [t.to(d, non_blocking=True) for (t,d) in zip(b.split(n_todevice), self._devices) if len(t)>0]   # async device copy
                ondevice = [m(t) for (m,t) in zip(self._gpus, todevice)]   # async
                fromdevice = torch.cat([t.cpu() for t in ondevice], dim=0)
                x_forward = fromdevice if x_forward is None else torch.cat((x_forward, fromdevice), dim=0)
                del ondevice, todevice, fromdevice, b  # force garbage collection of GPU memory
            del x  # force garbage collection
            return x_forward

    def lrt(self, x_logits, lrt_threshold=None):
        """top-k with likelihood ratio test with background null hypothesis"""
        j_bg_person = self._class_to_index['person'] if 'person' in self._class_to_index else self._class_to_index['person_walks']  # FIXME
        j_bg_vehicle = self._class_to_index['vehicle'] if 'vehicle' in self._class_to_index else self._class_to_index['car_moves']  # FIXME

        yh = x_logits.detach().cpu().numpy()
        yh_softmax = F.softmax(x_logits, dim=1).detach().cpu()
        p_null = np.maximum(yh[:, j_bg_person], yh[:, j_bg_vehicle]).reshape(yh.shape[0], 1)
        lr = yh - p_null   # ~= log likelihood ratio
        f_logistic = lambda x,b,s=1.0: float(1.0 / (1.0 + np.exp(-s*(x + b))))
        return [sorted([(self.index_to_class(j), float(s[j]), float(t[j]), f_logistic(s[j], 1.0)*f_logistic(t[j], 0.0), float(sm[j])) for j in range(len(s)) if (lrt_threshold is None or t[j] >= lrt_threshold)], key=lambda x: x[3], reverse=True) for (s,t,sm) in zip(yh, lr, yh_softmax)]

    def logit_pooling(self, x_logits, classmap):
        if self._logit_pooling is None:
            assert isinstance(classmap, dict)
            index_to_class = {k:c for (k,c) in enumerate(set([k if v is None else v for (k,v) in classmap.items()]))}  # target class index
            class_to_index = {v:k for (k,v) in index_to_class.items()}  # target class to index
            class_to_group = [[self.class_to_index(c[0]) for c in v if c[0] in self.class_to_index()] for (k,v) in sorted(vipy.util.groupbyasdict([(k,k) if v is None else (k,v) for (k,v) in classmap.items()], lambda x: x[1]).items(), key=lambda x: class_to_index[x[0]])]  # target class to source index
            self._logit_pooling = {'index_to_class': index_to_class, 'class_to_group': class_to_group}
        (index_to_class, class_to_group) = (self._logit_pooling['index_to_class'], self._logit_pooling['class_to_group'])  # cached
        yh = torch.tensor([max(r[j]) if len(r[j])>0 else (min(r)) for (i,r) in enumerate(x_logits) for j in class_to_group]).reshape(len(x_logits), len(index_to_class)).detach()
        yh_softmax = F.softmax(yh, dim=1).detach().cpu()
        return [[(index_to_class[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh.cpu(), yh_softmax)]
        
    def softmax(self, x_logits):
        """Return a list of lists [(class_label, float(softmax), float(logit) ... ] for all classes and batches"""
        yh = x_logits.detach().cpu().numpy()
        yh_softmax = F.softmax(x_logits, dim=1).detach().cpu()
        d = self.index_to_class()
        if not self._calibrated:
            return [[(d[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh, yh_softmax)]
        else:
            yh_softmax = self.calibration(x_logits)
            return [[(d[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh, yh_softmax)]            

    def finalize(self, vo, trackconf=None, activityconf=None, startframe=None, endframe=None, mintracklen=None):
        """In place filtering of video to finalize"""
        assert isinstance(vo, vipy.video.Scene)

        tofinalize = set([ai for (ai,a) in vo.activities().items() if (endframe is None or a.endframe() <= endframe) and (startframe is None or a.endframe() >= startframe)])
        tofinalize = tofinalize.union([ti for (ti,t) in vo.tracks().items() if ((endframe is None or t.endframe() <= endframe) and (startframe is None or t.endframe() >= startframe)) or any([ti == vo.activities(id=ai).actorid() for ai in tofinalize])])

        # Bad tracks:  Remove low confidence or too short non-moving tracks, and associated activities
        # - will throw exception that 'vo referenced before assignment' if one loop did not succeed
        if trackconf is not None:
            vo.trackfilter(lambda t: t.id() not in tofinalize or len(t)>=(mintracklen if mintracklen is not None else vo.framerate()) and (t.confidence() >= trackconf or t.startbox().iou(t.endbox()) == 0)).activityfilter(lambda a: a.id() not in tofinalize or a.actorid() in vo.tracks())  
        
        # Activity probability:  noun_probability*verb probability
        nounconf = {k:t.confidence(samples=8) for (k,t) in vo.tracks().items() if t.id() in tofinalize}   # 
        vo.activitymap(lambda a: a.confidence(nounconf[a.actorid()]*a.confidence()) if a.id() in tofinalize else a)
        
        # Missing objects:  Significantly reduce confidence of complex classes (yuck)
        vo.activitymap(lambda a: a.confidence(0.01*a.confidence()) if (a.id() in tofinalize and a.category() in ['person_purchases']) else a) 
        
        # Vehicle turns:  High confidence vehicle turns must be a minimum angle
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      (a.category() in ['vehicle_turns_left', 'vehicle_turns_right']) and
                                                                      (abs(vo.track(a.actorid()).bearing_change(a.startframe(), a.endframe(), dt=vo.framerate(), samples=5)) < (np.pi/16))) else a) 
        
        # Vehicle turns:  U-turn can only be distinguished from left/right turn at the end of a track by looking at the turn angle
        vo.activitymap(lambda a: a.category('vehicle_makes_u_turn').shortlabel('u turn') if (a.id() in tofinalize and
                                                                                             (a.category() in ['vehicle_turns_left', 'vehicle_turns_right']) and
                                                                                             (abs(vo.track(a.actorid()).bearing_change(a.startframe(), a.endframe(), dt=vo.framerate(), samples=5)) > (np.pi-(np.pi/2)))) else a)
        
        # Background activities:  Use logistic confidence on logit due to lack of background class "person stands", otherwise every standing person is using a phone
        if self._calibrated_constant is not None:            
            f_logistic = lambda x,b,s=1.0: float(1.0 / (1.0 + np.exp(-s*(x + b))))
            vo.activitymap(lambda a: a.confidence(a.confidence()*f_logistic(a.attributes['logit'], self._calibrated_constant)) if a.id() in tofinalize else a)  
            
        # Complex activities: remove steal/abandon and replace with picks up / puts down
        vo.activityfilter(lambda a: a.category() not in ['person_steals_object', 'person_abandons_package'])
        newlist = [vo.add(vipy.activity.Activity(startframe=a.startframe(), endframe=a.endframe(), category='person_steals_object', shortlabel='steals', confidence=0.5*a.confidence(), framerate=vo.framerate(), actorid=a.actorid(), attributes={'pip':'person_picks_up_object'}))
                   for a in vo.activitylist() if a.category() == 'person_picks_up_object']
        newlist = [vo.add(vipy.activity.Activity(startframe=a.startframe(), endframe=a.endframe(), category='person_abandons_package', shortlabel='abandons', confidence=0.5*a.confidence(), framerate=vo.framerate(), actorid=a.actorid(), attributes={'pip':'person_puts_down_object'}))
                   for a in vo.activitylist() if a.category() == 'person_puts_down_object']
            
        # Vehicle/person interaction: 'vehicle_drops_off_person'/'vehicle_picks_up_person'  must be followed by car driving away/pulling up, must be accompanied by person track start/end
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      a.category() == 'vehicle_drops_off_person' and
                                                                      (not vo.track(a.actorid()).ismoving(a.middleframe(), a.endframe()+10*vo.framerate()) or
                                                                       not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), t.startframe(), t.startframe()+1) > 0 for t in vo.tracks().values()]))) else a)
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      a.category() == 'vehicle_picks_up_person' and
                                                                      (not vo.track(a.actorid()).ismoving(a.startframe()-10*vo.framerate(), a.middleframe()) or
                                                                       not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), t.endframe()-1, t.endframe()) > 0 for t in vo.tracks().values()]))) else a)
        
        # Person/Bicycle track: riding must be accompanied by an associated moving bicycle track
        vo.activityfilter(lambda a: a.id() not in tofinalize or a.category() != 'person_rides_bicycle')
        bikelist = [vo.add(vipy.activity.Activity(startframe=t.startframe(), endframe=t.endframe(), category='person_rides_bicycle', shortlabel='rides', confidence=t.confidence(samples=8), framerate=vo.framerate(), actorid=t.id(), attributes={'pip':'person_rides_bicycle'}))
                    for (tk,t) in vo.tracks().items() if (t.id() in tofinalize and t.category() == 'bicycle' and t.ismoving())]
        
        # Person/Vehicle track: person/vehicle interaction must be accompanied by an associated stopped vehicle track
        dstbox = {k:vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for (k,a) in vo.activities().items() if (a.id() in tofinalize and a.category().startswith('person') and ('vehicle' in a.category() or 'trunk' in a.category()))}  # precompute            
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      (a.category().startswith('person') and ('vehicle' in a.category() or 'trunk' in a.category())) and
                                                                      not any([t.category() == 'vehicle' and 
                                                                               t.during(a.startframe()) and
                                                                               not t.ismoving(a.startframe(), a.endframe()) and
                                                                               t[a.startframe()].hasintersection(dstbox[a._id])
                                                                               for t in vo.tracks().values()])) else a)
        
        # Vehicle/Person track: vehicle/person interaction must be accompanied by an associated person track
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and (a.category().startswith('vehicle') and ('person' in a.category())) and not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), a._startframe, a._endframe) > 0 for t in vo.tracks().values()])) else a)
        
        # Person track: enter/exit scene cannot be at the image boundary
        boundary = vo.framebox().dilate(0.9)
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and a.category() == 'person_enters_scene_through_structure' and vo.track(a.actorid())[max(a.startframe(), vo.track(a.actorid()).startframe())].cover(boundary) < 1) else a)
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and a.category() == 'person_exits_scene_through_structure' and vo.track(a.actorid())[min(a.endframe(), vo.track(a.actorid()).endframe())].cover(boundary) < 1) else a)
        
        # Activity union:  Temporal gaps less than support should be merged into one activity detection for a single track
        # Activity union:  "Brief" breaks (<5 seconds) of confident activities should be merged into one activity detection for a single track
        briefmerge = set(['person_reads_document', 'person_interacts_with_laptop', 'person_talks_to_person', 'person_purchases', 'person_steals_object', 'person_talks_on_phone', 'person_texts_on_phone', 'person_rides_bicycle', 'person_carries_heavy_object', 'person', 'person_walks', 'vehicle', 'car_moves'])  
        merged = set([])
        mergeable_dets = [a for a in vo.activities().values() if a.id() in tofinalize and a.confidence() > 0.2]  # only mergeable detections
        mergeable_dets.sort(key=lambda a: a.startframe())  # in-place
        for a in mergeable_dets:
            for o in mergeable_dets:
                if ((o._startframe >= a._startframe) and (a._id != o._id) and (o._actorid == a._actorid) and (o._label == a._label) and (o._id not in merged) and (a._id not in merged) and
                    ((a.temporal_distance(o) <= self.temporal_support() or (a.category() in briefmerge and a.temporal_distance(o) < 5*vo.framerate())))):
                    a.union(o)  # in-place update
                    merged.add(o.id())
        vo.activityfilter(lambda a: a.id() not in tofinalize or a.id() not in merged)

        # Group activity: Must be accompanied by a friend with the same activity detection
        categories = ['person_embraces_person', 'hand_interacts_with_person', 'person_talks_to_person', 'person_transfers_object']           
        dstbox = {k:vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for (k,a) in vo.activities().items() if a.id() in tofinalize and a.category() in categories}  # precompute
        srcbox = {k:bb.clone().maxsquare().dilate(1.2) for (k,bb) in dstbox.items()}                            
        vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                      a._label in categories and
                                                                      not any([(af._label == a._label and
                                                                                af._id != a._id and
                                                                                af._actorid != a._actorid and 
                                                                                srcbox[a._id].hasintersection(dstbox[af._id]))
                                                                               for af in vo.activities().values() if af.during_interval(a._startframe, a._endframe, inclusive=True)])) else a)
        
        # Activity group suppression:  Group activities may have at most one activity detection of this type per group in a spatial region surrounding the actor
        tosuppress = set(['hand_interacts_with_person', 'person_embraces_person', 'person_transfers_object', 'person_steals_object', 'person_purchases', 'person_talks_to_person'])
        suppressed = set([])
        activitybox = {a.id():vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for a in vo.activities().values() if a.id() in tofinalize and a.category() in tosuppress}
        activitybox = {k:bb.dilate(1.2).maxsquare() if bb is not None else bb for (k,bb) in activitybox.items()}
        candidates = [a for a in vo.activities().values() if a.id() in tofinalize]
        for a in sorted(candidates, key=lambda a: a.confidence(), reverse=True):  # decreasing confidence
            if a.category() in tosuppress:
                for o in candidates:  # other activities
                    if (o._actorid != a._actorid and  # different tracks
                        o._label == a._label and  # same category
                        o.confidence() <= a.confidence() and   # lower confidence
                        o._id not in suppressed and  # not already suppressed
                        o.during_interval(a.startframe(), a.endframe()) and # overlaps temporally by at least one frame
                        (activitybox[a._id] is not None and activitybox[o._id] is not None) and   # has valid tracks
                        activitybox[a._id].hasintersection(activitybox[o._id]) and  # has coarse overlap 
                        vo.track(a.actorid()).clone().maxsquare().dilate(1.2).segment_maxiou(vo.track(o.actorid()), a.startframe(), a.endframe()) > 0):  # has fine overlap "close by"
                        suppressed.add(o.id())  # greedy non-maximum suppression of lower confidence activity detection
        vo.activityfilter(lambda a: a.id() not in tofinalize or a.id() not in suppressed)

        # Activity duration
        vo.activitymap(lambda a: a.padto(5) if a.id() in tofinalize and a.category() in ['person_talks_to_person', 'person_interacts_with_laptop', 'person_reads_document', 'person_purchases'] else a)   
        vo.activitymap(lambda a: a.duration(2, centered=False) if a.id() in tofinalize and a.category() in ['person_opens_vehicle_door', 'person_closes_vehicle_door'] else a)
        vo.activitymap(lambda a: a.duration(2, centered=True) if a.id() in tofinalize and a.category() in ['person_enters_scene_through_structure', 'person_exits_scene_through_structure'] else a)
        vo.activitymap(lambda a: a.startframe(0) if a.id() in tofinalize and a.startframe() < 0 else a)

        # Activity confidence
        if activityconf is not None:
            vo.activityfilter(lambda a: a.id() not in tofinalize or a.confidence() >= activityconf)
    
        return vo

        
    def __call__(self, vi, activityiou=0.1, mirror=False, minprob=0.04, trackconf=0.2, maxdets=105, avgdets=70, throttle=True, buffered=True, finalized=True, mintracklen=None):
        (n,m,dt) = (self.temporal_support(), self.temporal_stride(), 1)  
        aa = self._allowable_activities  # dictionary mapping of allowable classified activities to output names        
        f_encode = self.totensor(training=False, validation=False, show=False, doflip=False)  # video -> tensor CxNxHxW
        f_mirror = lambda t: (t, torch.from_numpy(np.copy(np.flip(np.asarray(t), axis=3))))  # CxNxHxW -> CxNxHx(-W), np.flip is much faster than torch.flip, faster than encode mirror=True, np.flip returns a view which must be copied
        f_totensor = lambda v: (f_encode(v.clone(sharedarray=True) if mirror else v),) if (not mirror or v.actor().category() != 'person') else f_mirror(f_encode(v.clone(sharedarray=True)))  # do not mirror vehicle activities
        f_totensorlist = lambda V: [t for v in V for t in f_totensor(v)]        
        def f_reduce(T,V):
            j = sum([v.actor().category() == 'person' for v in V])  # person mirrored, vehicle not mirrored
            (tm, t) = torch.split(T, (2*j, len(T)-2*j), dim=0)  # assumes sorted order, person first, only person/vehicle
            return torch.cat((torch.mean(tm.view(-1, 2, tm.shape[1]), dim=1), t), dim=0) if j>0 else T  # mean over mirror augmentation

        try:
            with torch.no_grad():                                
                vp = next(vi)  # peek in generator to create clip
                vi = itertools.chain([vp], vi)  # unpeek
                sw = vipy.util.Stopwatch() if throttle else None  # real-time framerate estimate
                framerate = vp.framerate()
                for (k, (vo,vc)) in enumerate(zip(vi, vp.stream(buffered=buffered).clip(n, m, continuous=True, activities=False, delay=dt))):
                    videotracks = [] if vc is None else [vt for vt in vc.trackfilter(lambda t: len(t)>=4 and (t.category() == 'person' or (t.category() == 'vehicle' and vo.track(t.id()).ismoving(k-10*n+dt, k+dt)))).tracksplit()]  # vehicle moved recently?
                    if throttle:
                        videotracks.sort(key=lambda v: v.actor().confidence(last=1))  # in-place                                            
                        numdets = (maxdets if ((avgdets is None) or (sw.duration()<=60) or ((sw.duration()>60) and ((k/sw.duration())/vp.framerate())>0.8)) else
                                   (avgdets if ((k/sw.duration())/vp.framerate())>0.67 else int(avgdets//2)))   # real-time throttle schedule
                        videotracks = videotracks[-numdets:] if (numdets is not None and len(videotracks)>numdets) else videotracks   # select only the most confident for detection                
                    videotracks.sort(key=lambda v: v.actor().category())  # in-place, for grouping mirrored encoding: person<vehicle

                    if len(videotracks)>0 and (k+dt > n): 
                        logits = self.forward(torch.stack(f_totensorlist(videotracks))) # augmented logits in track index order, copy
                        logits = f_reduce(logits, videotracks) if mirror else logits  # reduced logits in track index order
                        (actorid, actorcategory) = ([t.actorid() for t in videotracks], [t.actor().category() for t in videotracks])
                        dets = [vipy.activity.Activity(category=aa[category], shortlabel=self._class_to_shortlabel[category], startframe=k-n+dt, endframe=k+dt, confidence=sm, framerate=framerate, actorid=actorid[j], attributes={'pip':category, 'logit':float(logit)})
                                for (j, category_sm_logit) in enumerate(self.softmax(logits))  # (classname, softmax, logit), unsorted
                                for (category, sm, logit) in category_sm_logit
                                if ((category in aa) and   # requested activities only
                                    (actorcategory[j] in self._verb_to_noun[category]) and   # noun matching with category renaming dictionary
                                    sm>=minprob)]   # minimum probability for new activity detection
                        vo.assign(k+dt, dets, activityiou=activityiou, activitymerge=False, activitynms=True)   # assign new activity detections by non-maximum suppression (merge happens at the end)
                        del logits, dets, videotracks  # torch garbage collection

                    if not isinstance(finalized, bool) and k > 0 and k%finalized == 0:
                        self.finalize(vo, trackconf=trackconf, startframe=k-finalized-5, endframe=k-5, mintracklen=mintracklen)  
                        
                    yield vo

        except Exception as e:                
            raise

        finally:
            if not (finalized is False):
                self.finalize(vo, trackconf=trackconf, mintracklen=mintracklen) if finalized == True else self.finalize(vo, trackconf=trackconf, startframe=(k//finalized)*finalized-4, endframe=k, mintracklen=mintracklen)

Ancestors

  • PIP_370k
  • PIP_250k
  • pytorch_lightning.core.lightning.LightningModule
  • abc.ABC
  • pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin
  • pytorch_lightning.core.grads.GradInformation
  • pytorch_lightning.core.saving.ModelIO
  • pytorch_lightning.core.hooks.ModelHooks
  • pytorch_lightning.core.hooks.DataHooks
  • pytorch_lightning.core.hooks.CheckpointHooks
  • torch.nn.modules.module.Module
  • ActivityRecognition

Subclasses

  • ActivityTrackerCap

Class variables

var dump_patches : bool
var training : bool

Methods

def finalize(self, vo, trackconf=None, activityconf=None, startframe=None, endframe=None, mintracklen=None)

In place filtering of video to finalize
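
The sketch below illustrates the two finalization modes described for the finalized argument of __call__; the threshold values are illustrative only, and detector/vi follow the hypothetical setup sketched for the class above. The two loops are alternatives, each consuming its own tracker generator.

# Streaming: finalize every 256 frames on an unbounded input (finalized=int)
for vo in detector(vi, trackconf=0.2, finalized=256):
    pass

# Deferred: disable finalization during streaming, then finalize once at the end
for vo in detector(vi, finalized=False):
    pass
vo = detector.finalize(vo, trackconf=0.2, activityconf=0.1, mintracklen=30)   # thresholds are illustrative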

Expand source code Browse git
def finalize(self, vo, trackconf=None, activityconf=None, startframe=None, endframe=None, mintracklen=None):
    """In place filtering of video to finalize"""
    assert isinstance(vo, vipy.video.Scene)

    tofinalize = set([ai for (ai,a) in vo.activities().items() if (endframe is None or a.endframe() <= endframe) and (startframe is None or a.endframe() >= startframe)])
    tofinalize = tofinalize.union([ti for (ti,t) in vo.tracks().items() if ((endframe is None or t.endframe() <= endframe) and (startframe is None or t.endframe() >= startframe)) or any([ti == vo.activities(id=ai).actorid() for ai in tofinalize])])

    # Bad tracks:  Remove low confidence or too short non-moving tracks, and associated activities
    # - will throw exception that 'vo referenced before assignment' if one loop did not succeed
    if trackconf is not None:
        vo.trackfilter(lambda t: t.id() not in tofinalize or len(t)>=(mintracklen if mintracklen is not None else vo.framerate()) and (t.confidence() >= trackconf or t.startbox().iou(t.endbox()) == 0)).activityfilter(lambda a: a.id() not in tofinalize or a.actorid() in vo.tracks())  
    
    # Activity probability:  noun_probability*verb probability
    nounconf = {k:t.confidence(samples=8) for (k,t) in vo.tracks().items() if t.id() in tofinalize}   # 
    vo.activitymap(lambda a: a.confidence(nounconf[a.actorid()]*a.confidence()) if a.id() in tofinalize else a)
    
    # Missing objects:  Significantly reduce confidence of complex classes (yuck)
    vo.activitymap(lambda a: a.confidence(0.01*a.confidence()) if (a.id() in tofinalize and a.category() in ['person_purchases']) else a) 
    
    # Vehicle turns:  High confidence vehicle turns must be a minimum angle
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                  (a.category() in ['vehicle_turns_left', 'vehicle_turns_right']) and
                                                                  (abs(vo.track(a.actorid()).bearing_change(a.startframe(), a.endframe(), dt=vo.framerate(), samples=5)) < (np.pi/16))) else a) 
    
    # Vehicle turns:  U-turn can only be distinguished from left/right turn at the end of a track by looking at the turn angle
    vo.activitymap(lambda a: a.category('vehicle_makes_u_turn').shortlabel('u turn') if (a.id() in tofinalize and
                                                                                         (a.category() in ['vehicle_turns_left', 'vehicle_turns_right']) and
                                                                                         (abs(vo.track(a.actorid()).bearing_change(a.startframe(), a.endframe(), dt=vo.framerate(), samples=5)) > (np.pi-(np.pi/2)))) else a)
    
    # Background activities:  Use logistic confidence on logit due to lack of background class "person stands", otherwise every standing person is using a phone
    if self._calibrated_constant is not None:            
        f_logistic = lambda x,b,s=1.0: float(1.0 / (1.0 + np.exp(-s*(x + b))))
        vo.activitymap(lambda a: a.confidence(a.confidence()*f_logistic(a.attributes['logit'], self._calibrated_constant)) if a.id() in tofinalize else a)  
        
    # Complex activities: remove steal/abandon and replace with picks up / puts down
    vo.activityfilter(lambda a: a.category() not in ['person_steals_object', 'person_abandons_package'])
    newlist = [vo.add(vipy.activity.Activity(startframe=a.startframe(), endframe=a.endframe(), category='person_steals_object', shortlabel='steals', confidence=0.5*a.confidence(), framerate=vo.framerate(), actorid=a.actorid(), attributes={'pip':'person_picks_up_object'}))
               for a in vo.activitylist() if a.category() == 'person_picks_up_object']
    newlist = [vo.add(vipy.activity.Activity(startframe=a.startframe(), endframe=a.endframe(), category='person_abandons_package', shortlabel='abandons', confidence=0.5*a.confidence(), framerate=vo.framerate(), actorid=a.actorid(), attributes={'pip':'person_puts_down_object'}))
               for a in vo.activitylist() if a.category() == 'person_puts_down_object']
        
    # Vehicle/person interaction: 'vehicle_drops_off_person'/'vehicle_picks_up_person'  must be followed by car driving away/pulling up, must be accompanied by person track start/end
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                  a.category() == 'vehicle_drops_off_person' and
                                                                  (not vo.track(a.actorid()).ismoving(a.middleframe(), a.endframe()+10*vo.framerate()) or
                                                                   not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), t.startframe(), t.startframe()+1) > 0 for t in vo.tracks().values()]))) else a)
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                  a.category() == 'vehicle_picks_up_person' and
                                                                  (not vo.track(a.actorid()).ismoving(a.startframe()-10*vo.framerate(), a.middleframe()) or
                                                                   not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), t.endframe()-1, t.endframe()) > 0 for t in vo.tracks().values()]))) else a)
    
    # Person/Bicycle track: riding must be accompanied by an associated moving bicycle track
    vo.activityfilter(lambda a: a.id() not in tofinalize or a.category() != 'person_rides_bicycle')
    bikelist = [vo.add(vipy.activity.Activity(startframe=t.startframe(), endframe=t.endframe(), category='person_rides_bicycle', shortlabel='rides', confidence=t.confidence(samples=8), framerate=vo.framerate(), actorid=t.id(), attributes={'pip':'person_rides_bicycle'}))
                for (tk,t) in vo.tracks().items() if (t.id() in tofinalize and t.category() == 'bicycle' and t.ismoving())]
    
    # Person/Vehicle track: person/vehicle interaction must be accompanied by an associated stopped vehicle track
    dstbox = {k:vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for (k,a) in vo.activities().items() if (a.id() in tofinalize and a.category().startswith('person') and ('vehicle' in a.category() or 'trunk' in a.category()))}  # precompute            
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                  (a.category().startswith('person') and ('vehicle' in a.category() or 'trunk' in a.category())) and
                                                                  not any([t.category() == 'vehicle' and 
                                                                           t.during(a.startframe()) and
                                                                           not t.ismoving(a.startframe(), a.endframe()) and
                                                                           t[a.startframe()].hasintersection(dstbox[a._id])
                                                                           for t in vo.tracks().values()])) else a)
    
    # Vehicle/Person track: vehicle/person interaction must be accompanied by an associated person track
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and (a.category().startswith('vehicle') and ('person' in a.category())) and not any([t.category() == 'person' and t.segment_maxiou(vo.track(a._actorid), a._startframe, a._endframe) > 0 for t in vo.tracks().values()])) else a)
    
    # Person track: enter/exit scene cannot be at the image boundary
    boundary = vo.framebox().dilate(0.9)
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and a.category() == 'person_enters_scene_through_structure' and vo.track(a.actorid())[max(a.startframe(), vo.track(a.actorid()).startframe())].cover(boundary) < 1) else a)
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and a.category() == 'person_exits_scene_through_structure' and vo.track(a.actorid())[min(a.endframe(), vo.track(a.actorid()).endframe())].cover(boundary) < 1) else a)
    
    # Activity union:  Temporal gaps less than support should be merged into one activity detection for a single track
    # Activity union:  "Brief" breaks (<5 seconds) of confident activities should be merged into one activity detection for a single track
    briefmerge = set(['person_reads_document', 'person_interacts_with_laptop', 'person_talks_to_person', 'person_purchases', 'person_steals_object', 'person_talks_on_phone', 'person_texts_on_phone', 'person_rides_bicycle', 'person_carries_heavy_object', 'person', 'person_walks', 'vehicle', 'car_moves'])  
    merged = set([])
    mergeable_dets = [a for a in vo.activities().values() if a.id() in tofinalize and a.confidence() > 0.2]  # only mergeable detections
    mergeable_dets.sort(key=lambda a: a.startframe())  # in-place
    for a in mergeable_dets:
        for o in mergeable_dets:
            if ((o._startframe >= a._startframe) and (a._id != o._id) and (o._actorid == a._actorid) and (o._label == a._label) and (o._id not in merged) and (a._id not in merged) and
                ((a.temporal_distance(o) <= self.temporal_support() or (a.category() in briefmerge and a.temporal_distance(o) < 5*vo.framerate())))):
                a.union(o)  # in-place update
                merged.add(o.id())
    vo.activityfilter(lambda a: a.id() not in tofinalize or a.id() not in merged)

    # Group activity: Must be accompanied by a friend with the same activity detection
    categories = ['person_embraces_person', 'hand_interacts_with_person', 'person_talks_to_person', 'person_transfers_object']           
    dstbox = {k:vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for (k,a) in vo.activities().items() if a.id() in tofinalize and a.category() in categories}  # precompute
    srcbox = {k:bb.clone().maxsquare().dilate(1.2) for (k,bb) in dstbox.items()}                            
    vo.activitymap(lambda a: a.confidence(0.1*a.confidence()) if (a.id() in tofinalize and
                                                                  a._label in categories and
                                                                  not any([(af._label == a._label and
                                                                            af._id != a._id and
                                                                            af._actorid != a._actorid and 
                                                                            srcbox[a._id].hasintersection(dstbox[af._id]))
                                                                           for af in vo.activities().values() if af.during_interval(a._startframe, a._endframe, inclusive=True)])) else a)
    
    # Activity group suppression:  Group activities may have at most one activity detection of this type per group in a spatial region surrounding the actor
    tosuppress = set(['hand_interacts_with_person', 'person_embraces_person', 'person_transfers_object', 'person_steals_object', 'person_purchases', 'person_talks_to_person'])
    suppressed = set([])
    activitybox = {a.id():vo.track(a.actorid()).boundingbox(a.startframe(), a.endframe()) for a in vo.activities().values() if a.id() in tofinalize and a.category() in tosuppress}
    activitybox = {k:bb.dilate(1.2).maxsquare() if bb is not None else bb for (k,bb) in activitybox.items()}
    candidates = [a for a in vo.activities().values() if a.id() in tofinalize]
    for a in sorted(candidates, key=lambda a: a.confidence(), reverse=True):  # decreasing confidence
        if a.category() in tosuppress:
            for o in candidates:  # other activities
                if (o._actorid != a._actorid and  # different tracks
                    o._label == a._label and  # same category
                    o.confidence() <= a.confidence() and   # lower confidence
                    o._id not in suppressed and  # not already suppressed
                    o.during_interval(a.startframe(), a.endframe()) and # overlaps temporally by at least one frame
                    (activitybox[a._id] is not None and activitybox[o._id] is not None) and   # has valid tracks
                    activitybox[a._id].hasintersection(activitybox[o._id]) and  # has coarse overlap 
                    vo.track(a.actorid()).clone().maxsquare().dilate(1.2).segment_maxiou(vo.track(o.actorid()), a.startframe(), a.endframe()) > 0):  # has fine overlap "close by"
                    suppressed.add(o.id())  # greedy non-maximum suppression of lower confidence activity detection
    vo.activityfilter(lambda a: a.id() not in tofinalize or a.id() not in suppressed)

    # Activity duration
    vo.activitymap(lambda a: a.padto(5) if a.id() in tofinalize and a.category() in ['person_talks_to_person', 'person_interacts_with_laptop', 'person_reads_document', 'person_purchases'] else a)   
    vo.activitymap(lambda a: a.duration(2, centered=False) if a.id() in tofinalize and a.category() in ['person_opens_vehicle_door', 'person_closes_vehicle_door'] else a)
    vo.activitymap(lambda a: a.duration(2, centered=True) if a.id() in tofinalize and a.category() in ['person_enters_scene_through_structure', 'person_exits_scene_through_structure'] else a)
    vo.activitymap(lambda a: a.startframe(0) if a.id() in tofinalize and a.startframe() < 0 else a)

    # Activity confidence
    if activityconf is not None:
        vo.activityfilter(lambda a: a.id() not in tofinalize or a.confidence() >= activityconf)

    return vo
def forward(self, x) ‑> Callable[..., Any]

Overload forward for multi-gpu batch. Don't use torch DataParallel!

Expand source code Browse git
def forward(self, x):
    """Overload forward for multi-gpu batch.  Don't use torch DataParallel!"""
    if self._gpus is None:
        return super().forward(x)  # cpu
    else:
        x_forward = None            
        for b in x.pin_memory().split(self._batchsize_per_gpu*len(self._gpus)):  # pinned copy
            n_todevice = np.sum(np.array([1 if k<len(b) else 0 for k in range(int(len(self._devices)*np.ceil(len(b)/len(self._devices))))]).reshape(-1, len(self._devices)), axis=0).tolist()
            todevice = [t.to(d, non_blocking=True) for (t,d) in zip(b.split(n_todevice), self._devices) if len(t)>0]   # async device copy
            ondevice = [m(t) for (m,t) in zip(self._gpus, todevice)]   # async
            fromdevice = torch.cat([t.cpu() for t in ondevice], dim=0)
            x_forward = fromdevice if x_forward is None else torch.cat((x_forward, fromdevice), dim=0)
            del ondevice, todevice, fromdevice, b  # force garbage collection of GPU memory
        del x  # force garbage collection
        return x_forward
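
A brief usage sketch (illustrative; the GPU list, batch size and model file below are assumptions, not values from the source). Callers pass a single stacked CPU tensor of preprocessed clips and receive the concatenated logits back on the CPU; the per-GPU splitting and replica dispatch happen inside this method, so the module must not be wrapped in torch.nn.DataParallel.

# Hypothetical usage sketch: constructor values are illustrative assumptions
tracker = ActivityTrackerCap(gpus=[0, 1], batchsize=128, modelfile='cap.ckpt')
x = torch.stack([tracker.totensor(v) for v in clips])  # clips: list of vipy.video.Scene actor clips -> (B, 4, 16, 112, 112)
with torch.no_grad():
    logits = tracker.forward(x)  # batches are split across the configured GPUs and concatenated back on the CPU
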
def logit_pooling(self, x_logits, classmap)
Expand source code Browse git
def logit_pooling(self, x_logits, classmap):
    if self._logit_pooling is None:
        assert isinstance(classmap, dict)
        index_to_class = {k:c for (k,c) in enumerate(set([k if v is None else v for (k,v) in classmap.items()]))}  # target class index
        class_to_index = {v:k for (k,v) in index_to_class.items()}  # target class to index
        class_to_group = [[self.class_to_index(c[0]) for c in v if c[0] in self.class_to_index()] for (k,v) in sorted(vipy.util.groupbyasdict([(k,k) if v is None else (k,v) for (k,v) in classmap.items()], lambda x: x[1]).items(), key=lambda x: class_to_index[x[0]])]  # target class to source index
        self._logit_pooling = {'index_to_class': index_to_class, 'class_to_group': class_to_group}
    (index_to_class, class_to_group) = (self._logit_pooling['index_to_class'], self._logit_pooling['class_to_group'])  # cached
    yh = torch.tensor([max(r[j]) if len(r[j])>0 else (min(r)) for (i,r) in enumerate(x_logits) for j in class_to_group]).reshape(len(x_logits), len(index_to_class)).detach()
    yh_softmax = F.softmax(yh, dim=1).detach().cpu()
    return [[(index_to_class[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh.cpu(), yh_softmax)]
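
The classmap argument maps each source class to a coarser target class, or to None to keep the class as its own target; the pooled logit for a target is the maximum over the logits of its source classes, and the softmax is taken over the pooled targets. A minimal sketch, assuming net is a trained instance whose class list contains the categories below (the pooled target name is purely illustrative):

# Hypothetical classmap: pool two fine-grained door classes into one illustrative target
classmap = {'person_opens_vehicle_door': 'person_uses_vehicle_door',
            'person_closes_vehicle_door': 'person_uses_vehicle_door',
            'person_rides_bicycle': None}  # None keeps the class as its own target
pooled = net.logit_pooling(x_logits, classmap)  # [[(target_class, softmax, pooled_logit), ...] per clip]
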
def lrt(self, x_logits, lrt_threshold=None)

top-k with likelihood ratio test with background null hypothesis

Expand source code Browse git
def lrt(self, x_logits, lrt_threshold=None):
    """top-k with likelihood ratio test with background null hypothesis"""
    j_bg_person = self._class_to_index['person'] if 'person' in self._class_to_index else self._class_to_index['person_walks']  # FIXME
    j_bg_vehicle = self._class_to_index['vehicle'] if 'vehicle' in self._class_to_index else self._class_to_index['car_moves']  # FIXME

    yh = x_logits.detach().cpu().numpy()
    yh_softmax = F.softmax(x_logits, dim=1).detach().cpu()
    p_null = np.maximum(yh[:, j_bg_person], yh[:, j_bg_vehicle]).reshape(yh.shape[0], 1)
    lr = yh - p_null   # ~= log likelihood ratio
    f_logistic = lambda x,b,s=1.0: float(1.0 / (1.0 + np.exp(-s*(x + b))))
    return [sorted([(self.index_to_class(j), float(s[j]), float(t[j]), f_logistic(s[j], 1.0)*f_logistic(t[j], 0.0), float(sm[j])) for j in range(len(s)) if (lrt_threshold is None or t[j] >= lrt_threshold)], key=lambda x: x[3], reverse=True) for (s,t,sm) in zip(yh, lr, yh_softmax)]
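
In other words (a reading of the source, not an additional API): the null hypothesis logit is the larger of the two background classes ('person' or 'person_walks', and 'vehicle' or 'car_moves'), the likelihood-ratio proxy for each class is its logit minus that null, and classes are ranked by sigmoid(logit + 1) * sigmoid(logit - null). A minimal sketch, assuming net is a trained instance and logits is its forward() output:

# Hypothetical usage sketch (illustrative)
ranked = net.lrt(logits, lrt_threshold=0.0)  # keep only classes whose logit is at least the background null
(label, logit, llr, confidence, prob) = ranked[0][0]  # best non-background hypothesis for the first clip, if any passed
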
def softmax(self, x_logits)

Return a list of lists [(class_label, float(softmax), float(logit)), … ] for all classes and batches

Expand source code Browse git
def softmax(self, x_logits):
    """Return a list of lists [(class_label, float(softmax), float(logit) ... ] for all classes and batches"""
    yh = x_logits.detach().cpu().numpy()
    yh_softmax = F.softmax(x_logits, dim=1).detach().cpu()
    d = self.index_to_class()
    if not self._calibrated:
        return [[(d[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh, yh_softmax)]
    else:
        yh_softmax = self.calibration(x_logits)
        return [[(d[j], float(sm[j]), float(s[j])) for j in range(len(sm))] for (s,sm) in zip(yh, yh_softmax)]            
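
A minimal usage sketch (illustrative), assuming net is a trained instance; when the model was constructed with calibrated=True the reported probabilities come from calibration() rather than a raw softmax:

# Hypothetical usage sketch (illustrative)
scores = net.softmax(net.forward(x))                 # [[(class, probability, logit), ...] per clip]
best = [max(s, key=lambda t: t[1]) for s in scores]  # most probable class per clip
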
def temporal_stride(self, s=None)
Expand source code Browse git
def temporal_stride(self, s=None):
    if s is not None:
        self._stride = s
        return self
    else:
        return self._stride

Inherited members

class ActivityTrackerCap (stride=3, activities=None, gpus=None, batchsize=None, calibrated=False, modelfile=None, calibrated_constant=None, unitnorm=False)

Video Activity detection.

Args (call)

vi [generator of vipy.video.Scene]
The input video to be updated in place with detections. This is a generator which is output from heyvi.detection.MultiscaleVideoTracker.__call__
activityiou [float]
The minimum temporal IoU for activity assignment
mirror [bool]
If true, encode using the mean of a video encoding and the mirrored video encoding. This is slower as it requires 2x GPU forward passes
minprob [float]
The minimum probability for a new activity detection
trackconf [float]
The minimum object detection confidence for new tracks
maxdets [int]
The maximum number of allowable detections per frame. If there are more detections per frame than maxdets, sort them by confidence and use only the top maxdets
avgdets [int]
The number of allowable detections per frame if throttled
buffered [bool]
If true, then buffer streams. This is useful for activity detection on live streams.
finalized [bool, int]
If False, do not finalize(); if True, finalize() only at the end; if int, finalize() every int frames. This is useful for streaming activity detection on unbounded inputs.

Returns

The input video is updated in place.
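
A minimal construction-and-call sketch (illustrative; the GPU list, batch size, model file, thresholds and the assignment of a return value are assumptions, not documented defaults):

# Hypothetical usage sketch: all values below are illustrative assumptions
tracker = ActivityTrackerCap(stride=3, gpus=[0], batchsize=64, modelfile='cap.ckpt', calibrated=True)
# vi is a generator of vipy.video.Scene from heyvi.detection.MultiscaleVideoTracker (see Args above)
vo = tracker(vi, activityiou=0.1, minprob=0.04, trackconf=0.2, buffered=True, finalized=True)
# the input video is updated in place with activity detections (see Returns above)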

Expand source code Browse git
class ActivityTrackerCap(ActivityTracker, CAP):
    def __init__(self, stride=3, activities=None, gpus=None, batchsize=None, calibrated=False, modelfile=None, calibrated_constant=None, unitnorm=False):
        ActivityTracker.__init__(self, stride=stride, activities=activities, gpus=gpus, batchsize=batchsize, mlbl=False, mlfl=True, modelfile=modelfile)
        CAP.__init__(self, modelfile=modelfile, deterministic=False, pretrained=None, mlbl=None, mlfl=True, calibrated_constant=calibrated_constant, calibrated=calibrated, unitnorm=unitnorm)

Ancestors

  • ActivityTracker
  • CAP
  • PIP_370k
  • PIP_250k
  • pytorch_lightning.core.lightning.LightningModule
  • abc.ABC
  • pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin
  • pytorch_lightning.core.grads.GradInformation
  • pytorch_lightning.core.saving.ModelIO
  • pytorch_lightning.core.hooks.ModelHooks
  • pytorch_lightning.core.hooks.DataHooks
  • pytorch_lightning.core.hooks.CheckpointHooks
  • torch.nn.modules.module.Module
  • ActivityRecognition

Class variables

var dump_patches : bool
var training : bool

Inherited members

class CAP (modelfile=None, deterministic=False, pretrained=None, mlbl=None, mlfl=True, calibrated_constant=None, calibrated=False, unitnorm=False, bgbce=False)

Activity recognition using people in public - 250k stabilized

Expand source code Browse git
class CAP(PIP_370k, pl.LightningModule, ActivityRecognition):
    def __init__(self, modelfile=None, deterministic=False, pretrained=None, mlbl=None, mlfl=True, calibrated_constant=None, calibrated=False, unitnorm=False, bgbce=False):
        pl.LightningModule.__init__(self)
        ActivityRecognition.__init__(self)  

        self._input_size = 112
        self._num_frames = 16        
        self._mean = [0.485, 0.456, 0.406]
        self._std = [0.229, 0.224, 0.225]
        self._mlfl = True
        self._mlbl = False
        self._calibrated_constant = calibrated_constant
        self._calibrated = calibrated
        self._unitnorm = unitnorm
        self._bgbce = bgbce

        if deterministic:
            np.random.seed(42)

        version = 5  
        
        if version == 1:
            print('[heyvi.recognition.CAP]: version == 1')  # cap_l2norm_e23s96095.ckpt and earlier

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            # - WARNING: under-represented classes are truncated at a maximum weight of one
            # - python 3.7 can use importlib.resources
            self._class_to_training_weight = {k:float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_training_weight.csv'))}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility
            
            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class.csv'))}
            self._index_to_training_weight = {self._class_to_index[k]:v for (k,v) in self._class_to_weight.items()}
            
            # Generated using vipy.dataset.Dataset.class_to_shortlabel()
            self._class_to_shortlabel = dict(vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_shortlabel.csv')))
            self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )

        elif version == 2:
            print('[heyvi.recognition.CAP]: version==2')

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            # - WARNING: under-represented classes are truncated at a maximum weight of one
            self._index_to_training_weight = {int(k):float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'coarse_index_to_training_weight.csv'))}

            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'coarse_class_to_index.csv'))}
            self._index_to_class = {int(k):v for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'coarse_index_to_class.csv'))}

            # Derived
            self._class_to_training_weight = {k:self._index_to_training_weight[v] for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility

        elif version == 3:
            print('[heyvi.recognition.CAP]: version==3')

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            self._index_to_training_weight = {int(k):float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_index_to_training_weight.csv'))}

            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_class_to_index.csv'))}
            self._index_to_class = {int(k):v for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_index_to_class.csv'))}

            # Derived
            self._class_to_training_weight = {k:self._index_to_training_weight[v] for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility

        elif version == 4:
            print('[heyvi.recognition.CAP]: version==4')

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            self._index_to_training_weight = {int(k):float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'joint_background_index_to_training_weight.csv'))}

            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_class_to_index.csv'))}
            self._index_to_class = {int(k):v for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'background_index_to_class.csv'))}

            # Derived
            self._class_to_training_weight = {k:self._index_to_training_weight[v] for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility
            
        elif version == 5:
            print('[heyvi.recognition.CAP]: version == 5') 

            # Generated using vipy.dataset.Dataset.multilabel_inverse_frequency_weight()
            # - WARNING: under-represented classes are truncated at a maximum weight of one
            # - python 3.7 can use importlib.resources
            self._index_to_training_weight = {k:float(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'index_to_meva_training_weight.csv'))}
            
            # Generated using vipy.dataset.Dataset.class_to_index()
            self._class_to_index = {k:int(v) for (k,v) in vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class.csv'))}
            self._class_to_training_weight = {k:self._index_to_training_weight[v] if v in self._index_to_training_weight else 0 for (k,v) in self._class_to_index.items()}
            self._class_to_weight = self._class_to_training_weight  # backwards compatibility            

            # Generated using vipy.dataset.Dataset.class_to_shortlabel()
            self._class_to_shortlabel = dict(vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_shortlabel.csv')))
            self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )

        else:
            raise ValueError('invalid CAP version %s' % str(version))
            
        # Generated using vipy.dataset.Dataset.class_to_shortlabel()
        self._class_to_shortlabel = dict(vipy.util.readcsv(os.path.join(os.path.dirname(heyvi.__file__), 'model', 'cap', 'class_to_shortlabel.csv')))
        self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )


        # Calibration state: trained at validation epoch end
        if self._calibrated:
            self.register_buffer('_calibration_multiclass', torch.zeros(1,1))
            self.register_buffer('_calibration_binary', torch.zeros(3,self.num_classes()))

        if modelfile is not None:
            self._load_trained(modelfile)
        else:
            self._load_pretrained()
            self.net.fc = nn.Linear(self.net.fc.in_features, self.num_classes())

    
    #---- <LIGHTNING>
    def forward(self, x):
        return self.net(x)  

    def validation_step(self, batch, batch_nb):
        s = self.training_step(batch, batch_nb, logging=False, valstep=True)
        self.log('val_loss', s['loss'], on_step=True, on_epoch=True, prog_bar=False, logger=True)
        return {'val_loss': s['loss'], 'logit': s['logit'], 'classindex':s['classindex']}

    def validation_step_end(self, outputs):
        return {'val_loss': outputs['val_loss'],  
                'logit': outputs['logit'],  # accumulated across GPUs (single machine multi-GPU only)
                'classindex': outputs['classindex']}

    def validation_epoch_end(self, outputs):        
        avg_loss = torch.cat([x['val_loss'].flatten() for x in outputs]).mean()
        self.log('val_loss', avg_loss, on_epoch=True, prog_bar=False, logger=True)
        self.log('avg_val_loss', avg_loss, on_epoch=True, prog_bar=True, logger=True)  # for checkpointing

        # Calibration: will be saved as registered buffer in checkpoint for calibration
        if self._calibrated and self.trainer.is_global_zero:
            from netcal.scaling import LogisticCalibration, TemperatureScaling
            logits = torch.stack([x for output in outputs for x in output['logit']]).detach().cpu().numpy()
            ground_truth = torch.cat([x.flatten() for output in outputs for x in output['classindex']]).flatten().detach().cpu().numpy()
            multiclass = TemperatureScaling()
            multiclass.fit(F.softmax(torch.from_numpy(logits), dim=1).cpu().numpy(), ground_truth)
            binary = {k:(LogisticCalibration(), float(np.mean(logits[:,k]))) for k in sorted(self.class_to_index().values())}
            for (k,(b,m)) in binary.items():
                (binary_confidences, binary_ground_truth) = (torch.sigmoid(torch.from_numpy(logits[:,k]-m).flatten()).cpu().numpy(), np.array([1 if y==k else 0 for y in ground_truth]))
                if np.any(binary_ground_truth):
                    b.fit(binary_confidences, binary_ground_truth)
                else:
                    binary[k] = (None,0)  # no samples for calibration, prediction will always be zero

            with torch.no_grad():
                calibration_multiclass = torch.tensor(float(multiclass._sites['weights']['values']))
                calibration_binary = torch.stack((torch.tensor([float(b._sites['weights']['values']) if b is not None else 0 for (k,(b,m)) in binary.items()]), 
                                                  torch.tensor([float(b._sites['bias']['values']) if b is not None else 0 for (k,(b,m)) in binary.items()]), 
                                                  torch.tensor([float(m) if b is not None else 0 for (k,(b,m)) in binary.items()])))
            
                self._calibration_multiclass[:] = calibration_multiclass.to(device=torch.device('cuda:0'))
                self._calibration_binary[:] = calibration_binary.to(device=torch.device('cuda:0'))
    #---- </LIGHTNING>
        
    def totensor(self, v=None, training=False, validation=False, show=False, doflip=False, asjson=False):
        """Return captured lambda function if v=None, else return tensor"""    
        assert v is None or isinstance(v, vipy.video.Scene), "Invalid input"
        f = (lambda v, num_frames=self._num_frames, input_size=self._input_size, mean=self._mean, std=self._std, training=training, validation=validation, show=show, classname=self.__class__.__name__, doflip=doflip, asjson=asjson:
             PIP_370k._totensor(v, training, validation, input_size, num_frames, mean, std, noflip=['car_turns_left', 'car_turns_right', 'vehicle_turns_left', 'vehicle_turns_right', 'motorcycle_turns_left', 'motorcycle_turns_right'], show=show, doflip=doflip, asjson=asjson, classname=classname))
        return f(v) if v is not None else f

    def calibration(self, x_logits):
        assert torch.is_tensor(self._calibration_multiclass) and self._calibration_multiclass.shape == (1,1)
        assert torch.is_tensor(self._calibration_binary) and self._calibration_binary.shape == (3, self.num_classes())
        (n, T, (w,b,o), eps) = (self.num_classes(), self._calibration_multiclass, self._calibration_binary, np.finfo(np.float64).eps)  # (TemperatureScaling, PlattScaling=(weight, bias, offset))

        lr = torch.from_numpy(np.multiply(1.0/(1+np.exp(-(np.multiply(np.array(w*0 + 1).reshape(1,n).astype(np.float64), scipy.special.logit(np.clip(torch.sigmoid(x_logits-o.view(1,n)).detach().cpu().numpy().astype(np.float64), eps, 1-eps))))+np.array(b).reshape(1,n).astype(np.float64))), (np.array(o)!=0).reshape(1,n)).astype(np.float32))
        sm = F.softmax(torch.log(torch.clamp(F.softmax(x_logits, dim=1), eps, 1-eps)) / T, dim=1)
        
        #sm = F.softmax(x_logits / T, dim=1)  # temperature only
        #lr = torch.multiply(torch.sigmoid(x_logits-o.view(1,n)+b.view(1,n)), (o!=0).view(1,n))   # bias only
        return torch.multiply(sm, lr)

Ancestors

  • PIP_370k
  • PIP_250k
  • pytorch_lightning.core.lightning.LightningModule
  • abc.ABC
  • pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin
  • pytorch_lightning.core.grads.GradInformation
  • pytorch_lightning.core.saving.ModelIO
  • pytorch_lightning.core.hooks.ModelHooks
  • pytorch_lightning.core.hooks.DataHooks
  • pytorch_lightning.core.hooks.CheckpointHooks
  • torch.nn.modules.module.Module
  • ActivityRecognition

Subclasses

Class variables

var dump_patches : bool
var training : bool

Methods

def calibration(self, x_logits)
Expand source code Browse git
def calibration(self, x_logits):
    assert torch.is_tensor(self._calibration_multiclass) and self._calibration_multiclass.shape == (1,1)
    assert torch.is_tensor(self._calibration_binary) and self._calibration_binary.shape == (3, self.num_classes())
    (n, T, (w,b,o), eps) = (self.num_classes(), self._calibration_multiclass, self._calibration_binary, np.finfo(np.float64).eps)  # (TemperatureScaling, PlattScaling=(weight, bias, offset))

    lr = torch.from_numpy(np.multiply(1.0/(1+np.exp(-(np.multiply(np.array(w*0 + 1).reshape(1,n).astype(np.float64), scipy.special.logit(np.clip(torch.sigmoid(x_logits-o.view(1,n)).detach().cpu().numpy().astype(np.float64), eps, 1-eps))))+np.array(b).reshape(1,n).astype(np.float64))), (np.array(o)!=0).reshape(1,n)).astype(np.float32))
    sm = F.softmax(torch.log(torch.clamp(F.softmax(x_logits, dim=1), eps, 1-eps)) / T, dim=1)
    
    #sm = F.softmax(x_logits / T, dim=1)  # temperature only
    #lr = torch.multiply(torch.sigmoid(x_logits-o.view(1,n)+b.view(1,n)), (o!=0).view(1,n))   # bias only
    return torch.multiply(sm, lr)
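
A reading of the arithmetic above (a simplification up to the epsilon clipping, not an alternative implementation): with temperature T = _calibration_multiclass and per-class (weight, bias, offset) rows (w, b, o) = _calibration_binary, the learned weight is forced to one, so for class j with logit x_j the calibrated confidence is

  lr_j = sigmoid(x_j - o_j + b_j)        (zero when o_j == 0, i.e. no calibration samples for class j)
  sm   = softmax(log(softmax(x)) / T)
  p_j  = sm_j * lr_j

i.e. a temperature-scaled multiclass probability gated by a per-class Platt-style binary probability.
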
def validation_step_end(self, outputs)

Use this when validating with dp or ddp2 because :meth:validation_step will operate on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss.

Note

If you later switch to ddp or some other mode, this will still be called so that you don't have to change your code.

.. code-block:: python

# pseudocode
sub_batches = split_batches_for_dp(batch)
batch_parts_outputs = [validation_step(sub_batch) for sub_batch in sub_batches]
validation_step_end(batch_parts_outputs)

Args

batch_parts_outputs
What you return in :meth:validation_step for each batch part.

Return

None or anything

.. code-block:: python

# WITHOUT validation_step_end
# if used in DP or DDP2, this batch is 1/num_gpus large
def validation_step(self, batch, batch_idx):
    # batch is 1/num_gpus big
    x, y = batch

    out = self.encoder(x)
    loss = self.softmax(out)
    loss = nce_loss(loss)
    self.log('val_loss', loss)

# --------------
# with validation_step_end to do softmax over the full batch
def validation_step(self, batch, batch_idx):
    # batch is 1/num_gpus big
    x, y = batch

    out = self(x)
    return out

def validation_step_end(self, val_step_outputs):
    for out in val_step_outputs:
        # do something with these

See Also: See the :ref:multi_gpu guide for more details.

Expand source code Browse git
def validation_step_end(self, outputs):
    return {'val_loss': outputs['val_loss'],  
            'logit': outputs['logit'],  # accumulated across GPUs (single machine multi-GPU only)
            'classindex': outputs['classindex']}

Inherited members

class PIP_250k (pretrained=True, deterministic=False, modelfile=None, mlbl=False, mlfl=True, unitnorm=False, bgbce=False)

Activity recognition using people in public - 250k stabilized

Expand source code Browse git
class PIP_250k(pl.LightningModule, ActivityRecognition):
    """Activity recognition using people in public - 250k stabilized"""
    
    def __init__(self, pretrained=True, deterministic=False, modelfile=None, mlbl=False, mlfl=True, unitnorm=False, bgbce=False):

        # FIXME: remove dependencies here
        from heyvi.model.pyvideoresearch.bases.resnet50_3d import ResNet503D, ResNet3D, Bottleneck3D
        import heyvi.model.ResNets_3D_PyTorch.resnet
        
        super().__init__()
        self._input_size = 112
        self._num_frames = 16        
        self._mean = [0.485, 0.456, 0.406]
        self._std = [0.229, 0.224, 0.225]
        self._mlfl = mlfl
        self._mlbl = mlbl
        self._bgbce = bgbce
        self._unitnorm = unitnorm

        if deterministic:
            np.random.seed(42)

        self._class_to_weight = {'car_drops_off_person': 1.4162811344926518, 'car_picks_up_person': 1.4103618337303332, 'car_reverses': 1.0847976470131024, 'car_starts': 1.0145749063037774, 'car_stops': 0.6659236295324015, 'car_turns_left': 2.942269221156227, 'car_turns_right': 1.1077783089040996, 'hand_interacts_with_person_highfive': 2.793646013249904, 'person': 0.4492053391155403, 'person_abandons_object': 1.0944029463871692, 'person_carries_heavy_object': 0.5848339202761978, 'person_closes_car_door': 0.8616907697519004, 'person_closes_car_trunk': 1.468393359799126, 'person_closes_facility_door': 0.8927495923340439, 'person_embraces_person': 0.6072654081071569, 'person_enters_car': 1.3259274145537951, 'person_enters_scene_through_structure': 0.6928103470838287, 'person_exits_car': 1.6366577285051707, 'person_exits_scene_through_structure': 0.8368692178634396, 'person_holds_hand': 1.2378881634203558, 'person_interacts_with_laptop': 1.6276031281396193, 'person_loads_car': 2.170167410167583, 'person_opens_car_door': 0.7601817241565009, 'person_opens_car_trunk': 1.7255285914206204, 'person_opens_facility_door': 0.9167411017455822, 'person_picks_up_object_from_floor': 1.123251610875369, 'person_picks_up_object_from_table': 3.5979689180114205, 'person_purchases_from_cashier': 7.144918373837205, 'person_purchases_from_machine': 5.920886403645001, 'person_puts_down_object_on_floor': 0.7295795950752353, 'person_puts_down_object_on_shelf': 9.247614426653692, 'person_puts_down_object_on_table': 1.9884672074906158, 'person_reads_document': 0.7940480628992879, 'person_rides_bicycle': 2.662661823600623, 'person_shakes_hand': 0.7819547332927879, 'person_sits_down': 0.8375202893491961, 'person_stands_up': 1.0285510019795079, 'person_steals_object_from_person': 1.0673909796893626, 'person_talks_on_phone': 0.3031855242664589, 'person_talks_to_person': 0.334895684562076, 'person_texts_on_phone': 0.713951043919232, 'person_transfers_object_to_car': 3.2832615561297605, 'person_transfers_object_to_person': 0.9633429807282274, 'person_unloads_car': 1.1051597100801462, 'vehicle': 1.1953172363332243}
        self._class_to_weight['person_puts_down_object_on_shelf'] = 1.0   # run 5

        self._class_to_index = {'car_drops_off_person': 0, 'car_picks_up_person': 1, 'car_reverses': 2, 'car_starts': 3, 'car_stops': 4, 'car_turns_left': 5, 'car_turns_right': 6, 'hand_interacts_with_person_highfive': 7, 'person': 8, 'person_abandons_object': 9, 'person_carries_heavy_object': 10, 'person_closes_car_door': 11, 'person_closes_car_trunk': 12, 'person_closes_facility_door': 13, 'person_embraces_person': 14, 'person_enters_car': 15, 'person_enters_scene_through_structure': 16, 'person_exits_car': 17, 'person_exits_scene_through_structure': 18, 'person_holds_hand': 19, 'person_interacts_with_laptop': 20, 'person_loads_car': 21, 'person_opens_car_door': 22, 'person_opens_car_trunk': 23, 'person_opens_facility_door': 24, 'person_picks_up_object_from_floor': 25, 'person_picks_up_object_from_table': 26, 'person_purchases_from_cashier': 27, 'person_purchases_from_machine': 28, 'person_puts_down_object_on_floor': 29, 'person_puts_down_object_on_shelf': 30, 'person_puts_down_object_on_table': 31, 'person_reads_document': 32, 'person_rides_bicycle': 33, 'person_shakes_hand': 34, 'person_sits_down': 35, 'person_stands_up': 36, 'person_steals_object_from_person': 37, 'person_talks_on_phone': 38, 'person_talks_to_person': 39, 'person_texts_on_phone': 40, 'person_transfers_object_to_car': 41, 'person_transfers_object_to_person': 42, 'person_unloads_car': 43, 'vehicle': 44}

        self._verb_to_noun = {k:set(['car','vehicle','motorcycle','bus','truck']) if (k.startswith('car') or k.startswith('motorcycle') or k.startswith('vehicle')) else set(['person']) for k in self.classlist()}        
        self._class_to_shortlabel = pycollector.label.pip_to_shortlabel  # FIXME: remove dependency here

        if pretrained:
            self._load_pretrained()
            self.net.fc = nn.Linear(self.net.fc.in_features, self.num_classes())
        elif modelfile is not None:
            self._load_trained(modelfile)
        
    def category(self, x):
        yh = self.forward(x if x.ndim == 5 else torch.unsqueeze(x, 0))
        return [self.index_to_class(int(k)) for (c,k) in zip(*torch.max(yh, dim=1))]

    def category_confidence(self, x):
        yh = self.forward(x if x.ndim == 5 else torch.unsqueeze(x, 0))
        return [(self.index_to_class(int(k)), float(c)) for (c,k) in zip(*torch.max(yh, dim=1))]

    def topk(self, x_logits, k):
        yh = x_logits.detach().cpu().numpy()
        topk = [[(self.index_to_class(j), s[j]) for j in i[-k:][::-1]] for (s,i) in zip(yh, np.argsort(yh, axis=1))]
        return topk

    def topk_probability(self, x_logits, k):
        yh = x_logits.detach().cpu().numpy()
        yh_prob = F.softmax(x_logits, dim=1).detach().cpu().numpy()
        topk = [[(self.index_to_class(j), c[j], p[j]) for j in i[-k:][::-1]] for (c,p,i) in zip(yh, yh_prob, np.argsort(yh, axis=1))]
        return topk
        
    # ---- <LIGHTNING>
    def forward(self, x):
        return self.net(x)  # lightning handles device

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
        return optimizer

    def training_step(self, batch, batch_nb, logging=True, valstep=False):
        (x,Y) = batch  
        y_hat = self.forward(x)
        y_hat_softmax = F.softmax(y_hat, dim=1)

        (loss, n_valid, y_validation) = (0, 0, [])
        #C = torch.tensor([self._index_to_training_weight[v] for (k,v) in sorted(self._class_to_index.items(), key=lambda x: x[1])], device=y_hat.device)  # inverse class frequency        
        C = torch.tensor([v for (k,v) in sorted(self._index_to_training_weight.items(), key=lambda x: x[0])], device=y_hat.device)  # inverse class frequency        
        for (yh, yhs, labelstr) in zip(y_hat, y_hat_softmax, Y):
            labels = json.loads(labelstr)
            if labels is None:
                continue  # skip me
            lbllist = [l for lbl in labels for l in set(lbl)]  # list of multi-labels within clip (unpack from JSON to use default collate_fn)
            lbllist = [l for l in lbllist if l in self._class_to_index]  # only allowable classes
            lbl_frequency = vipy.util.countby(lbllist, lambda x: x)  # frequency within clip
            lbl_weight = {k:v/float(len(lbllist)) for (k,v) in lbl_frequency.items()}  # multi-label likelihood within clip, sums to one            
            for (y,w) in lbl_weight.items():
                if valstep:
                    # Pick all labels normalized (https://papers.nips.cc/paper/2019/file/da647c549dde572c2c5edc4f5bef039c-Paper.pdf)
                    loss += float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                elif self._mlfl and not self._bgbce:
                    # Pick all labels normalized, with multi-label focal loss
                    loss += torch.min(torch.tensor(1.0, device=y_hat.device), ((w-yhs[self._class_to_index[y]])/w)**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                elif self._mlfl and self._bgbce:
                    if self._index_to_class[self._class_to_index[y]].endswith('_moves'):  # background class (FIXME)
                        loss += float(self._index_to_training_weight[self._class_to_index[y]])*float(w)*F.binary_cross_entropy_with_logits(torch.unsqueeze(yh, dim=0), torch.zeros_like(torch.unsqueeze(yh, dim=0))) # background regularization (all zeros)
                    else:
                        loss += torch.min(torch.tensor(1.0, device=y_hat.device), ((w-yhs[self._class_to_index[y]])/w)**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                elif self._mlbl:
                    # Pick all labels normalized with multi-label background loss
                    #j_bg_person = self._class_to_index['person'] if 'person' in self._class_to_index else self._class_to_index['person_walks']  # FIXME: does not generalize
                    #j_bg_vehicle = self._class_to_index['vehicle'] if 'vehicle' in self._class_to_index else self._class_to_index['car_moves']  # FIXME: does not generalize
                    #j = j_bg_person if (y.startswith('person') or y.startswith('hand')) else j_bg_vehicle
                    #loss += ((1-torch.sqrt(yhs[j]*yhs[self._class_to_index[y]]))**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                    raise NotImplementedError('Deprecated')
                else:
                    # Pick all labels normalized: https://papers.nips.cc/paper/2019/file/da647c549dde572c2c5edc4f5bef039c-Paper.pdf
                    loss += float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)

            n_valid += 1
            if len(lbllist) > 0:
                y_validation.append( (yh, self._class_to_index[max(lbllist, key=lbllist.count)]) )  # most frequent label in clip
        loss = loss / float(max(1, n_valid))  # batch reduction: mean

        if logging:
            self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss if not valstep else {'loss': loss, 'logit': torch.stack([x for (x,c) in y_validation]), 'classindex': [c for (x,c) in y_validation]}

    def validation_step(self, batch, batch_nb):
        loss = self.training_step(batch, batch_nb, logging=False, valstep=True)['loss']
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        return {'val_loss': loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        self.log('val_loss', avg_loss, on_epoch=True, prog_bar=False, logger=True)
        self.log('avg_val_loss', avg_loss, on_epoch=True, prog_bar=True, logger=True)                
        return {'val_loss': avg_loss, 'avg_val_loss': avg_loss}                         
    #---- </LIGHTNING>
    
    @classmethod
    def from_checkpoint(cls, checkpointpath):
        return cls().load_from_checkpoint(checkpointpath)  # lightning
            
    def _load_trained(self, ckptfile):
        self.net = heyvi.model.ResNets_3D_PyTorch.resnet.generate_model(50, n_classes=self.num_classes(), unitnorm=self._unitnorm)
        t = torch.split(self.net.conv1.weight.data, dim=1, split_size_or_sections=1)
        self.net.conv1.weight.data = torch.cat( (*t, t[-1]), dim=1).contiguous()
        self.net.conv1.in_channels = 4  # inflate RGB -> RGBA
        self.load_state_dict(torch.load(ckptfile)['state_dict'])  # FIXME
        self.eval()
        return self
        
    def _load_pretrained(self):

        pthfile = vipy.downloader.downloadif('https://dl.dropboxusercontent.com/s/t3xge6lrfqpklr0/r3d50_kms_200ep.pth',
                                                vipy.util.tocache('r3d50_KMS_200ep.pth'),  # set VIPY_CACHE env 
                                                sha1='39ea626355308d8f75307cab047a8d75862c3261')
        
        net = heyvi.model.ResNets_3D_PyTorch.resnet.generate_model(50, n_classes=1139, unitnorm=self._unitnorm)
        pretrain = torch.load(pthfile, map_location='cpu')
        net.load_state_dict(pretrain['state_dict'])

        # Inflate RGB -> RGBA         
        t = torch.split(net.conv1.weight.data, dim=1, split_size_or_sections=1)
        net.conv1.weight.data = torch.cat( (*t, t[-1]), dim=1).contiguous()
        net.conv1.in_channels = 4

        self.net = net

        return self

    @staticmethod
    def _totensor(v, training, validation, input_size, num_frames, mean, std, noflip=None, show=False, doflip=False):
        assert isinstance(v, vipy.video.Scene), "Invalid input"
        
        try:
            v = v.download() if (not v.isloaded() and not v.hasfilename() and v.hasurl()) else v  # fetch it if necessary, but do not do this during training!        
            if training or validation:
                random.seed()  # force randomness after fork() 
                (ai,aj) = (v.primary_activity().startframe(), v.primary_activity().endframe())  # activity (start,end)
                (ti,tj) = (v.actor().startframe(), v.actor().endframe())  # track (start,end) 
                startframe = random.randint(max(0, ti-(num_frames//2)), max(1, tj-(num_frames//2)))  # random startframe that contains track
                endframe = min((startframe+num_frames), aj)  # endframe truncated to be end of activity
                (startframe, endframe) = (startframe, endframe) if (startframe < endframe) else (max(0, aj-num_frames), aj)  # fallback
                assert endframe - startframe <= num_frames
                vc = v.clone().clip(startframe, endframe)    # may fail for some short clips
                vc = vc.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)   
                vc = vc.fliplr() if (doflip or (random.random() > 0.5)) and (noflip is None or vc.category() not in noflip) else vc
            else:
                vc = v.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)  # TESTING: this may introduce a preview()
                vc = vc.fliplr() if doflip and (noflip is None or vc.category() not in noflip) else vc
                
            if show:
                vc.clone().resize(512,512).show(timestamp=True)
                vc.clone().binarymask().frame(0).rgb().show(figure='binary mask: frame 0')
                
            vc = vc.load(shape=(input_size, input_size, 3)).normalize(mean=mean, std=std, scale=1.0/255.0)  # [0,255] -> [0,1], triggers load() with known shape
            (t,lbl) = vc.torch(startframe=0, length=num_frames, boundary='cyclic', order='cdhw', withlabel=training or validation, nonelabel=True)  # (c=3)x(d=num_frames)x(H=input_size)x(W=input_size), reuses vc._array
            t = torch.cat((t, vc.asfloatmask(fg=0.5, bg=-0.5).torch(startframe=0, length=num_frames, boundary='cyclic', order='cdhw')), dim=0)  # (c=4) x (d=num_frames) x (H=input_size) x (W=input_size), copy
            
        except Exception as e:
            if training or validation:
                #print('ERROR: %s' % (str(v)))
                t = torch.zeros(4, num_frames, input_size, input_size)  # skip me
                lbl = None
            else:
                print('WARNING: discarding tensor for video "%s" with exception "%s"' % (str(v), str(e)))
                t = torch.zeros(4, num_frames, input_size, input_size)  # skip me (should never get here)
            
        if training or validation:
            return (t, json.dumps(lbl))  # json to use default collate_fn
        else:
            return t

    def totensor(self, v=None, training=False, validation=False, show=False, doflip=False):
        """Return captured lambda function if v=None, else return tensor"""    
        assert v is None or isinstance(v, vipy.video.Scene), "Invalid input"
        f = (lambda v, num_frames=self._num_frames, input_size=self._input_size, mean=self._mean, std=self._std, training=training, validation=validation, show=show:
             PIP_250k._totensor(v, training, validation, input_size, num_frames, mean, std, noflip=['car_turns_left', 'car_turns_right'], show=show, doflip=doflip))
        return f(v) if v is not None else f

Ancestors

  • pytorch_lightning.core.lightning.LightningModule
  • abc.ABC
  • pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin
  • pytorch_lightning.core.grads.GradInformation
  • pytorch_lightning.core.saving.ModelIO
  • pytorch_lightning.core.hooks.ModelHooks
  • pytorch_lightning.core.hooks.DataHooks
  • pytorch_lightning.core.hooks.CheckpointHooks
  • torch.nn.modules.module.Module
  • ActivityRecognition

Subclasses

Class variables

var dump_patches : bool
var training : bool

Static methods

def from_checkpoint(checkpointpath)
Expand source code Browse git
@classmethod
def from_checkpoint(cls, checkpointpath):
    return cls().load_from_checkpoint(checkpointpath)  # lightning

Methods

def category(self, x)
Expand source code Browse git
def category(self, x):
    yh = self.forward(x if x.ndim == 5 else torch.unsqueeze(x, 0))
    return [self.index_to_class(int(k)) for (c,k) in zip(*torch.max(yh, dim=1))]
def category_confidence(self, x)
Expand source code Browse git
def category_confidence(self, x):
    yh = self.forward(x if x.ndim == 5 else torch.unsqueeze(x, 0))
    return [(self.index_to_class(int(k)), float(c)) for (c,k) in zip(*torch.max(yh, dim=1))]
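
A minimal usage sketch (illustrative), assuming v is a vipy.video.Scene actor clip; the confidence reported by category_confidence() is the maximum raw logit, not a probability:

# Hypothetical usage sketch (illustrative)
net = PIP_250k(pretrained=True)
t = net.totensor(v)                  # (4, 16, 112, 112) RGB plus binary-mask tensor for a single clip
labels = net.category(t)             # top-1 class label per clip (singleton batch here)
scored = net.category_confidence(t)  # [(class, max_logit)] per clip
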
def configure_optimizers(self)

Choose what optimizers and learning-rate schedulers to use in your optimization. Normally you'd need one. But in the case of GANs or similar you might have multiple.

Return

Any of these 6 options.

  • Single optimizer.
  • List or Tuple - List of optimizers.
  • Two lists - The first list has multiple optimizers, the second a list of LR schedulers (or lr_dict).
  • Dictionary, with an 'optimizer' key, and (optionally) a 'lr_scheduler' key whose value is a single LR scheduler or lr_dict.
  • Tuple of dictionaries as described, with an optional 'frequency' key.
  • None - Fit will run without any optimizer.

Note

The 'frequency' value is an int corresponding to the number of sequential batches optimized with the specific optimizer. It should be given to none or to all of the optimizers. There is a difference between passing multiple optimizers in a list, and passing multiple optimizers in dictionaries with a frequency of 1: In the former case, all optimizers will operate on the given batch in each optimization step. In the latter, only one optimizer will operate on the given batch at every step.

The lr_dict is a dictionary which contains the scheduler and its associated configuration. The default configuration is shown below.

.. code-block:: python

{
    'scheduler': lr_scheduler, # The LR scheduler instance (required)
    'interval': 'epoch', # The unit of the scheduler's step size
    'frequency': 1, # The frequency of the scheduler
    'reduce_on_plateau': False, # For ReduceLROnPlateau scheduler
    'monitor': 'val_loss', # Metric for ReduceLROnPlateau to monitor
    'strict': True, # Whether to crash the training if monitor is not found
    'name': None, # Custom name for LearningRateMonitor to use
}

Only the scheduler key is required, the rest will be set to the defaults above.

Examples

.. code-block:: python

# most cases
def configure_optimizers(self):
    opt = Adam(self.parameters(), lr=1e-3)
    return opt

# multiple optimizer case (e.g.: GAN)
def configure_optimizers(self):
    generator_opt = Adam(self.model_gen.parameters(), lr=0.01)
    discriminator_opt = Adam(self.model_disc.parameters(), lr=0.02)
    return generator_opt, discriminator_opt

# example with learning rate schedulers
def configure_optimizers(self):
    generator_opt = Adam(self.model_gen.parameters(), lr=0.01)
    discriminator_opt = Adam(self.model_disc.parameters(), lr=0.02)
    discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10)
    return [generator_opt, discriminator_opt], [discriminator_sched]

# example with step-based learning rate schedulers
def configure_optimizers(self):
    gen_opt = Adam(self.model_gen.parameters(), lr=0.01)
    dis_opt = Adam(self.model_disc.parameters(), lr=0.02)
    gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99),
                 'interval': 'step'}  # called after each training step
    dis_sched = CosineAnnealing(dis_opt, T_max=10) # called every epoch
    return [gen_opt, dis_opt], [gen_sched, dis_sched]

# example with optimizer frequencies
# see training procedure in Improved Training of Wasserstein GANs, Algorithm 1
# <https://arxiv.org/abs/1704.00028>
def configure_optimizers(self):
    gen_opt = Adam(self.model_gen.parameters(), lr=0.01)
    dis_opt = Adam(self.model_disc.parameters(), lr=0.02)
    n_critic = 5
    return (
        {'optimizer': dis_opt, 'frequency': n_critic},
        {'optimizer': gen_opt, 'frequency': 1}
    )

Note

Some things to know:

  • Lightning calls .backward() and .step() on each optimizer and learning rate scheduler as needed.

  • If you use 16-bit precision (precision=16), Lightning will automatically handle the optimizers for you.

  • If you use multiple optimizers, :meth:training_step will have an additional optimizer_idx parameter.

  • If you use LBFGS Lightning handles the closure function automatically for you.

  • If you use multiple optimizers, gradients will be calculated only for the parameters of current optimizer at each training step.

  • If you need to control how often those optimizers step or override the default .step() schedule, override the :meth:optimizer_step hook.

  • If you only want to call a learning rate scheduler every x step or epoch, or want to monitor a custom metric, you can specify these in a lr_dict:

.. code-block:: python

  {
      'scheduler': lr_scheduler,
      'interval': 'step',  # or 'epoch'
      'monitor': 'val_f1',
      'frequency': x,
  }
Expand source code Browse git
def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
    return optimizer
def forward(self, x) ‑> Callable[..., Any]

Same as :meth:torch.nn.Module.forward(), however in Lightning you want this to define the operations you want to use for prediction (i.e.: on a server or as a feature extractor).

Normally you'd call self() from your :meth:training_step method. This makes it easy to write a complex system for training with the outputs you'd want in a prediction setting.

You may also find the :func:~pytorch_lightning.core.decorators.auto_move_data decorator useful when using the module outside Lightning in a production setting.

Args

*args
Whatever you decide to pass into the forward method.
**kwargs
Keyword arguments are also possible.

Return

Predicted output

Examples

.. code-block:: python

# example if we were using this model as a feature extractor
def forward(self, x):
    feature_maps = self.convnet(x)
    return feature_maps

def training_step(self, batch, batch_idx):
    x, y = batch
    feature_maps = self(x)
    logits = self.classifier(feature_maps)

    # ...
    return loss

# splitting it this way allows the model to be used as a feature extractor
model = MyModelAbove()

inputs = server.get_request()
results = model(inputs)
server.write_results(results)

# -------------
# This is in stark contrast to torch.nn.Module where normally you would have this:
def forward(self, batch):
    x, y = batch
    feature_maps = self.convnet(x)
    logits = self.classifier(feature_maps)
    return logits
Expand source code Browse git
def forward(self, x):
    return self.net(x)  # lightning handles device
def topk(self, x_logits, k)
Expand source code Browse git
def topk(self, x_logits, k):
    yh = x_logits.detach().cpu().numpy()
    topk = [[(self.index_to_class(j), s[j]) for j in i[-k:][::-1]] for (s,i) in zip(yh, np.argsort(yh, axis=1))]
    return topk
def topk_probability(self, x_logits, k)
Expand source code Browse git
def topk_probability(self, x_logits, k):
    yh = x_logits.detach().cpu().numpy()
    yh_prob = F.softmax(x_logits, dim=1).detach().cpu().numpy()
    topk = [[(self.index_to_class(j), c[j], p[j]) for j in i[-k:][::-1]] for (c,p,i) in zip(yh, yh_prob, np.argsort(yh, axis=1))]
    return topk
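
A minimal usage sketch (illustrative): both methods operate on raw logits and return the k best classes per clip, best first; topk_probability() additionally reports the softmax probability:

# Hypothetical usage sketch (illustrative)
logits = net.forward(x)                  # (B, num_classes) raw logits
top3 = net.topk(logits, 3)               # [[(class, logit), ...], ...] per clip, best first
top3p = net.topk_probability(logits, 3)  # [[(class, logit, probability), ...], ...] per clip
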
def totensor(self, v=None, training=False, validation=False, show=False, doflip=False)

Return captured lambda function if v=None, else return tensor

Expand source code Browse git
def totensor(self, v=None, training=False, validation=False, show=False, doflip=False):
    """Return captured lambda function if v=None, else return tensor"""    
    assert v is None or isinstance(v, vipy.video.Scene), "Invalid input"
    f = (lambda v, num_frames=self._num_frames, input_size=self._input_size, mean=self._mean, std=self._std, training=training, validation=validation, show=show:
         PIP_250k._totensor(v, training, validation, input_size, num_frames, mean, std, noflip=['car_turns_left', 'car_turns_right'], show=show, doflip=doflip))
    return f(v) if v is not None else f
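
A minimal usage sketch (illustrative): with v=None the method captures a preprocessing lambda (e.g. to map over a torch Dataset), otherwise it converts a single vipy.video.Scene directly:

# Hypothetical usage sketch (illustrative)
f = net.totensor(training=True)  # captured lambda for dataset preprocessing
(t, lbl) = f(v)                  # t: (4, 16, 112, 112) tensor, lbl: JSON-encoded multilabel string
t = net.totensor(v)              # inference-mode conversion of a single clip (no label)
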
def training_step(self, batch, batch_nb, logging=True, valstep=False)

Here you compute and return the training loss and some additional metrics for e.g. the progress bar or logger.

Args

batch (:class:~torch.Tensor | (:class:~torch.Tensor, …) | [:class:~torch.Tensor, …]):
The output of your :class:~torch.utils.data.DataLoader. A tensor, tuple or list.
batch_idx : int
Integer displaying index of this batch
optimizer_idx : int
When using multiple optimizers, this argument will also be present.

hiddens(:class:~torch.Tensor): Passed in if :paramref:~pytorch_lightning.trainer.trainer.Trainer.truncated_bptt_steps > 0.

Return

Any of.

  • :class:~torch.Tensor - The loss tensor
  • dict - A dictionary. Can include any keys, but must include the key 'loss'
  • None - Training will skip to the next batch

In this step you'd normally do the forward pass and calculate the loss for a batch. You can also do fancier things like multiple forward passes or something model specific.

Example::

def training_step(self, batch, batch_idx):
    x, y, z = batch
    out = self.encoder(x)
    loss = self.loss(out, x)
    return loss

If you define multiple optimizers, this step will be called with an additional optimizer_idx parameter.

.. code-block:: python

# Multiple optimizers (e.g.: GANs)
def training_step(self, batch, batch_idx, optimizer_idx):
    if optimizer_idx == 0:
        # do training_step with encoder
    if optimizer_idx == 1:
        # do training_step with decoder

If you add truncated back propagation through time you will also get an additional argument with the hidden states of the previous step.

.. code-block:: python

# Truncated back-propagation through time
def training_step(self, batch, batch_idx, hiddens):
    # hiddens are the hidden states from the previous truncated backprop step
    ...
    out, hiddens = self.lstm(data, hiddens)
    ...
    return {'loss': loss, 'hiddens': hiddens}

Note

The loss value shown in the progress bar is smoothed (averaged) over the last values, so it differs from the actual loss returned in train/validation step.

Expand source code Browse git
def training_step(self, batch, batch_nb, logging=True, valstep=False):
    (x,Y) = batch  
    y_hat = self.forward(x)
    y_hat_softmax = F.softmax(y_hat, dim=1)

    (loss, n_valid, y_validation) = (0, 0, [])
    #C = torch.tensor([self._index_to_training_weight[v] for (k,v) in sorted(self._class_to_index.items(), key=lambda x: x[1])], device=y_hat.device)  # inverse class frequency        
    C = torch.tensor([v for (k,v) in sorted(self._index_to_training_weight.items(), key=lambda x: x[0])], device=y_hat.device)  # inverse class frequency        
    for (yh, yhs, labelstr) in zip(y_hat, y_hat_softmax, Y):
        labels = json.loads(labelstr)
        if labels is None:
            continue  # skip me
        lbllist = [l for lbl in labels for l in set(lbl)]  # list of multi-labels within clip (unpack from JSON to use default collate_fn)
        lbllist = [l for l in lbllist if l in self._class_to_index]  # only allowable classes
        lbl_frequency = vipy.util.countby(lbllist, lambda x: x)  # frequency within clip
        lbl_weight = {k:v/float(len(lbllist)) for (k,v) in lbl_frequency.items()}  # multi-label likelihood within clip, sums to one            
        for (y,w) in lbl_weight.items():
            if valstep:
                # Pick all labels normalized (https://papers.nips.cc/paper/2019/file/da647c549dde572c2c5edc4f5bef039c-Paper.pdf)
                loss += float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
            elif self._mlfl and not self._bgbce:
                # Pick all labels normalized, with multi-label focal loss
                loss += torch.min(torch.tensor(1.0, device=y_hat.device), ((w-yhs[self._class_to_index[y]])/w)**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
            elif self._mlfl and self._bgbce:
                if self._index_to_class[self._class_to_index[y]].endswith('_moves'):  # background class (FIXME)
                    loss += float(self._index_to_training_weight[self._class_to_index[y]])*float(w)*F.binary_cross_entropy_with_logits(torch.unsqueeze(yh, dim=0), torch.zeros_like(torch.unsqueeze(yh, dim=0))) # background regularization (all zeros)
                else:
                    loss += torch.min(torch.tensor(1.0, device=y_hat.device), ((w-yhs[self._class_to_index[y]])/w)**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
            elif self._mlbl:
                # Pick all labels normalized with multi-label background loss
                #j_bg_person = self._class_to_index['person'] if 'person' in self._class_to_index else self._class_to_index['person_walks']  # FIXME: does not generalize
                #j_bg_vehicle = self._class_to_index['vehicle'] if 'vehicle' in self._class_to_index else self._class_to_index['car_moves']  # FIXME: does not generalize
                #j = j_bg_person if (y.startswith('person') or y.startswith('hand')) else j_bg_vehicle
                #loss += ((1-torch.sqrt(yhs[j]*yhs[self._class_to_index[y]]))**2)*float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)
                raise NotImplementedError('Deprecated')
            else:
                # Pick all labels normalized: https://papers.nips.cc/paper/2019/file/da647c549dde572c2c5edc4f5bef039c-Paper.pdf
                loss += float(w)*F.cross_entropy(torch.unsqueeze(yh, dim=0), torch.tensor([self._class_to_index[y]], device=y_hat.device), weight=C)

        n_valid += 1
        if len(lbllist) > 0:
            y_validation.append( (yh, self._class_to_index[max(lbllist, key=lbllist.count)]) )  # most frequent label in clip
    loss = loss / float(max(1, n_valid))  # batch reduction: mean

    if logging:
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
    return loss if not valstep else {'loss': loss, 'logit': torch.stack([x for (x,c) in y_validation]), 'classindex': [c for (x,c) in y_validation]}
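
The loss above combines an inverse-class-frequency weight C, the within-clip label likelihood w, and (when mlfl is enabled) a focal modulation that down-weights labels whose softmax score already approaches w. The following is a minimal standalone sketch of that per-label term, for illustration only; the function name and toy tensors are assumptions and not part of heyvi.

import torch
import torch.nn.functional as F

def multilabel_focal_term(yh, class_index, w, class_weights):
    """yh: (num_classes,) logits for one clip; w: within-clip likelihood of this label in (0,1]."""
    p = F.softmax(yh, dim=0)[class_index]               # softmax score of this label
    focal = torch.clamp(((w - p) / w) ** 2, max=1.0)    # focal modulation, capped at 1 as in the source above
    ce = F.cross_entropy(yh.unsqueeze(0),                # class-frequency weighted cross-entropy
                         torch.tensor([class_index]),
                         weight=class_weights)
    return focal * w * ce

# Toy usage: 40 classes, a label that accounts for half of the clip's frame labels
loss = multilabel_focal_term(torch.randn(40), class_index=7, w=0.5, class_weights=torch.ones(40))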
def validation_epoch_end(self, outputs)

Called at the end of the validation epoch with the outputs of all validation steps.

.. code-block:: python

# the pseudocode for these calls
val_outs = []
for val_batch in val_data:
    out = validation_step(val_batch)
    val_outs.append(out)
validation_epoch_end(val_outs)

Args

outputs
List of outputs you defined in validation_step(), or if there are multiple dataloaders, a list containing a list of outputs for each dataloader.

Return

None

Note

If you didn't define a validation_step(), this won't be called.

Examples

With a single dataloader:

.. code-block:: python

def validation_epoch_end(self, val_step_outputs):
    for out in val_step_outputs:
        # do something

With multiple dataloaders, outputs will be a list of lists. The outer list contains one entry per dataloader, while the inner list contains the individual outputs of each validation step for that dataloader.

.. code-block:: python

def validation_epoch_end(self, outputs):
    for dataloader_output_result in outputs:
        dataloader_outs = dataloader_output_result.dataloader_i_outputs

    self.log('final_metric', final_value)
Expand source code Browse git
def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    self.log('val_loss', avg_loss, on_epoch=True, prog_bar=False, logger=True)
    self.log('avg_val_loss', avg_loss, on_epoch=True, prog_bar=True, logger=True)                
    return {'val_loss': avg_loss, 'avg_val_loss': avg_loss}                         
def validation_step(self, batch, batch_nb)

Operates on a single batch of data from the validation set. In this step you might generate examples or calculate anything of interest, like accuracy.

.. code-block:: python

# the pseudocode for these calls
val_outs = []
for val_batch in val_data:
    out = validation_step(val_batch)
    val_outs.append(out)
validation_epoch_end(val_outs)

Args

batch (torch.Tensor | (torch.Tensor, …) | [torch.Tensor, …]):
The output of your torch.utils.data.DataLoader. A tensor, tuple or list.
batch_idx : int
The index of this batch
dataloader_idx : int
The index of the dataloader that produced this batch (only if multiple val datasets used)

Return

Any of:

  • Any object or value
  • None - Validation will skip to the next batch

.. code-block:: python

# pseudocode of order
out = validation_step()
if defined('validation_step_end'):
    out = validation_step_end(out)
out = validation_epoch_end(out)

.. code-block:: python

# if you have one val dataloader:
def validation_step(self, batch, batch_idx)

# if you have multiple val dataloaders:
def validation_step(self, batch, batch_idx, dataloader_idx)

Examples

.. code-block:: python

# CASE 1: A single validation dataset
def validation_step(self, batch, batch_idx):
    x, y = batch

    # implement your own
    out = self(x)
    loss = self.loss(out, y)

    # log 6 example images
    # or generated text... or whatever
    sample_imgs = x[:6]
    grid = torchvision.utils.make_grid(sample_imgs)
    self.logger.experiment.add_image('example_images', grid, 0)

    # calculate acc
    labels_hat = torch.argmax(out, dim=1)
    val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)

    # log the outputs!
    self.log_dict({'val_loss': loss, 'val_acc': val_acc})

If you pass in multiple val datasets, validation_step will have an additional argument.

.. code-block:: python

# CASE 2: multiple validation datasets
def validation_step(self, batch, batch_idx, dataloader_idx):
    # dataloader_idx tells you which dataset this is.

Note

If you don't need to validate you don't need to implement this method.

Note

When validation_step() is called, the model has been put in eval mode and PyTorch gradients have been disabled. At the end of validation, the model goes back to training mode and gradients are enabled.

Expand source code Browse git
def validation_step(self, batch, batch_nb):
    loss = self.training_step(batch, batch_nb, logging=False, valstep=True)['loss']
    self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
    return {'val_loss': loss}
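
The three hooks above (training_step, validation_step, validation_epoch_end) are invoked by a pytorch_lightning.Trainer rather than called directly. A hedged wiring sketch follows; the dataset objects are hypothetical placeholders for any torch Dataset yielding the (tensor, JSON label) pairs produced by totensor(training=True).

import pytorch_lightning as pl
from torch.utils.data import DataLoader

model = PIP_250k(pretrained=True)                                          # or the PIP_370k subclass below
trainloader = DataLoader(my_train_dataset, batch_size=32, num_workers=4)   # hypothetical training Dataset
valloader = DataLoader(my_val_dataset, batch_size=32, num_workers=4)       # hypothetical validation Dataset

trainer = pl.Trainer(max_epochs=10)           # the trainer drives training_step / validation_step
trainer.fit(model, trainloader, valloader)    # 'val_loss' is logged per step and averaged each epoch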
class PIP_370k (pretrained=True, deterministic=False, modelfile=None, mlbl=False, mlfl=True, unitnorm=False, bgbce=False)

Activity recognition using people in public - 370k stabilized

Expand source code Browse git
class PIP_370k(PIP_250k, pl.LightningModule, ActivityRecognition):

    def __init__(self, pretrained=True, deterministic=False, modelfile=None, mlbl=False, mlfl=True, unitnorm=False, bgbce=False):
        pl.LightningModule.__init__(self)
        ActivityRecognition.__init__(self)  

        self._input_size = 112
        self._num_frames = 16        
        self._mean = [0.485, 0.456, 0.406]
        self._std = [0.229, 0.224, 0.225]
        self._mlfl = mlfl
        self._mlbl = mlbl
        self._calibrated = False
        self._calibrated_constant = None  # -1.5
        self._bgbce = bgbce
        self._unitnorm = unitnorm

        if deterministic:
            np.random.seed(42)
        
        # Generated using vipy.dataset.Dataset(...).multilabel_inverse_frequency_weight()
        self._class_to_training_weight = {'car_drops_off_person': 0.7858124882763793, 'car_moves': 0.18439798528529147, 'car_picks_up_person': 0.7380666753394193, 'car_reverses': 0.5753369570213479, 'car_starts': 0.47486292483745757, 'car_stops': 0.44244800737774037, 'car_turns_left': 0.7697107319736983, 'car_turns_right': 0.5412936796835607, 'hand_interacts_with_person': 0.2794031245117859, 'person_abandons_package': 1.0789960714517162, 'person_carries_heavy_object': 0.5032333530901552, 'person_closes_car_door': 0.46460114438995603, 'person_closes_car_trunk': 0.6824201392305784, 'person_closes_facility_door': 0.38990434394080076, 'person_embraces_person': 0.6457437695527715, 'person_enters_car': 0.6934926810021877, 'person_enters_scene_through_structure': 0.2586965095740063, 'person_exits_car': 0.6766386632434479, 'person_exits_scene_through_structure': 0.33054895987676847, 'person_interacts_with_laptop': 0.6720176496986436, 'person_loads_car': 0.6880555743488312, 'person_opens_car_door': 0.4069868136393968, 'person_opens_car_trunk': 0.6911966903970317, 'person_opens_facility_door': 0.3018924474724252, 'person_picks_up_object': 0.4298381074082487, 'person_purchases_from_cashier': 5.479834409621331, 'person_purchases_from_machine': 5.31528236654537, 'person_puts_down_object': 0.2804690906037155, 'person_reads_document': 0.5476186269530937, 'person_rides_bicycle': 1.6090962879286763, 'person_sits_down': 0.4750148103149501, 'person_stands_up': 0.5022364750834624, 'person_steals_object': 0.910991409921711, 'person_talks_on_phone': 0.15771902851484076, 'person_talks_to_person': 0.21362675034201736, 'person_texts_on_phone': 0.3328378404741194, 'person_transfers_object_to_car': 2.964890512157848, 'person_transfers_object_to_person': 0.6481292773603928, 'person_unloads_car': 0.515379337544623, 'person_walks': 6.341278284010202}
        self._class_to_weight = self._class_to_training_weight  # backwards compatibility

        # Generated using vipy.dataset.Dataset(...).class_to_index()
        self._class_to_index = {'car_drops_off_person': 0, 'car_moves': 1, 'car_picks_up_person': 2, 'car_reverses': 3, 'car_starts': 4, 'car_stops': 5, 'car_turns_left': 6, 'car_turns_right': 7, 'hand_interacts_with_person': 8, 'person_abandons_package': 9, 'person_carries_heavy_object': 10, 'person_closes_car_door': 11, 'person_closes_car_trunk': 12, 'person_closes_facility_door': 13, 'person_embraces_person': 14, 'person_enters_car': 15, 'person_enters_scene_through_structure': 16, 'person_exits_car': 17, 'person_exits_scene_through_structure': 18, 'person_interacts_with_laptop': 19, 'person_loads_car': 20, 'person_opens_car_door': 21, 'person_opens_car_trunk': 22, 'person_opens_facility_door': 23, 'person_picks_up_object': 24, 'person_purchases_from_cashier': 25, 'person_purchases_from_machine': 26, 'person_puts_down_object': 27, 'person_reads_document': 28, 'person_rides_bicycle': 29, 'person_sits_down': 30, 'person_stands_up': 31, 'person_steals_object': 32, 'person_talks_on_phone': 33, 'person_talks_to_person': 34, 'person_texts_on_phone': 35, 'person_transfers_object_to_car': 36, 'person_transfers_object_to_person': 37, 'person_unloads_car': 38, 'person_walks': 39}
        
        self._index_to_training_weight = {self._class_to_index[k]:v for (k,v) in self._class_to_weight.items()}

        self._verb_to_noun = {k:set(['car','vehicle','motorcycle','bus','truck']) if (k.startswith('car') or k.startswith('motorcycle') or k.startswith('vehicle')) else set(['person']) for k in self.classlist()}        
        self._class_to_shortlabel = heyvi.label.pip_to_shortlabel
        self._class_to_shortlabel.update( vipy.data.meva.d_category_to_shortlabel )

        if pretrained:
            self._load_pretrained()
            self.net.fc = nn.Linear(self.net.fc.in_features, self.num_classes())
        elif modelfile is not None:
            self._load_trained(modelfile)

    def topk(self, x, k=None):
        """Return the top-k classes for a 3 second activity proposal along with framewise ground truth"""        
        yh = self.forward(x if x.ndim == 5 else x.unsqueeze(0)).detach().cpu().numpy()
        k = k if k is not None else self.num_classes()
        return [ [self.index_to_class(int(j)) for j in i[-k:][::-1]] for (s,i) in zip(yh, np.argsort(yh, axis=1))]
            
    @staticmethod
    def _totensor(v, training, validation, input_size, num_frames, mean, std, noflip=None, show=False, doflip=False, stride_jitter=3, asjson=False, classname='heyvi.recognition.PIP_370k'):
        assert isinstance(v, vipy.video.Scene), "Invalid input"
        
        try:
            v = v.download() if (not v.isloaded() and v.hasurl() and not v.hasfilename()) else v  # fetch it if necessary, but do not do this during training!        
            if training or validation:
                vc = v.clone()  # these frames are shared across all tracks
                random.seed()  # force randomness after fork() 
                (clipstart, clipend) = vc.cliprange()  # clip (start, end) relative to video 
                (clipstart, clipend) = (clipstart if clipstart is not None else 0,   
                                        clipend if clipend is not None else int(np.floor(v.duration_in_frames_of_videofile() * (vc.framerate() / v.framerate_of_videofile()))))  # (yuck)
                # WARNINGS: 
                # - There exist videos with tracks outside the image rectangle due to the padding in stabilization.  
                # - There exist MEVA videos that have no tracks at the beginning and end of the padded clip since the annotations only exist for the activity
                # - There exist MEVA videos with activities that are longer than the tracks, if so, keep the interval of the activity that contains the track
                # - There exist MEVA videos with multiple objects, need to include only primary actor
                
                # - turning activities may be outside the frame (filter these)
                # - turning activities may turn into the stabilized black area.  Is this avoidable?
                # - all of the training activities should be centered on the activity.  See if not.
                
                if (clipend - clipstart) > (num_frames + stride_jitter):
                    a = vc.primary_activity().clone().padto(num_frames/float(vc.framerate()))  # for context only, may be past end of clip now!
                    (ai, aj) = (a.startframe(), a.endframe())  # activity (start,end) relative to (clipstart, clipend)
                    (ai, aj) = (max(ai, vc.actor().startframe()), min(aj, vc.actor().endframe()))  # clip activity to when actor is present
                    startframe = random.randint(ai, aj-num_frames-1) if aj-num_frames-1 > ai else ai
                    startframe = max(0, startframe + random.randint(-stride_jitter, stride_jitter))   # +/- 3 frames jitter for activity stride
                    endframe = min(clipend-clipstart-1, startframe + num_frames)  # new end cannot be past duration of clip
                    if (endframe > startframe) and ((endframe - startframe) < (clipend - clipstart)):
                        vc = vc.clip(startframe, endframe)
                    else: 
                        raise ValueError('invalid clip for "%s"' % str(v))
                vc = vc.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)   
                vc = vc.fliplr() if (doflip or (random.random() > 0.5)) and (noflip is None or vc.category() not in noflip) else vc
            else:
                vc = v.clone(shallow=True)  # shallow clone, trackcrop() will result in new array and v will be preserved
                vc = vc.trackcrop(dilate=1.2, maxsquare=True)  # may be None if clip contains no track
                vc = vc.resize(input_size, input_size)  # This may introduce a preview()
                vc = vc.fliplr() if doflip and (noflip is None or vc.category() not in noflip) else vc
                
            if show:
                vc.clone().resize(512,512).show(timestamp=True)
                vc.clone().binarymask().frame(0).gain(255).rgb().show(figure='binary mask: frame 0')
                
            vc = vc.load(shape=(input_size, input_size, 3)).normalize(mean=mean, std=std, scale=1.0/255.0)  # [0,255] -> [0,1], triggers load() with known shape
            (t,lbl) = vc.torch(startframe=0, length=num_frames, boundary='repeat', order='cdhw', withlabel=training or validation, nonelabel=True)  # (c=3)x(d=num_frames)x(H=input_size)x(W=input_size), reuses vc._array
            t = torch.cat((t, vc.asfloatmask(fg=0.5, bg=-0.5).torch(startframe=0, length=num_frames, boundary='repeat', order='cdhw')), dim=0)  # (c=4) x (d=num_frames) x (H=input_size) x (W=input_size), copy

        except Exception as e:
            if training or validation:
                print('[heyvi.recognition.%s._totensor][SKIPPING]: video="%s", exception="%s"' % (classname, str(vc), str(e)))
                (t, lbl) = (torch.zeros(4, num_frames, input_size, input_size), None)  # must always return conformal tensor (label=None means it will be ignored)
            else:
                print('[heyvi.recognition.%s._totensor][ERROR]: discarding tensor for video "%s" with exception "%s"' % (classname, str(vc), str(e)))
                #t = torch.zeros(4, num_frames, input_size, input_size)  # skip me (should never get here)
                raise

        if training or validation:
            return (t, json.dumps(lbl) if not asjson else lbl)  # json to use default torch collate_fn
        else:
            return t

    def totensor(self, v=None, training=False, validation=False, show=False, doflip=False, asjson=False):
        """Return captured lambda function if v=None, else return tensor"""
        assert v is None or isinstance(v, vipy.video.Scene), "Invalid input"
        f = (lambda v, num_frames=self._num_frames, input_size=self._input_size, mean=self._mean, std=self._std, training=training, validation=validation, show=show, classname=self.__class__.__name__:
             PIP_370k._totensor(v, training, validation, input_size, num_frames, mean, std, noflip=['car_turns_left', 'car_turns_right', 'vehicle_turns_left', 'vehicle_turns_right', 'motorcycle_turns_left', 'motorcycle_turns_right'], show=show, doflip=doflip, asjson=asjson, classname=classname))
        return f(v) if v is not None else f
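
A hedged single-clip inference sketch using the methods documented for this class. The model checkpoint and video file are hypothetical placeholders, and the clip is assumed to be an actor-stabilized activity proposal with a track, as expected by _totensor().

import vipy

v = vipy.video.Scene(filename='/path/to/stabilized_clip.mp4')           # hypothetical clip with an actor track and primary activity
model = PIP_370k(pretrained=False, modelfile='/path/to/model.ckpt').eval()   # hypothetical trained checkpoint

t = model.totensor(v)         # (4, 16, 112, 112) tensor: normalized RGB plus a foreground mask channel
labels = model.topk(t, k=5)   # top-5 class names for this activity proposal
print(labels[0])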

Ancestors

  • PIP_250k
  • pytorch_lightning.core.lightning.LightningModule
  • abc.ABC
  • pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin
  • pytorch_lightning.core.grads.GradInformation
  • pytorch_lightning.core.saving.ModelIO
  • pytorch_lightning.core.hooks.ModelHooks
  • pytorch_lightning.core.hooks.DataHooks
  • pytorch_lightning.core.hooks.CheckpointHooks
  • torch.nn.modules.module.Module
  • ActivityRecognition

Class variables

var dump_patches : bool
var training : bool

Methods

def topk(self, x, k=None)

Return the top-k classes for a 3 second activity proposal along with framewise ground truth

Expand source code Browse git
def topk(self, x, k=None):
    """Return the top-k classes for a 3 second activity proposal along with framewise ground truth"""        
    yh = self.forward(x if x.ndim == 5 else x.unsqueeze(0)).detach().cpu().numpy()
    k = k if k is not None else self.num_classes()
    return [ [self.index_to_class(int(j)) for j in i[-k:][::-1]] for (s,i) in zip(yh, np.argsort(yh, axis=1))]
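
The i[-k:][::-1] indexing relies on np.argsort sorting in ascending order, so the last k indices, reversed, are the classes with the largest logits, best first. A tiny standalone illustration with toy logits (not heyvi output):

import numpy as np

yh = np.array([[0.1, 2.3, -0.7, 1.5]])       # logits for one proposal over 4 toy classes
k = 2
for (s, i) in zip(yh, np.argsort(yh, axis=1)):
    print([int(j) for j in i[-k:][::-1]])    # -> [1, 3]: indices of the two largest logits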

Inherited members