# detect_video.py

from __future__ import division
import time
import torch 
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import cv2 
from util import *
from DNModel import net as Darknet
from img_process import inp_to_image, custom_resize
import pandas as pd
import random 
import pickle as pkl
import argparse
import test
import serial
#from serial import Serial

def prepare_input(img, inp_dim):
    """
    Convert an image array into a network input tensor.

    The image is resized to (inp_dim, inp_dim) with custom_resize, its
    channel axis is reversed, the axes are reordered from (H, W, C) to
    (C, H, W), and the values are scaled by 1/255 into a float tensor with
    a leading batch axis of size 1.

    Returns a tuple (tensor, original image, (width, height)), where width
    and height are taken from the original image's shape.
    """
    orig_im = img
    height, width = orig_im.shape[0], orig_im.shape[1]
    dim = (width, height)

    resized = custom_resize(orig_im, (inp_dim, inp_dim))

    # Reverse the channel order (BGR -> RGB for frames read through OpenCV),
    # then move channels first. The copy makes the array contiguous with
    # positive strides, which torch.from_numpy requires.
    channels_reversed = resized[:, :, ::-1]
    channels_first = channels_reversed.transpose((2, 0, 1)).copy()

    tensor = torch.from_numpy(channels_first).float()
    tensor = tensor.div(255.0).unsqueeze(0)
    return tensor, orig_im, dim

def write(x, img):
    """
    Draw one detection on the frame and signal the serial device if needed.

    A horizontal threshold line is drawn across ``img`` at y = 550. When the
    detection's class label is 'elephant', its bounding box and label are
    drawn as well, and if the y coordinate of the box's second corner is
    greater than the threshold, the byte '1' is written to the module-level
    serial port ``ser``.

    x   : one detection row. Elements 1-4 are read as the box corners
          (x1, y1, x2, y2) and the last element as the class index into the
          module-level ``classes`` list.
    img : image array of shape (height, width, channels); drawn on in place.

    Returns None.
    """
    width = img.shape[1]

    # Convert the coordinates to plain Python ints: recent OpenCV bindings
    # refuse points whose items are torch tensors. int() truncates toward
    # zero exactly like the tensor .int() cast did.
    x1, y1, x2, y2 = (int(v) for v in x[1:5])
    c1 = (x1, y1)
    c2 = (x2, y2)

    label = "{0}".format(classes[int(x[-1])])

    th_y = 550
    cv2.line(img, (0, th_y), (width, th_y), (255, 0, 0), 5)

    # Detect the elephant.
    if label == 'elephant':
        cv2.rectangle(img, c1, c2, (0, 255, 255), 2)
        cv2.putText(img, label, (c1[0], c1[1] - 10),
                    cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 0), 2)

        if c2[1] > th_y:
            data = '1'
            ser.write(data.encode())

    # Uncomment the code below to detect a person from the camera.
    #
    # if label == 'person':
    #
    #     cv2.rectangle(img, c1, c2, (0,255,255),2)
    #     cv2.putText(img, label, (c1[0], c1[1]-10), 1,1, (255,255,0),2)
    #
    #     if c2[1] < th_y:
    #         data = '0'
    #         ser.write(data.encode())

def arg_parse():
    """
    Build the command-line parser for the detect module and parse the
    process arguments.

    Returns the argparse namespace with the attributes video, dataset,
    confidence, nms_thresh, cfgfile, weightsfile and reso.
    """
    parser = argparse.ArgumentParser(description='YOLO v3 Video Detection Module')

    # (flag, add_argument keyword arguments), registered in this order so the
    # --help listing keeps its layout.
    option_table = (
        ("--video", {
            "dest": 'video',
            "help": "Video to run detection upon",
            "default": "shreetrim24.avi",
            "type": str,
        }),
        ("--dataset", {
            "dest": "dataset",
            "help": "Dataset on which the network has been trained",
            "default": "pascal",
        }),
        ("--confidence", {
            "dest": "confidence",
            "help": "Object Confidence to filter predictions",
            "default": 0.5,
        }),
        ("--nms_thresh", {
            "dest": "nms_thresh",
            "help": "NMS Threshhold",
            "default": 0.4,
        }),
        ("--cfg", {
            "dest": 'cfgfile',
            "help": "Config file",
            "default": "cfg/yolov3.cfg",
            "type": str,
        }),
        ("--weights", {
            "dest": 'weightsfile',
            "help": "weightsfile",
            "default": "yolov3.weights",
            "type": str,
        }),
        ("--reso", {
            "dest": 'reso',
            "help": "Input resolution of the network. Increase to increase accuracy. Decrease to increase speed",
            "default": "320",
            "type": str,
        }),
    )

    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()


if __name__ == '__main__':
    # Script entry point: load the network, then run detection frame by frame
    # on the video source, drawing results and signalling over the serial
    # port through write(). Press 'x' in the display window to stop.
    #
    # NOTE: ser, classes and colors must stay module-level names; write()
    # reads ser and classes as globals.
    args = arg_parse()
    ser = serial.Serial('/dev/ttyUSB0', 9600)
    cap = None
    try:
        confidence = float(args.confidence)
        nms_thesh = float(args.nms_thresh)
        CUDA = torch.cuda.is_available()

        num_classes = 80

        print("Loading network")
        model = Darknet(args.cfgfile)
        model.load_weights(args.weightsfile)
        print("Network loaded")
        classes = load_classes('data/coco.names')
        # NOTE(review): pickle executes arbitrary code on load; only use a
        # palette file from a trusted source.
        with open("pallete", "rb") as palette_file:
            colors = pkl.load(palette_file)
        model.DNInfo["height"] = args.reso
        inp_dim = int(model.DNInfo["height"])

        if CUDA:
            model.cuda()

        model.eval()

        # Pass 0 instead of the file name to capture from the camera:
        #     cap = cv2.VideoCapture(0)
        cap = cv2.VideoCapture(args.video)

        # Explicit check rather than assert, which is stripped under -O.
        if not cap.isOpened():
            raise IOError('Cannot capture source')

        # A single window for every frame, with or without detections.
        window_name = "Object Detect"

        while cap.isOpened():
            ret, frame = cap.read()
            # Check ret before touching the frame: at end of stream (or on a
            # read failure) frame is None and cv2.resize would raise.
            if not ret:
                break
            frame = cv2.resize(frame, (1240, 950))

            img, orig_im, dim = prepare_input(frame, inp_dim)

            # (width, height, width, height) as a 1x4 tensor.
            im_dim = torch.FloatTensor(dim).repeat(1, 2)

            if CUDA:
                im_dim = im_dim.cuda()
                img = img.cuda()

            with torch.no_grad():
                output = model(img, CUDA)
            output = write_results(output, confidence, num_classes,
                                   nms=True, nms_conf=nms_thesh)

            # write_results yields an int instead of a tensor when there is
            # nothing to draw; in that case the frame is shown unchanged.
            if not isinstance(output, int):
                im_dim = im_dim.repeat(output.size(0), 1)
                scaling_factor = torch.min(inp_dim / im_dim, 1)[0].view(-1, 1)

                # Map the box corners from network-input coordinates back to
                # the frame: remove the centring offset, then undo the scale
                # (presumably the letterboxing applied by custom_resize --
                # TODO confirm).
                output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
                output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2
                output[:, 1:5] /= scaling_factor

                # Keep every corner inside the frame.
                for i in range(output.shape[0]):
                    output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, im_dim[i, 0])
                    output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, im_dim[i, 1])

                for detection in output:
                    write(detection, orig_im)

            cv2.imshow(window_name, orig_im)
            key = cv2.waitKey(1)
            if key & 0xFF == ord('x'):
                break
    finally:
        # Release the capture, the windows and the serial port on every exit
        # path, including errors.
        if cap is not None:
            cap.release()
        cv2.destroyAllWindows()
        ser.close()