機械学習のお勉強(Tiny Yoloを動かしてみる：予測のみ)

やりたいこと

低スペックパソコンで Tiny YOLOを使ってざっくりとした人の位置と大きさを出力する

教科書　👇

わかりやすい記事ありがとうございます。

今回扱うTinyYOLOの説明

上記でもあるように、

Pascal VOCという20クラス分類問題のデータセットに対して係数は最適化されています。

全画面を7x7のグリッドに区切り、グリッドごとのクラス確率と、そのグリッド内に中心をもつバウンディングボックスを1グリッドにつき最大2つずつ、そして、その2つのバウンディングボックス信頼度、というすべての情報を直接推論します。

バウンディングボックスの座標は中心位置のx,yをグリッドサイズで0から1.0に正規化した数値、サイズは幅と高さを画像サイズで0から1.0に正規化した数値の平方根になっています。平方根にする理由は、サイズが大きくなったときの誤差のペナルティを減らしたいからだそうです。

最終段の出力ベクトルは、
7x7x20 のグリッドごとの20クラスの確率
7x7x2 の各グリッドに２組のバウンディングボックスごとの信頼度
7x7x2x4 の各グリッドに２組のバウンディングボックス座標（0-1で正規化してある）
という順番で並んでいるので、reshapeしたあと、信頼度の高いバウンディングボックスを特定して、その座標とクラスを取り出します。

推論時はクラス確率とバウンディングボックスの信頼度を乗算してしきい値で切ればよいですが、学習時は重要なところだけロス関数に乗りやすいような工夫をするそうです。

という感じ。

モデル自体はシンプルで、使っているのは

畳み込み層
全結合層
leaky_relu（活性関数）
Max Pooling

くらい

実装(CPU)

[https://github.com/ashitani/YOLO_chainer:title] さんの実装を少しだけ変更する。ありがたや。

predict.py

import numpy as np

def predict(model,im_org):
  im0=cv2.cvtColor(im_org, cv2.COLOR_BGR2RGB)
  im_size=np.shape(im0)
  im0=cv2.resize(im0,(448,448))
  im=np.asarray(im0,dtype=np.float32)/255.0
  im=im*2.0-1.0

  ans=model.predict( im.transpose(2,0,1).reshape(1,3,448,448)).data[0]

  probs=ans[0:980].reshape((7,7,20))     # class probabilities
  confs=ans[980:1078].reshape((7,7,2))   # confidence score for Bounding Boxes
  boxes = ans[1078:].reshape((7,7,2,4))  # Bounding Boxes positions (x,y,w,h)

  p=np.zeros((7,7,2,20))
  for i in range(20):
      for j in range(2):
          p[:,:,j,i]=np.multiply(probs[:,:,i],confs[:,:,j])

  th=0.1

  im_h=im_size[0]
  im_w=im_size[1]

  im_marked=im_org

  classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
             "bus", "car", "cat", "chair", "cow",
             "diningtable", "dog", "horse", "motorbike", "person",
             "pottedplant", "sheep", "sofa", "train","tvmonitor"]

  pos_size = None
  class_label = None

  for z in np.argwhere(p>th):
      by,bx,j,i = z
      box=boxes[by,bx,j,:]
      x= (bx+box[0])*(im_w/7)
      y = (by+box[1])*(im_h/7)
      w = box[2]**2.0*im_w
      h = box[3]**2.0*im_h
      im_marked=cv2.rectangle(im_marked, (int(x-w/2), int( y-h/2)),( int(x+w/2), int(y+h/2)),[0,0,255],thickness=2)
      im_marked=cv2.rectangle(im_marked, (int(x-w/2), int( y-h/2)),( int(x-w/2+100), int(y-h/2+20)),[0,0,255],thickness=-1)
      cv2.putText(im_marked, classes[i],(int(x-w/2+5),int(y-h/2+15)), cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,0),thickness=2)

      pos_size = [x, y, box[2]**2.0, box[3]**2.0]
      class_label = classes[i]

  return im_marked, class_label, pos_size

replay.py

#!/usr/bin/env python

from YOLOtiny_chainer import *
import cv2
import sys
import os
import numpy as np

print "Loading model"
model=YOLOtiny()
serializers.load_npz('YOLOtiny_chainer/YOLOtiny.model',model)

cap = cv2.VideoCapture(0)

while(1):
    _, frame = cap.read()

    im_marked, class_label, pos_size = predict(model,frame)

    if class_label is "person":
        size = pos_size[2] * pos_size[3]  # volume of bounding box 
    print "position_x:{}".format(pos_size[0]) , "position_y:{}".format(pos_size[1])
    print "person_size:{}".format(size)

    cv2.imshow('im_marked',im_marked)
    k = cv2.waitKey(5) & 0xFF
    if k == 27:
        break

cap.release()
cv2.destroyAllWindows()

出力結果はこんな感じ　👇

bounding boxの中心位置（取得画像サイズでの横軸X,縦軸Yの値）
bounding boxの面積 (ざっくりとしたサイズ)

f:id:robonchu:20171007111844p:plain

実装(GPU)

import cv2
import numpy as np
from chainer import cuda

def predict(model, im_org, device = -1):    
  im0=cv2.cvtColor(im_org, cv2.COLOR_BGR2RGB)
  im_size=np.shape(im0)
  im0=cv2.resize(im0,(448,448))
  im=np.asarray(im0,dtype=np.float32)/255.0
  im=im*2.0-1.0

  img = im.transpose(2,0,1).reshape(1,3,448,448)

  if device == 0:
    cuda.get_device(device).use()
    model.to_gpu(device)
    img = cuda.to_gpu(img, device)
    xp = cuda.cupy
  else:
    xp = np
    
  ans=model.predict(img).data[0]

  probs=ans[0:980].reshape((7,7,20))     # class probabilities
  confs=ans[980:1078].reshape((7,7,2))   # confidence score for Bounding Boxes
  boxes = ans[1078:].reshape((7,7,2,4))  # Bounding Boxes positions (x,y,w,h)

  p=xp.zeros((7,7,2,20))
  for i in range(20):
      for j in range(2):
          p[:,:,j,i]=xp.multiply(probs[:,:,i],confs[:,:,j])

  th=0.1

  im_h=im_size[0]
  im_w=im_size[1]

  im_marked=im_org

  classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
             "bus", "car", "cat", "chair", "cow",
             "diningtable", "dog", "horse", "motorbike", "person",
             "pottedplant", "sheep", "sofa", "train","tvmonitor"]

  pos_size = None
  class_label = None

  for z in np.argwhere(p>th):
      by,bx,j,i = z
      print i
      box=boxes[by,bx,j,:]
      x= (bx+box[0])*(im_w/7)
      y = (by+box[1])*(im_h/7)
      w = box[2]**2.0*im_w
      h = box[3]**2.0*im_h
      im_marked=cv2.rectangle(im_marked, (int(x-w/2), int( y-h/2)),( int(x+w/2), int(y+h/2)),[0,0,255],thickness=2)
      im_marked=cv2.rectangle(im_marked, (int(x-w/2), int( y-h/2)),( int(x-w/2+100), int(y-h/2+20)),[0,0,255],thickness=-1)
      cv2.putText(im_marked, classes[int(i)],(int(x-w/2+5),int(y-h/2+15)), cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,0),thickness=2)

      pos_size = [x, y, box[2]**2.0, box[3]**2.0]
      class_label = classes[int(i)]

  return im_marked, class_label, pos_size