50 行純 Python code 做中文手寫辨識

Posted by TJ Wei on 星期六, 6月 25, 2011 with No comments

用 50 行 Python 3.2 source code 實作中文手寫辨識的核心,外加訓練的部分也在五十行以內,演算法與 Zinnia 相同。完整 Source Code

辨識核心的 source
from itertools import chain
from math import atan2, hypot

def farthestVertex(path):
    """Find the vertex of ``path`` that deviates most from its end-to-end chord.

    Args:
        path: non-empty sequence of (x, y) points.

    Returns:
        A tuple ``(index, score)``. ``index`` is the position in ``path`` of
        the vertex farthest from the line through ``path[0]`` and
        ``path[-1]``. ``score`` is ``maxf**2 / hypot(a, b)``, which equals the
        squared perpendicular distance of that vertex multiplied by the chord
        length. When the two endpoints coincide there is no chord, and
        ``(0, 0.0)`` is returned.
    """
    first, last = path[0], path[-1]
    if first == last:
        # Degenerate chord: report "no deviation" in the same (index, score)
        # order as the normal return so callers can unpack both uniformly.
        return 0, 0.0
    # Line through first and last written as a*x + b*y = c.
    a, b = first[1] - last[1], last[0] - first[0]
    c = a * first[0] + b * first[1]
    # Ties on the deviation are broken in favour of the larger index because
    # the (deviation, index) tuples are compared lexicographically.
    maxf, bestj = max((abs(a * x + b * y - c), j)
                      for j, (x, y) in enumerate(path))
    # NOTE(review): a plain squared distance would divide by a*a + b*b rather
    # than hypot(a, b). Left unchanged because thresholds and any trained
    # models depend on the current scale -- confirm before altering.
    return bestj, maxf ** 2 / hypot(a, b)

def split(path, i, Error=0.001, kMaxCharacters=50):
    """Recursively split ``path`` into straight-ish segments.

    Args:
        path: non-empty sequence of (x, y) points.
        i: id of the current segment; the two halves of a split segment get
            ids ``2*i + 1`` and ``2*i + 2`` (binary-tree numbering).
        Error: a segment is split further only while the score returned by
            ``farthestVertex`` exceeds this threshold.
        kMaxCharacters: segments whose id exceeds this value are dropped,
            which also bounds the recursion depth.

    Returns:
        An iterable of ``(id, (first_point, last_point))`` pairs: the segment
        itself followed by those of its sub-segments (empty when ``i`` is
        beyond ``kMaxCharacters``).
    """
    if i > kMaxCharacters:
        return []
    iter0 = ((i, (path[0], path[-1])),)
    j, dist = farthestVertex(path)
    if dist <= Error:
        return iter0
    # Pass the caller's thresholds down; otherwise non-default values would
    # only apply to the root segment.
    return chain(iter0,
                 split(path[:j + 1], i * 2 + 1, Error, kMaxCharacters),
                 split(path[j:], i * 2 + 2, Error, kMaxCharacters))

def toFeature(seg):
    """Build the 12-value feature tuple for a segment.

    ``seg`` holds exactly two (x, y) points. Both are shifted by -0.5 on each
    axis before the features are computed. The components, in order, are:
    length, direction, the four shifted coordinates, the angle of each shifted
    point, the distance of each shifted point from the origin, and the
    coordinate differences.
    """
    (x0, y0), (x1, y1) = seg
    ax, ay = x0 - 0.5, y0 - 0.5
    bx, by = x1 - 0.5, y1 - 0.5
    dx, dy = bx - ax, by - ay

    length = 10 * hypot(dx, dy)                          # 10 * length
    direction = atan2(dy, dx)                            #  1 * direction
    position = (10 * ax, 10 * ay, 10 * bx, 10 * by)      # 10 * absolute position
    degree = (atan2(ay, ax), atan2(by, bx))              #  1 * absolute degree
    distance = (10 * hypot(ax, ay), 10 * hypot(bx, by))  # 10 * absolute distance
    diff = (5 * dx, 5 * dy)                              #  5 * diff

    return (length, direction) + position + degree + distance + diff

def getFeatures(dim, strokes):
    """Convert raw strokes into a sparse feature dict ``{feature id: value}``.

    Args:
        dim: ``(width, height)`` used to scale every point; x is divided by
            ``dim[0]`` and y by ``dim[1]``.
        strokes: iterable of strokes, each an iterable of (x, y) points.

    Returns:
        A dict of features. For stroke number ``sid`` the segment features
        start at id ``sid*1000 + 20*i + 1`` (``i`` is the segment id from
        ``split``) and the pen-movement feature from the previous stroke
        starts at ``sid*1000 + 100001``. Id ``2000000`` holds the stroke count
        and id ``2000000 + stroke count`` holds the constant 10. With no
        strokes at all an empty dict is returned.
    """
    fstrokes = [[(x / dim[0], y / dim[1]) for (x, y) in stk]
                for stk in strokes]
    if not fstrokes:
        # Previously this fell through to the stroke-count features with the
        # loop variable unbound and raised UnboundLocalError.
        return {}
    prev = None
    x = {}
    for sid, path in enumerate(fstrokes):
        for i, n in split(path, 0):  # segment features
            x.update(enumerate(toFeature(n), sid * 1000 + 20 * i + 1))
        if prev is not None:  # movement feature: previous stroke end -> this start
            x.update(enumerate(toFeature((prev, path[0])),
                               sid * 1000 + 100001))
        prev = path[-1]
    nstrokes = len(fstrokes)
    x[2000000] = nstrokes
    x[2000000 + nstrokes] = 10
    return x

def recognize(model, dim, strokes, nbest=10, best=None):
    """Score the strokes against every model entry and keep the top results.

    Args:
        model: iterable of ``(c, b, w)`` entries where ``c`` is the value
            returned alongside the score (presumably the character label),
            ``b`` is added to the score as a bias, and ``w`` is an iterable of
            ``(feature id, weight)`` pairs.
        dim: ``(width, height)`` passed through to ``getFeatures``.
        strokes: strokes passed through to ``getFeatures``.
        nbest: maximum number of results to keep.
        best: optional initial result list, expected to be sorted in
            descending order; it is never modified in place.

    Returns:
        A list of at most ``nbest`` ``(score, c)`` tuples in descending order.
        An empty list when ``nbest`` is not positive.
    """
    if nbest <= 0:
        # Nothing can be kept; the original indexed best[-1] on an empty list
        # here and raised IndexError.
        return []
    best = [] if best is None else best
    x = getFeatures(dim, strokes)
    for c, b, w in model:
        # Sparse dot product: only features present in the input contribute.
        v = (b + sum(f * x[i] for i, f in w if i in x), c)
        if len(best) < nbest or best[-1] < v:
            best = sorted(best + [v], reverse=True)[:nbest]
    return best
SVM 訓練的 source
import random
from collections import defaultdict

def svm_train(ys, xs, C, kINF=10.0**37, kSMALL=10.0**-12, kEPS=0.1):
    """Train a linear SVM by coordinate descent on the dual, with shrinking.

    The structure appears to follow the dual coordinate-descent solver used
    by LIBLINEAR/Zinnia (verify against those sources before relying on it).

    Args:
        ys: one label per sample; it is multiplied into the decision value,
            so presumably +1 / -1 -- confirm against callers.
        xs: one sample per label; each sample is a re-iterable sequence of
            ``(feature id, value)`` pairs.
        C: upper bound for every dual variable ``alpha[i]``.
        kINF: large number used as "infinity" for the shrinking bounds.
        kSMALL: projected gradients with magnitude at or below this are
            treated as zero (no update).
        kEPS: stop once the spread of projected gradients over the full
            sample set is at most this value.

    Returns:
        ``defaultdict(float)`` mapping feature id to weight.

    Side effects:
        Prints a "." (no newline) on every fourth pass. At most 2000 passes
        are made.
    """
    # Start with unbounded shrinking limits so that no sample is discarded
    # during the first pass.
    PGmax_old, PGmin_old = kINF, -kINF
    w = defaultdict(float)
    alpha = [0.0] * len(xs)
    # Squared norm of each sample: the second derivative along its coordinate.
    QD = [sum(v * v for _, v in x) for x in xs]
    index = index0 = range(len(xs))
    for itr in range(2000):
        if itr % 4 == 0:
            print(".", end='')
        PGmax_new, PGmin_new = -kINF, kINF
        index_new = []
        for i in random.sample(index, len(index)):
            # Gradient of the dual objective with respect to alpha[i].
            PG = sum(w[j] * v for j, v in xs[i] if j in w) * ys[i] - 1
            if alpha[i] == 0.0:
                if PG > PGmax_old:
                    continue  # shrink: drop from the active set
                PG = min(PG, 0.0)
            elif alpha[i] == C:
                if PG < PGmin_old:
                    continue  # shrink: drop from the active set
                PG = max(PG, 0.0)
            index_new.append(i)
            PGmax_new = max(PGmax_new, PG)
            PGmin_new = min(PGmin_new, PG)
            if abs(PG) > kSMALL:
                alpha_old = alpha[i]
                if QD[i] > 0:
                    # Newton step clipped to the box [0, C].
                    alpha[i] = sorted((0.0, alpha[i] - PG / QD[i], C))[1]
                else:
                    # Sample with no non-zero feature: the step is unbounded,
                    # so jump to the box edge instead of dividing by zero.
                    alpha[i] = C if PG < 0 else 0.0
                d = (alpha[i] - alpha_old) * ys[i]
                w.update((j, w[j] + d * v) for j, v in xs[i])
        index = index_new
        if PGmax_new - PGmin_new <= kEPS:
            if len(index) == len(xs):
                break
            # Converged on the shrunk set only: re-check with every sample.
            index = index0
            PGmax_old, PGmin_old = kINF, -kINF
        else:
            # Bounds for the next pass come from this pass's extremes; a
            # non-positive max (non-negative min) disables that bound.
            PGmax_old = PGmax_new if PGmax_new > 0 else kINF
            PGmin_old = PGmin_new if PGmin_new < 0 else -kINF
    return w
Categories: