【OCR学习笔记】6、OCR传统字符识别实践

简介: 【OCR学习笔记】6、OCR传统字符识别实践

1 简介


字符特征提取完成后,接下来的主要任务就是识别字符。传统的机器学习将这一任务转换为一个分类任务。针对该任务,出现了一系列的分类模型,主要包括:支持向量机、近邻算法、多层感知器等。


2 支持向量机


SVM原理请移步该文:

【机器学习算法】5、支持向量机算法

SVM字符分类代码如下:

# -*- coding: UTF-8 -*-
import cv2
import os
import numpy as np
from sklearn import svm
from PIL import Image
# Vertical projection: split a binary image into per-character regions by
# counting black pixels per column and cutting at the blank columns.
def verticle_projection(thresh1):
    """Segment *thresh1* (2-D binary image, 0=black / 255=white) into a
    list of column-wise character ROIs.

    Side effect: thresh1 is redrawn in place as its projection histogram
    (each column painted black from the bottom for `count` rows).
    """
    (h, w) = thresh1.shape
    # a[j] = number of black pixels in column j (the projection profile).
    a = [0 for z in range(0, w)]
    for j in range(0, w):
        for i in range(0, h):
            # Count the black pixel, then clear it to white.
            if thresh1[i, j] == 0:
                a[j] += 1
                thresh1[i, j] = 255
    # Redraw each column's count as a black bar rising from the bottom.
    for j in range(0, w):
        for i in range((h - a[j]), h):
            thresh1[i, j] = 0
    # Collect one ROI per contiguous run of non-blank columns.
    # BUG FIX: the original appended a slice on EVERY column index, flooding
    # the result with duplicate/empty ROIs; append only when a run closes.
    roi_list = list()
    start_index = 0
    in_block = False
    for i in range(0, w):
        if not in_block and a[i] != 0:
            # A character run starts at this column.
            in_block = True
            start_index = i
        elif in_block and a[i] == 0:
            # The run just ended: slice its columns out exactly once.
            in_block = False
            roi_list.append(thresh1[0:h, start_index:i])
    if in_block:
        # A run reaching the right edge would otherwise be dropped.
        roi_list.append(thresh1[0:h, start_index:w])
    return roi_list
# Convert a binarised 2-D array into a grid-feature map: the image is tiled
# into 4x4-pixel cells and each output entry is the pixel-value sum of a cell.
def get_features(array):
    """Return an np.ndarray of per-cell (4x4) pixel sums for *array*.

    NOTE(review): the outer loop over w//4 is used as a ROW offset and the
    inner loop over h//4 as a COLUMN offset, so the result is transposed
    relative to the input; all callers pass square 32x32 images, where this
    is harmless.
    """
    h, w = array.shape
    data = []
    # BUG FIX: Python 3 requires floor division here -- `w / 4` yields a
    # float and range() raises TypeError on it.
    for x in range(0, w // 4):
        offset_y = x * 4
        temp = []
        for y in range(0, h // 4):
            offset_x = y * 4
            # Sum of the 4x4 cell (counts 255s for a binary image).
            sum_temp = array[0 + offset_y:4 + offset_y, 0 + offset_x:4 + offset_x]
            temp.append(sum(sum(sum_temp)))
        data.append(temp)
    return np.asarray(data)
def train_main():
    """Train a LinearSVC digit classifier.

    Reads training images from ../dataset/train/<digit>/, resizes each to
    32x32, binarises it, reduces it to a flat grid-feature vector (see
    get_features) and fits a linear SVM. Returns the fitted classifier.
    """
    train_path = "../dataset/train/"
    # One sub-folder per digit class; the folder name is the label.
    train_files = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    train_X = []
    train_y = []
    for train_file in train_files:
        pictures = os.listdir(train_path + train_file)
        for picture in pictures:
            img = cv2.imread(train_path + train_file + "/" + picture)
            img = cv2.resize(img, (32, 32))
            gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            ret, thresh1 = cv2.threshold(
                gray_image, 130, 255, cv2.THRESH_BINARY)
            feature = get_features(thresh1)
            feature = feature.reshape(feature.shape[0] * feature.shape[1])
            train_X.append(feature)
            train_y.append(train_file)
    # BUG FIX: the np.array conversion must happen AFTER all samples are
    # collected. The original converted inside the inner loop, turning the
    # lists into ndarrays, so the second sample's .append raised
    # AttributeError (ndarray has no append).
    train_X = np.array(train_X)
    train_y = np.array(train_y)
    linearsvc_clf = svm.LinearSVC()
    linearsvc_clf.fit(train_X, train_y)
    return linearsvc_clf
def test_main(linearsvc_clf):
    """Segment characters from the test image and print the SVM predictions."""
    # Load the test image, convert to grayscale, then binarise.
    img = cv2.imread("../dataset/test/idcard1.jpg")
    gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, thresh1 = cv2.threshold(gray_image, 130, 255, cv2.THRESH_BINARY)
    # Cut the line into per-character regions via vertical projection.
    roi_list = verticle_projection(thresh1)
    # One flat grid-feature vector per segmented character.
    test_X = []
    for roi in roi_list:
        resized = cv2.resize(roi, (32, 32), interpolation=cv2.INTER_CUBIC)
        grid = get_features(resized)
        test_X.append(grid.reshape(grid.shape[0] * grid.shape[1]))
    # Classify all characters in one batch and report the result.
    result = linearsvc_clf.predict(np.array(test_X))
    print(result)
if __name__ == '__main__':
    # Script entry point: train the SVM classifier, then run it on the test image.
    linearsvc_clf = train_main()
    test_main(linearsvc_clf)


3 近邻算法


近邻算法原理请移步该文:

【机器学习算法】3、K-近邻算法

近邻算法字符分类代码如下:

# -*- coding: UTF-8 -*-
import cv2
import os
import numpy as np
from sklearn import neighbors
from PIL import Image
# Vertical projection: split a binary image into per-character regions by
# counting black pixels per column and cutting at the blank columns.
def verticle_projection(thresh1):
    """Segment *thresh1* (2-D binary image, 0=black / 255=white) into a
    list of column-wise character ROIs.

    Side effect: thresh1 is redrawn in place as its projection histogram
    (each column painted black from the bottom for `count` rows).
    """
    (h, w) = thresh1.shape
    # a[j] = number of black pixels in column j (the projection profile).
    a = [0 for z in range(0, w)]
    for j in range(0, w):
        for i in range(0, h):
            # Count the black pixel, then clear it to white.
            if thresh1[i, j] == 0:
                a[j] += 1
                thresh1[i, j] = 255
    # Redraw each column's count as a black bar rising from the bottom.
    for j in range(0, w):
        for i in range((h - a[j]), h):
            thresh1[i, j] = 0
    # Collect one ROI per contiguous run of non-blank columns.
    # BUG FIX: the original appended a slice on EVERY column index, flooding
    # the result with duplicate/empty ROIs; append only when a run closes.
    roi_list = list()
    start_index = 0
    in_block = False
    for i in range(0, w):
        if not in_block and a[i] != 0:
            # A character run starts at this column.
            in_block = True
            start_index = i
        elif in_block and a[i] == 0:
            # The run just ended: slice its columns out exactly once.
            in_block = False
            roi_list.append(thresh1[0:h, start_index:i])
    if in_block:
        # A run reaching the right edge would otherwise be dropped.
        roi_list.append(thresh1[0:h, start_index:w])
    return roi_list
# Convert a binarised 2-D array into a grid-feature map: the image is tiled
# into 4x4-pixel cells and each output entry is the pixel-value sum of a cell.
def get_features(array):
    """Return an np.ndarray of per-cell (4x4) pixel sums for *array*.

    NOTE(review): the outer loop over w//4 is used as a ROW offset and the
    inner loop over h//4 as a COLUMN offset, so the result is transposed
    relative to the input; all callers pass square 32x32 images, where this
    is harmless.
    """
    h, w = array.shape
    data = []
    # BUG FIX: Python 3 requires floor division here -- `w / 4` yields a
    # float and range() raises TypeError on it.
    for x in range(0, w // 4):
        offset_y = x * 4
        temp = []
        for y in range(0, h // 4):
            offset_x = y * 4
            # Sum of the 4x4 cell (counts 255s for a binary image).
            sum_temp = array[0 + offset_y:4 + offset_y, 0 + offset_x:4 + offset_x]
            temp.append(sum(sum(sum_temp)))
        data.append(temp)
    return np.asarray(data)
def train_main():
    """Train a K-nearest-neighbours digit classifier.

    Reads training images from ../dataset/train/<digit>/, resizes each to
    32x32, binarises it, reduces it to a flat grid-feature vector (see
    get_features) and fits a KNN model. Returns the fitted classifier.
    """
    train_path = "../dataset/train/"
    # One sub-folder per digit class; the folder name is the label.
    train_files = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    features = []
    labels = []
    for label in train_files:
        folder = train_path + label
        for name in os.listdir(folder):
            image = cv2.imread(folder + "/" + name)
            image = cv2.resize(image, (32, 32))
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            ret, binary = cv2.threshold(
                gray, 130, 255, cv2.THRESH_BINARY)
            grid = get_features(binary)
            features.append(grid.reshape(grid.shape[0] * grid.shape[1]))
            labels.append(label)
    # Fit on the full sample matrix.
    knn_clf = neighbors.KNeighborsClassifier()
    knn_clf.fit(np.array(features), np.array(labels))
    return knn_clf
def test_main(knn_clf):
    """Segment characters from the test image and print the KNN predictions."""
    # Load the test image, convert to grayscale, then binarise.
    img = cv2.imread("../dataset/test/idcard1.jpg")
    gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, thresh1 = cv2.threshold(gray_image, 130, 255, cv2.THRESH_BINARY)
    # Cut the line into per-character regions via vertical projection.
    roi_list = verticle_projection(thresh1)
    # One flat grid-feature vector per segmented character.
    test_X = []
    for roi in roi_list:
        resized = cv2.resize(roi, (32, 32), interpolation=cv2.INTER_CUBIC)
        grid = get_features(resized)
        test_X.append(grid.reshape(grid.shape[0] * grid.shape[1]))
    # Classify all characters in one batch and report the result.
    result = knn_clf.predict(np.array(test_X))
    print(result)
if __name__ == '__main__':
    # Script entry point: train the KNN classifier, then run it on the test image.
    knn_clf = train_main()
    test_main(knn_clf)


4 多层感知器


多层感知器是一种常见的人工神经网络模型,主要包括:输入层、隐含层、输出层三个部分。每个MLP模型均可包含一个或者多个隐含层。最简单的MLP是一种三层结构,如图所示:

可以看出,无论输入层和隐含层之间,还是隐含层和输出层之间都是全连接的。输入层,顾名思义,就是原始数据的输入,比如输入一个n维的向量,那么输入有n个神经元。隐含层是对输入层的数据进行一定的运算后得到的,具体公式如下:

其中,$W$ 表示权重参数,$b$ 表示偏置项,$f$ 为激活函数,常用的有 sigmoid 函数和 tanh 函数。相应的公式分别为:

fdb7be068560c3ac8afa6e4ab7a58df1.png

最后是输出层,输出层和隐含层之间可以看作是一个类似于多类别逻辑回归的映射关系,也就是 softmax 回归。所以,输出层最终的输出可以表示为:

其中,$h$ 为上面提到的隐含层的输出,$W$ 为权重参数,$b$ 为偏置项。将上述的公式整合,可以得到上述三层MLP最终的输出层输出公式:

c849803d7aae8d1ca9ea36904bb3c3d7.png

多层感知器字符分类代码如下:

# -*- coding: UTF-8 -*-
import cv2
import os
import numpy as np
from sklearn.neural_network import MLPClassifier
from PIL import Image
# Vertical projection: split a binary image into per-character regions by
# counting black pixels per column and cutting at the blank columns.
def verticle_projection(thresh1):
    """Segment *thresh1* (2-D binary image, 0=black / 255=white) into a
    list of column-wise character ROIs.

    Side effect: thresh1 is redrawn in place as its projection histogram
    (each column painted black from the bottom for `count` rows).
    """
    (h, w) = thresh1.shape
    # a[j] = number of black pixels in column j (the projection profile).
    a = [0 for z in range(0, w)]
    for j in range(0, w):
        for i in range(0, h):
            # Count the black pixel, then clear it to white.
            if thresh1[i, j] == 0:
                a[j] += 1
                thresh1[i, j] = 255
    # Redraw each column's count as a black bar rising from the bottom.
    for j in range(0, w):
        for i in range((h - a[j]), h):
            thresh1[i, j] = 0
    # Collect one ROI per contiguous run of non-blank columns.
    # BUG FIX: the original appended a slice on EVERY column index, flooding
    # the result with duplicate/empty ROIs; append only when a run closes.
    roi_list = list()
    start_index = 0
    in_block = False
    for i in range(0, w):
        if not in_block and a[i] != 0:
            # A character run starts at this column.
            in_block = True
            start_index = i
        elif in_block and a[i] == 0:
            # The run just ended: slice its columns out exactly once.
            in_block = False
            roi_list.append(thresh1[0:h, start_index:i])
    if in_block:
        # A run reaching the right edge would otherwise be dropped.
        roi_list.append(thresh1[0:h, start_index:w])
    return roi_list
# Convert a binarised 2-D array into a grid-feature map: the image is tiled
# into 4x4-pixel cells and each output entry is the pixel-value sum of a cell.
def get_features(array):
    """Return an np.ndarray of per-cell (4x4) pixel sums for *array*.

    NOTE(review): the outer loop over w//4 is used as a ROW offset and the
    inner loop over h//4 as a COLUMN offset, so the result is transposed
    relative to the input; all callers pass square 32x32 images, where this
    is harmless.
    """
    h, w = array.shape
    data = []
    # BUG FIX: Python 3 requires floor division here -- `w / 4` yields a
    # float and range() raises TypeError on it.
    for x in range(0, w // 4):
        offset_y = x * 4
        temp = []
        for y in range(0, h // 4):
            offset_x = y * 4
            # Sum of the 4x4 cell (counts 255s for a binary image).
            sum_temp = array[0 + offset_y:4 + offset_y, 0 + offset_x:4 + offset_x]
            temp.append(sum(sum(sum_temp)))
        data.append(temp)
    return np.asarray(data)
def train_main():
    """Train a multi-layer-perceptron digit classifier.

    Reads training images from ../dataset/train/<digit>/, resizes each to
    32x32, binarises it, reduces it to a flat grid-feature vector (see
    get_features) and fits an MLP. Returns the fitted classifier.
    """
    train_path = "../dataset/train/"
    # One sub-folder per digit class; the folder name is the label.
    train_files = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    features = []
    labels = []
    for label in train_files:
        folder = train_path + label
        for name in os.listdir(folder):
            image = cv2.imread(folder + "/" + name)
            image = cv2.resize(image, (32, 32))
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            ret, binary = cv2.threshold(gray, 130, 255, cv2.THRESH_BINARY)
            grid = get_features(binary)
            features.append(grid.reshape(grid.shape[0] * grid.shape[1]))
            labels.append(label)
    # Two hidden layers of 32 units; lbfgs converges well on small datasets.
    mlp_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 32), random_state=1)
    mlp_clf.fit(np.array(features), np.array(labels))
    return mlp_clf
def test_main(mlp_clf):
    """Segment characters from the test image and print the MLP predictions."""
    # Load the test image, convert to grayscale, then binarise.
    img = cv2.imread("../dataset/test/idcard1.jpg")
    gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, thresh1 = cv2.threshold(gray_image, 130, 255, cv2.THRESH_BINARY)
    # Cut the line into per-character regions via vertical projection.
    roi_list = verticle_projection(thresh1)
    # One flat grid-feature vector per segmented character.
    test_X = []
    for roi in roi_list:
        resized = cv2.resize(roi, (32, 32), interpolation=cv2.INTER_CUBIC)
        grid = get_features(resized)
        test_X.append(grid.reshape(grid.shape[0] * grid.shape[1]))
    # Classify all characters in one batch and report the result.
    result = mlp_clf.predict(np.array(test_X))
    print(result)
if __name__ == '__main__':
    # Script entry point: train the MLP classifier, then run it on the test image.
    mlp_clf = train_main()
    test_main(mlp_clf)

相关文章
|
3天前
|
文字识别 前端开发 JavaScript
Star33.1k!推荐一个基于网页的OCR(光学字符识别)引擎库
想要在前端解决图像识别的兄弟,可以到 Github 上下载Tesseract.js库,安装和相关学习文档都能下载到,实在获取不到的兄弟找V哥发给你,假期第二天,出去放松的同时也可以看看 V 哥的文章,祝大家玩得开心。
|
3月前
|
文字识别 数据可视化 PyTorch
OCR-字符识别笔记
OCR-字符识别笔记
33 0
|
5月前
|
机器学习/深度学习 文字识别 算法
[Halcon&识别] OCR字符识别
[Halcon&识别] OCR字符识别
78 0
|
11月前
|
人工智能 文字识别 API
OCR(Optical Character Recognition,光学字符识别)
OCR(Optical Character Recognition,光学字符识别)是一种将图像中的文字转换成可编辑文本的技术。OCR 技术可以应用于各种场景,例如自动化办公、图像文本识别、车牌识别、身份证识别、发票识别等。
149 1
|
11月前
|
文字识别 算法 计算机视觉
MATLAB实现OCR识别数字和字符
OCR也叫做光学字符识别,是计算机视觉研究领域的分支之一。它是利用光学技术和计算机技术把印在或写在纸上的文字读取出来,并转换成一种计算机能够接受、人又可以理解的格式。
|
11月前
|
机器学习/深度学习 人工智能 文字识别
深度学习应用篇-计算机视觉-OCR光学字符识别[7]:OCR综述、常用CRNN识别方法、DBNet、CTPN检测方法等、评估指标、应用场景
深度学习应用篇-计算机视觉-OCR光学字符识别[7]:OCR综述、常用CRNN识别方法、DBNet、CTPN检测方法等、评估指标、应用场景
深度学习应用篇-计算机视觉-OCR光学字符识别[7]:OCR综述、常用CRNN识别方法、DBNet、CTPN检测方法等、评估指标、应用场景
|
12月前
|
文字识别
【OCR学习笔记】9、OCR中文项目综合实践(CTPN+CRNN+CTC Loss原理讲解)(三)
【OCR学习笔记】9、OCR中文项目综合实践(CTPN+CRNN+CTC Loss原理讲解)(三)
177 0
|
12月前
|
机器学习/深度学习 文字识别 算法
【OCR学习笔记】9、OCR中文项目综合实践(CTPN+CRNN+CTC Loss原理讲解)(二)
【OCR学习笔记】9、OCR中文项目综合实践(CTPN+CRNN+CTC Loss原理讲解)(二)
250 0
|
12月前
|
机器学习/深度学习 文字识别 算法
【OCR学习笔记】9、OCR中文项目综合实践(CTPN+CRNN+CTC Loss原理讲解)(一)
【OCR学习笔记】9、OCR中文项目综合实践(CTPN+CRNN+CTC Loss原理讲解)(一)
287 0
|
12月前
|
机器学习/深度学习 文字识别 PyTorch
【OCR学习笔记】8、OCR移动端网络汇总与PyTorch实现(二)
【OCR学习笔记】8、OCR移动端网络汇总与PyTorch实现(二)
141 0

热门文章

最新文章