iOS使用摄像头实时扫描，并实时提取文字

要实现“通过摄像头实时扫描，并将识别出的文字实时叠加显示在屏幕上”的效果，可以结合 Apple 的 AVFoundation 和 Vision 框架来完成。以下是完整的实现步骤：

功能描述

使用 AVCaptureSession 打开摄像头捕获实时视频流。
使用 Vision 框架的 VNRecognizeTextRequest 对每一帧视频进行文字识别。
将识别到的文字叠加显示在屏幕上的 UILabel 中。

实现代码

import UIKit
import AVFoundation
import Vision

/// Shows a live camera preview and displays the text that Vision recognizes in the
/// captured frames on a label pinned to the bottom of the screen.
///
/// Lifecycle: the capture session is configured once in `viewDidLoad`, started in
/// `viewWillAppear` and stopped in `viewWillDisappear`, so the camera is only active
/// while this controller is on screen.
class LiveTextRecognitionViewController: UIViewController {

    private var captureSession: AVCaptureSession!
    private var previewLayer: AVCaptureVideoPreviewLayer!
    private var detectedTextLabel: UILabel!
    private var textRequest: VNRecognizeTextRequest!

    /// Serial queue for the blocking `startRunning()` / `stopRunning()` calls, so they
    /// never stall the main thread and always execute in the order they were issued.
    private let sessionQueue = DispatchQueue(label: "captureSessionQueue", qos: .userInitiated)

    /// Serial queue on which the video output delivers sample buffers.
    private let videoFrameQueue = DispatchQueue(label: "videoFrameProcessingQueue")

    /// `true` once an input and an output were added successfully. Only touched on the main thread.
    private var isSessionConfigured = false

    // MARK: - View lifecycle

    override func viewDidLoad() {
        super.viewDidLoad()
        view.backgroundColor = .black

        // Create the Vision request before the camera is wired up, so the frame
        // callback can never observe a missing request.
        setupTextRecognitionRequest()

        // Configure the camera (adds the preview layer to the view).
        setupCamera()

        // Add the label last so it is drawn above the preview layer.
        setupDetectedTextLabel()
    }

    override func viewDidLayoutSubviews() {
        super.viewDidLayoutSubviews()
        // The view's bounds are not final in viewDidLoad and change on rotation;
        // keep the preview in sync. The layer is nil when camera setup failed.
        previewLayer?.frame = view.bounds
    }

    override func viewWillAppear(_ animated: Bool) {
        super.viewWillAppear(animated)
        startSession()
    }

    override func viewWillDisappear(_ animated: Bool) {
        super.viewWillDisappear(animated)
        stopSession()
    }

    // MARK: - Camera

    /// Creates the capture session, attaches the default video device and a video data
    /// output, and installs the preview layer. Does not start the session.
    private func setupCamera() {
        let session = AVCaptureSession()
        captureSession = session

        guard let videoDevice = AVCaptureDevice.default(for: .video) else {
            print("无法访问摄像头")
            return
        }

        let videoInput: AVCaptureDeviceInput
        do {
            videoInput = try AVCaptureDeviceInput(device: videoDevice)
        } catch {
            // Surface the underlying reason instead of discarding it.
            print("无法访问摄像头: \(error)")
            return
        }

        guard configure(session, with: videoInput) else { return }
        isSessionConfigured = true

        // Install the preview layer.
        let preview = AVCaptureVideoPreviewLayer(session: session)
        preview.videoGravity = .resizeAspectFill
        preview.frame = view.bounds
        view.layer.addSublayer(preview)
        previewLayer = preview
    }

    /// Adds `input` and a video data output to `session` inside a single configuration
    /// transaction.
    ///
    /// - Returns: `false` if the session rejected the input or the output.
    private func configure(_ session: AVCaptureSession, with input: AVCaptureDeviceInput) -> Bool {
        session.beginConfiguration()
        defer { session.commitConfiguration() }

        session.sessionPreset = .high

        guard session.canAddInput(input) else {
            print("无法添加摄像头输入")
            return false
        }
        session.addInput(input)

        let videoOutput = AVCaptureVideoDataOutput()
        // Recognition is run synchronously in the frame callback; frames that arrive
        // while a previous one is still being processed are dropped, not queued.
        videoOutput.alwaysDiscardsLateVideoFrames = true
        videoOutput.setSampleBufferDelegate(self, queue: videoFrameQueue)

        guard session.canAddOutput(videoOutput) else {
            print("无法添加视频输出")
            return false
        }
        session.addOutput(videoOutput)
        return true
    }

    /// Starts the capture session off the main thread, if it was configured successfully.
    private func startSession() {
        guard isSessionConfigured, let session = captureSession else { return }
        sessionQueue.async {
            if !session.isRunning {
                session.startRunning()
            }
        }
    }

    /// Stops the capture session off the main thread.
    private func stopSession() {
        guard let session = captureSession else { return }
        sessionQueue.async {
            if session.isRunning {
                session.stopRunning()
            }
        }
    }

    // MARK: - UI

    /// Creates the semi-transparent label that shows the recognized text and pins it
    /// to the bottom of the view.
    private func setupDetectedTextLabel() {
        let label = UILabel()
        label.numberOfLines = 0
        label.textColor = .yellow
        label.backgroundColor = UIColor.black.withAlphaComponent(0.5)
        label.textAlignment = .center
        label.translatesAutoresizingMaskIntoConstraints = false
        view.addSubview(label)
        detectedTextLabel = label

        NSLayoutConstraint.activate([
            label.leadingAnchor.constraint(equalTo: view.leadingAnchor, constant: 16),
            label.trailingAnchor.constraint(equalTo: view.trailingAnchor, constant: -16),
            label.bottomAnchor.constraint(equalTo: view.bottomAnchor, constant: -40),
            label.heightAnchor.constraint(equalToConstant: 120)
        ])
    }

    // MARK: - Vision

    /// Builds the reusable text-recognition request (accurate level, English and
    /// Simplified Chinese, language correction enabled).
    private func setupTextRecognitionRequest() {
        let request = VNRecognizeTextRequest { [weak self] request, error in
            guard let self = self else { return }
            if let error = error {
                print("文本识别出错: \(error)")
                return
            }
            self.processTextRecognitionResults(request.results)
        }
        request.recognitionLevel = .accurate
        request.recognitionLanguages = ["en-US", "zh-Hans"]
        request.usesLanguageCorrection = true
        textRequest = request
    }

    /// Joins the best candidate of every text observation, one per line, and shows the
    /// result in the label on the main thread.
    ///
    /// - Parameter results: The `results` of a finished `VNRecognizeTextRequest`;
    ///   anything that is not `[VNRecognizedTextObservation]` is ignored.
    private func processTextRecognitionResults(_ results: [Any]?) {
        guard let observations = results as? [VNRecognizedTextObservation] else { return }

        let detectedText = observations
            .compactMap { $0.topCandidates(1).first?.string }
            .joined(separator: "\n")

        // This is called from the Vision completion handler, not the main thread.
        DispatchQueue.main.async { [weak self] in
            self?.detectedTextLabel.text = detectedText.isEmpty ? "未检测到文字" : detectedText
        }
    }
}

// MARK: - AVCaptureVideoDataOutputSampleBufferDelegate

extension LiveTextRecognitionViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    /// Runs the text-recognition request on every delivered video frame.
    ///
    /// The request is performed synchronously on the queue that delivers the sample
    /// buffers; its completion handler receives the results.
    ///
    /// - Parameters:
    ///   - output: The capture output that produced the frame (unused).
    ///   - sampleBuffer: The captured frame; skipped when it carries no image buffer.
    ///   - connection: The connection the frame arrived on (unused).
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        // Bind the implicitly-unwrapped request instead of crashing if a frame is
        // delivered before it has been created.
        guard let request = textRequest,
              let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }

        // The visible code never changes the connection's orientation, so frames arrive
        // in the sensor's native landscape layout. `.right` tells Vision how to rotate
        // them so text is upright while the device is held in portrait.
        // NOTE(review): assumes a portrait UI with the back camera — map from the
        // current device orientation if landscape must be supported.
        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:])
        do {
            try handler.perform([request])
        } catch {
            print("文本识别请求处理失败: \(error)")
        }
    }
}

代码解析

AVCaptureSession:
AVCaptureSession 用于实时捕获摄像头视频流。
使用 AVCaptureVideoDataOutput 捕获每一帧视频并进行处理。

Vision Text Recognition:
配置 VNRecognizeTextRequest 以进行文字识别。
支持多语言识别：配置 recognitionLanguages 为 ["en-US", "zh-Hans"]，同时支持中文和英文。

实时显示文本:
将识别到的文字拼接成字符串，通过 UILabel 动态更新到屏幕上。

线程处理:
使用 DispatchQueue 在后台处理视频帧。
通过 DispatchQueue.main.async 更新 UI，确保线程安全。