实现通过摄像头实时扫描并将识别的文字实时叠加在屏幕上的效果,可以结合 Apple 的 AVFoundation 和 Vision 框架完成。以下是完整实现步骤:
功能描述
使用 AVCaptureSession 打开摄像头捕获实时视频流。
使用 Vision 框架 的 VNRecognizeTextRequest 对每一帧视频进行文字识别。
将识别到的文字叠加显示在屏幕上的 UILabel 中。
实现代码
import UIKit
import AVFoundation
import Vision
/// Shows a live camera preview and overlays the text recognized in each
/// frame (via Vision OCR) on a label pinned to the bottom of the screen.
class LiveTextRecognitionViewController: UIViewController {

    // MARK: - Properties

    /// Session streaming frames from the back camera.
    private var captureSession: AVCaptureSession!
    /// Layer rendering the live camera feed behind the overlay label.
    private var previewLayer: AVCaptureVideoPreviewLayer!
    /// Overlay label showing the most recently recognized text.
    private var detectedTextLabel: UILabel!
    /// Reusable Vision request configured for accurate, multi-language OCR.
    private var textRequest: VNRecognizeTextRequest!

    // MARK: - Lifecycle

    override func viewDidLoad() {
        super.viewDidLoad()
        view.backgroundColor = .black
        // Configure the camera capture pipeline.
        setupCamera()
        // Configure the label that displays recognized text.
        setupDetectedTextLabel()
        // Configure the Vision text-recognition request.
        setupTextRecognitionRequest()
    }

    override func viewDidLayoutSubviews() {
        super.viewDidLayoutSubviews()
        // Keep the preview layer in sync with the view's final bounds;
        // view.bounds is not final in viewDidLoad and changes on rotation.
        previewLayer?.frame = view.bounds
    }

    // MARK: - Setup

    /// Builds the capture session: camera input, frame output, and preview layer.
    private func setupCamera() {
        captureSession = AVCaptureSession()
        captureSession.sessionPreset = .high

        guard let videoDevice = AVCaptureDevice.default(for: .video),
              let videoInput = try? AVCaptureDeviceInput(device: videoDevice) else {
            print("无法访问摄像头")
            return
        }
        if captureSession.canAddInput(videoInput) {
            captureSession.addInput(videoInput)
        }

        let videoOutput = AVCaptureVideoDataOutput()
        // Drop frames that arrive while a previous frame is still being
        // processed: .accurate OCR is slower than the capture rate, and
        // without this the frame queue (and latency) grows unboundedly.
        videoOutput.alwaysDiscardsLateVideoFrames = true
        videoOutput.setSampleBufferDelegate(self, queue: DispatchQueue(label: "videoFrameProcessingQueue"))
        if captureSession.canAddOutput(videoOutput) {
            captureSession.addOutput(videoOutput)
        }

        // Attach the preview layer; its frame is maintained in viewDidLayoutSubviews.
        previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
        previewLayer.videoGravity = .resizeAspectFill
        previewLayer.frame = view.bounds
        view.layer.addSublayer(previewLayer)

        // startRunning() blocks, so never call it on the main thread.
        // .userInitiated reflects that the user is actively waiting on the
        // preview; weak capture avoids retaining the VC if it is dismissed
        // before the session finishes starting.
        DispatchQueue.global(qos: .userInitiated).async { [weak self] in
            self?.captureSession.startRunning()
        }
    }

    /// Creates the semi-transparent overlay label pinned above the bottom edge.
    private func setupDetectedTextLabel() {
        detectedTextLabel = UILabel()
        detectedTextLabel.numberOfLines = 0
        detectedTextLabel.textColor = .yellow
        detectedTextLabel.backgroundColor = UIColor.black.withAlphaComponent(0.5)
        detectedTextLabel.textAlignment = .center
        detectedTextLabel.translatesAutoresizingMaskIntoConstraints = false
        view.addSubview(detectedTextLabel)
        NSLayoutConstraint.activate([
            detectedTextLabel.leadingAnchor.constraint(equalTo: view.leadingAnchor, constant: 16),
            detectedTextLabel.trailingAnchor.constraint(equalTo: view.trailingAnchor, constant: -16),
            detectedTextLabel.bottomAnchor.constraint(equalTo: view.bottomAnchor, constant: -40),
            detectedTextLabel.heightAnchor.constraint(equalToConstant: 120)
        ])
    }

    /// Configures the Vision OCR request (accurate level, English + Simplified
    /// Chinese, language correction enabled).
    private func setupTextRecognitionRequest() {
        textRequest = VNRecognizeTextRequest { [weak self] (request, error) in
            guard let self = self else { return }
            if let error = error {
                print("文本识别出错: \(error)")
                return
            }
            self.processTextRecognitionResults(request.results)
        }
        textRequest.recognitionLevel = .accurate
        textRequest.recognitionLanguages = ["en-US", "zh-Hans"]
        textRequest.usesLanguageCorrection = true
    }

    // MARK: - Results

    /// Joins the top candidate of each observation into one string and pushes
    /// it to the label on the main thread (the Vision callback runs on the
    /// background video queue).
    private func processTextRecognitionResults(_ results: [Any]?) {
        guard let results = results as? [VNRecognizedTextObservation] else { return }
        let detectedText = results.compactMap { observation in
            observation.topCandidates(1).first?.string
        }.joined(separator: "\n")
        DispatchQueue.main.async {
            self.detectedTextLabel.text = detectedText.isEmpty ? "未检测到文字" : detectedText
        }
    }
}
// MARK: - AVCaptureVideoDataOutputSampleBufferDelegate
extension LiveTextRecognitionViewController: AVCaptureVideoDataOutputSampleBufferDelegate {
    /// Called on the background video queue for every captured frame;
    /// runs the OCR request against the frame's pixel buffer.
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
        // Bug fix: the camera sensor delivers buffers in landscape-right
        // orientation. With `.up` a portrait UI feeds Vision sideways text
        // and recognition largely fails; `.right` is the correct mapping for
        // the back camera in portrait. NOTE(review): if device-rotation
        // support is added later, derive this from the current orientation.
        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:])
        do {
            try handler.perform([textRequest])
        } catch {
            print("文本识别请求处理失败: \(error)")
        }
    }
}
代码解析
AVCaptureSession:
AVCaptureSession 用于实时捕获摄像头视频流。
使用 AVCaptureVideoDataOutput 捕获每一帧视频并进行处理。
Vision Text Recognition:
配置 VNRecognizeTextRequest 以进行文字识别。
支持多语言识别:配置 recognitionLanguages 为 ["en-US", "zh-Hans"],同时支持中文和英文。
实时显示文本:
将识别到的文字拼接成字符串,通过 UILabel 动态更新到屏幕上。
线程处理:
使用 DispatchQueue 在后台处理视频帧。
通过 DispatchQueue.main.async 更新 UI,确保线程安全。