Skip to content

Instantly share code, notes, and snippets.

@mseri
Created June 18, 2025 22:03
Show Gist options
  • Select an option

  • Save mseri/7427b254f4d184e5f901699469bd51eb to your computer and use it in GitHub Desktop.

Select an option

Save mseri/7427b254f4d184e5f901699469bd51eb to your computer and use it in GitHub Desktop.
Small Swift script to extract text from images (using Apple's Vision framework)
#!/usr/bin/swift
// Started from the code in https://terminalbytes.com/iphone-8-solar-powered-vision-ocr-server/
// Edited from Mistral generated code: https://chat.mistral.ai/chat/563cacdf-6def-49e4-9df6-ee8e263978c5
import AppKit
import CoreGraphics
import Foundation
import SwiftUI
import Vision
/// Synchronously performs OCR on the image at `imagePath` using Apple's Vision framework.
///
/// - Parameter imagePath: Filesystem path to an image in any format ImageIO can read.
/// - Returns: The recognized text, one observation per line joined with "\n",
///   or `nil` if the image could not be loaded or recognition failed.
func processImageSync(imagePath: String) -> String? {
    let fileURL = URL(fileURLWithPath: imagePath)
    // Load the first frame via ImageIO; bail out on unreadable or corrupt files.
    guard let imageSource = CGImageSourceCreateWithURL(fileURL as CFURL, nil),
        let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
    else {
        return nil
    }
    // Vision invokes the completion handler asynchronously; block on a
    // semaphore so this function can stay synchronous for CLI use.
    let semaphore = DispatchSemaphore(value: 0)
    var recognizedText: String?
    let request = VNRecognizeTextRequest { request, error in
        defer { semaphore.signal() }
        guard error == nil,
            let observations = request.results as? [VNRecognizedTextObservation]
        else {
            return
        }
        recognizedText = observations.compactMap { observation in
            observation.topCandidates(1).first?.string
        }.joined(separator: "\n")
    }
    request.recognitionLevel = .accurate
    request.usesLanguageCorrection = true
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    do {
        try handler.perform([request])
    } catch {
        // BUG FIX: the original `try?` swallowed the error. If `perform` throws,
        // the completion handler may never run, and `semaphore.wait()` below
        // would deadlock forever. Report the failure and return instead.
        FileHandle.standardError.write(Data("ERROR: text recognition failed: \(error)\n".utf8))
        return nil
    }
    semaphore.wait()
    return recognizedText
}
/// Main GUI: a drag-and-drop target for an image, an editable text area showing
/// the OCR result, and a row of Clear / Copy / Close actions.
struct ContentView: View {
    // Dropped image, shown as a preview; nil until the first drop.
    @State private var image: NSImage? = nil
    // OCR output (also user-editable in the TextEditor).
    @State private var recognizedText = ""

    var body: some View {
        VStack(spacing: 20) {
            ZStack {
                RoundedRectangle(cornerRadius: 10)
                    .stroke(Color.gray, lineWidth: 2)
                    .frame(width: 300, height: 200)
                    .overlay(
                        Text("Drag and Drop Image Here")
                            .foregroundColor(.gray)
                    )
                // BUG FIX: replaced `if image != nil { Image(nsImage: image!) }`
                // with a safe optional binding — no force unwrap.
                if let image = image {
                    Image(nsImage: image)
                        .resizable()
                        .scaledToFit()
                        .frame(width: 300, height: 200)
                }
            }
            .onDrop(of: ["public.file-url"], isTargeted: nil) { providers -> Bool in
                providers.first?.loadItem(forTypeIdentifier: "public.file-url", options: nil) {
                    (data, error) in
                    // loadItem calls back off the main thread; hop back before
                    // touching @State.
                    DispatchQueue.main.async {
                        if let data = data as? Data,
                            let url = URL(dataRepresentation: data, relativeTo: nil, isAbsolute: true),
                            let loadedImage = NSImage(contentsOf: url)
                        {
                            self.image = loadedImage
                            recognizeText(from: loadedImage)
                        }
                    }
                }
                return true
            }
            TextEditor(text: $recognizedText)
                .frame(
                    minWidth: 300, idealWidth: 300, maxWidth: .infinity,
                    minHeight: 200, idealHeight: 200, maxHeight: .infinity, alignment: .center
                )
                .border(Color.gray, width: 1)
            HStack(spacing: 20) {
                actionButton("Clear", background: Color.blue, foreground: .white) {
                    image = nil
                    recognizedText = ""
                }
                actionButton("Copy Text", background: Color.green, foreground: .black) {
                    copyToClipboard(text: recognizedText)
                }
                actionButton("Close App", background: Color.red, foreground: .white) {
                    NSApplication.shared.terminate(nil)
                }
            }
        }
        .padding()
    }

    /// Shared pill-style button used by the action row (deduplicates the three
    /// previously copy-pasted button definitions).
    private func actionButton(
        _ title: String, background: Color, foreground: Color,
        action: @escaping () -> Void
    ) -> some View {
        Button(action: action) {
            Text(title)
                .padding()
                .frame(maxWidth: .infinity)
                .background(background)
                .foregroundColor(foreground)
                .cornerRadius(8)
        }
        .buttonStyle(PlainButtonStyle()) // Ensure the entire button area is clickable
    }

    /// Runs Vision OCR on `nsImage` and publishes the result (or a diagnostic
    /// message) to `recognizedText` on the main queue.
    func recognizeText(from nsImage: NSImage) {
        guard let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            recognizedText = "ERROR: Failed to process image."
            return
        }
        let request = VNRecognizeTextRequest { request, error in
            guard let observations = request.results as? [VNRecognizedTextObservation] else {
                DispatchQueue.main.async {
                    self.recognizedText = "WARNING: No text recognized."
                }
                return
            }
            let text = observations.compactMap { observation in
                observation.topCandidates(1).first?.string
            }.joined(separator: "\n")
            // Completion handler runs off the main thread; update @State on main.
            DispatchQueue.main.async {
                self.recognizedText = text
            }
        }
        request.recognitionLevel = .accurate
        request.usesLanguageCorrection = true
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        do {
            try handler.perform([request])
        } catch {
            // BUG FIX: the original `try?` silently dropped the error, leaving
            // the text area blank with no explanation. Surface it to the user.
            recognizedText = "ERROR: Text recognition failed: \(error.localizedDescription)"
        }
    }

    /// Replaces the general pasteboard's contents with `text`.
    func copyToClipboard(text: String) {
        let pasteboard = NSPasteboard.general
        pasteboard.clearContents()
        pasteboard.setString(text, forType: .string)
    }
}
/// SwiftUI application entry point, used when the script is launched with `--gui`.
struct GUIApp: App {
    init() {
        // Bring the freshly launched process to the foreground so the window
        // appears on top instead of behind the terminal.
        NSApplication.shared.activate(ignoringOtherApps: true)
    }

    var body: some Scene {
        WindowGroup {
            ContentView()
                .onDisappear {
                    // The user closed the only window: quit instead of
                    // lingering as a windowless process.
                    NSApplication.shared.terminate(nil)
                }
        }
    }
}
/// Command-line entry point: reads an image path from argv, runs OCR on it,
/// and prints the recognized text to stdout.
///
/// Exits with status 1 (diagnostics on stderr) on misuse or recognition failure.
func runCommandLine() {
    guard CommandLine.arguments.count > 1 else {
        // BUG FIX: misuse used to print the usage banner to stdout and exit 0,
        // so scripts could not detect the error. Report on stderr and fail.
        FileHandle.standardError.write(
            Data("Usage: swift TextRecognizer.swift <image-path> or swift TextRecognizer.swift --gui\n".utf8))
        exit(1)
    }
    let imagePath = CommandLine.arguments[1]
    if let recognizedText = processImageSync(imagePath: imagePath) {
        print(recognizedText)
    } else {
        // BUG FIX: diagnostics belong on stderr, not mixed into the OCR output.
        FileHandle.standardError.write(Data("ERROR\n".utf8))
        exit(1)
    }
}
// Mode dispatch: `--gui` anywhere in argv launches the SwiftUI app;
// otherwise run in one-shot command-line mode.
let wantsGUI = CommandLine.arguments.contains("--gui")
if wantsGUI {
    GUIApp.main()
} else {
    runCommandLine()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment