Skip to content

Instantly share code, notes, and snippets.

@ChristianSchwarz
Last active June 5, 2019 12:40
Show Gist options
  • Select an option

  • Save ChristianSchwarz/8e142f77298dbf4113df05d215533d93 to your computer and use it in GitHub Desktop.

Select an option

Save ChristianSchwarz/8e142f77298dbf4113df05d215533d93 to your computer and use it in GitHub Desktop.
import net.sourceforge.tess4j.Tesseract
import net.sourceforge.tess4j.util.PdfUtilities
import java.io.File
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.Executors
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger
/**
 * Converts a PDF into one PNG per page, OCRs the pages in parallel with Tesseract
 * (language "rus") and prints the recognized text in page order.
 *
 * While pages complete, a progress percentage is printed. A page whose OCR fails is
 * reported on stderr and left out of the final output; the other pages are still printed.
 *
 * @param args command-line arguments (unused; the PDF and tessdata paths are hard-coded).
 */
fun main(args: Array<String>) {
    // NOTE(review): convertPdf2Png presumably leaves the generated PNG files on disk;
    // confirm whether they should be deleted once OCR has finished.
    val files = PdfUtilities.convertPdf2Png(File("""D:\git-p\pdf-ocr\src\main\kotlin\11437.pdf"""))
    // Loop-invariant, so resolve it once instead of once per page.
    val tessDataPath = File("""C:\Program Files\Tesseract-OCR\tessdata""").absolutePath
    // Keyed by page index because tasks finish in arbitrary order.
    val pages = ConcurrentHashMap<Int, String>()
    val executor = Executors.newFixedThreadPool(8)
    val done = AtomicInteger()

    try {
        files.forEachIndexed { index, file ->
            executor.submit(Runnable {
                try {
                    // One Tesseract instance per task; presumably instances must not be
                    // shared between threads — confirm against the tess4j documentation.
                    val tesseract = Tesseract()
                    tesseract.setDatapath(tessDataPath)
                    tesseract.setLanguage("rus")
                    //tesseract.setHocr(true)
                    // NOTE(review): Tesseract documents page segmentation mode 2 as
                    // "automatic page segmentation, but no OSD, or OCR" — confirm this
                    // is the intended mode.
                    tesseract.setPageSegMode(2)
                    // Re-join words that were hyphenated across a line break.
                    pages[index] = tesseract.doOCR(file).replace("-\n", "")
                } catch (e: Throwable) {
                    // Anything thrown here would otherwise be captured by the discarded
                    // Future and lost, so report it explicitly. Throwable (not Exception)
                    // so that a missing native library is reported as well.
                    System.err.println("OCR failed for page $index (${file.name}): $e")
                }
                // Integer arithmetic: the floating-point form (100.0 / size) * n can
                // truncate to 99 on the last page for some page counts.
                val completed = done.incrementAndGet()
                println(" ${completed * 100 / files.size}%")
            })
        }
    } finally {
        // Always stop accepting work so the pool's non-daemon threads cannot keep the
        // JVM alive if submission fails part-way through.
        executor.shutdown()
    }

    if (!executor.awaitTermination(1, TimeUnit.DAYS)) {
        System.err.println("Timed out waiting for OCR tasks; output may be incomplete")
    }

    for ((page, text) in pages.toSortedMap()) {
        // Header and text are printed separately: running trimMargin() over the OCR
        // text would strip a leading '|' from any recognized line.
        println("----------------------------------- $page ------------------------------------")
        println(text.trimEnd())
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment