Skip to content

Instantly share code, notes, and snippets.

@sankalpdeveloper
Last active August 17, 2022 06:06
Show Gist options
  • Select an option

  • Save sankalpdeveloper/948b10b09493a7c12067b5941bf8d7ba to your computer and use it in GitHub Desktop.

Select an option

Save sankalpdeveloper/948b10b09493a7c12067b5941bf8d7ba to your computer and use it in GitHub Desktop.
static async startreadTextAsync(s3_file) {
try {
var params = {
DocumentLocation: {
S3Object: {
Bucket: s3_file.Bucket,
Name: s3_file.Key,
},
},
FeatureTypes: [
'FORMS', 'QUERIES'
],
OutputConfig: {
S3Bucket: 'serial_number',
S3Prefix: 'output'
},
QueriesConfig: {
"Queries": [
{
"Text": "what is serial number ?",
"Alias": "SERIAL_NUMBER",
"Pages": [
"1"
]
}
]
}
}
const command = new StartDocumentAnalysisCommand(params);
const data = await textract.send(command);
const JobID = data.JobId
return JobID
} catch (error) {
console.log("Error in Read Text Async", error);
throw new Error(error)
}
}
static async getreadTextAsync(jobid) {
try {
var params = {
JobId: jobid,
}
var res = []
var document_text
var raw_textract_data = []
var loopCtrl = true
let command = new GetDocumentAnalysisCommand(params);
document_text = await textract.send(command);
raw_textract_data.push(document_text.Blocks)
if (document_text.JobStatus === 'SUCCEEDED') {
var temp_anylyze = await praseTextractData(document_text);
res.push(temp_anylyze)
while (loopCtrl) {
if (document_text.JobStatus === 'SUCCEEDED') {
if (document_text.NextToken) {
var params = {
JobId: jobid,
NextToken: document_text.NextToken
}
let command = new GetDocumentAnalysisCommand(params);
document_text = await textract.send(command);
raw_textract_data.push(document_text.Blocks)
temp_anylyze = await praseTextractData(document_text);
res.push(temp_anylyze)
} else {
loopCtrl = false
}
}
}
return { res, raw_textract_data, status: document_text.JobStatus }
}
return { status: document_text.JobStatus }
} catch (error) {
console.log("Error in Read Text Async", error);
throw new Error(error)
}
}
const { TextractDocument } = require("amazon-textract-response-parser");
const _ = require('lodash')
module.exports = class TextractPraser {
static async praseTextractData(data) {
const doc = new TextractDocument(data);
var tableData = {}
var textData = data['Blocks']
var queryresult = []
const getQueries = () => {
var query_block = {}
var query_result_block = {}
for (let index = 0; index < textData.length; index++) {
const element = textData[index];
if (element['BlockType'] == "QUERY") {
if (element.Relationships) {
if (element.Relationships[0].Type == 'ANSWER') {
query_block[element.Id] = { rel: element.Relationships[0], alias: element.Query.Alias, question: element.Query.Text, }
}
} else {
query_block[element.Id] = { rel: null, alias: element.Query.Alias,question: element.Query.Text }
}
}
if (element['BlockType'] == "QUERY_RESULT") {
query_result_block[element.Id] = element['Text']
}
}
for (const property in query_block) {
try {
if (query_result_block[query_block[property].rel.Ids]) {
let result = { alias: query_block[property].alias, question: query_block[property].question, result: query_result_block[query_block[property].rel.Ids] }
queryresult.push(result)
}
} catch (error) {
let result = { alias: query_block[property].alias, question: query_block[property].question, result: undefined }
queryresult.push(result)
}
}
}
try {
getQueries()
} catch (error) {
console.log("error in query", error);
}
const get_text = (result, block_map) => {
let text = ''
if (result['Relationships']) {
for (const relationship of result['Relationships']) {
if (relationship['Type'] == 'CHILD') {
for (const child_id of relationship['Ids']) {
let word = block_map[child_id]
if (word) {
if (word['BlockType'] == 'WORD') {
text = text + word['Text'] + ' '
}
if (word['BlockType'] == 'SELECTION_ELEMENT') {
if (word['SelectionStatus'] == 'SELECTED') {
text += 'X '
}
}
}
}
}
}
}
return text
}
// TABLE DATA LOGIC
var _pj;
function _pj_snippets(container) {
function in_es6(left, right) {
if (right instanceof Array || typeof right === "string") {
return right.indexOf(left) > -1;
} else {
if (right instanceof Map || right instanceof Set || right instanceof WeakMap || right instanceof WeakSet) {
return right.has(left);
} else {
return left in right;
}
}
}
container["in_es6"] = in_es6;
return container;
}
_pj = {};
_pj_snippets(_pj);
function get_rows_columns_map(table_result, blocks_map) {
var cell, col_index, row_index, rows;
rows = {};
var tempArry = {}
for (var relationship, _pj_c = 0, _pj_a = table_result["Relationships"], _pj_b = _pj_a.length; _pj_c < _pj_b; _pj_c += 1) {
relationship = _pj_a[_pj_c];
if (relationship["Type"] === "CHILD") {
for (var child_id, _pj_f = 0, _pj_d = relationship["Ids"], _pj_e = _pj_d.length; _pj_f < _pj_e; _pj_f += 1) {
child_id = _pj_d[_pj_f];
cell = blocks_map[child_id];
try {
if (cell["BlockType"] === "CELL") {
row_index = cell["RowIndex"];
col_index = cell["ColumnIndex"];
if (!_pj.in_es6(row_index, rows)) {
// console.log("Error", row_index);
rows[row_index] = {};
tempArry[row_index] = []
}
// console.log("row_index", row_index);
// console.log("col_index", col_index);
// console.log("get_text", get_text(cell, blocks_map));
try {
tempArry[row_index][col_index] = get_text(cell, blocks_map)
} catch (error) {
}
rows[row_index][col_index] = get_text(cell, blocks_map);
}
} catch (error) {
// console.log("Error", error);
}
}
}
}
// console.log("filtered", filtered);
// console.log("tempArry", tempArry);
let compactArr = []
for (const item in tempArry) {
const filtered = _.compact(tempArry[item]);
compactArr.push(filtered)
}
return compactArr;
}
const get_table_csv_results = () => {
let blocks_map = {}
let table_blocks = []
for (let index = 0; index < textData.length; index++) {
try {
let block = textData[index]
blocks_map[block['Id']] = block
if (block['BlockType'] == "TABLE") {
table_blocks.push(block)
}
} catch (error) {
// console.log("Error", error);
}
}
if (table_blocks.length <= 0) {
return "<b> No Table FOUND </b>"
}
let csv = ''
for (const [index, element] of table_blocks.entries()) {
csv += generate_table_csv(element, blocks_map, index + 1)
csv += '\n\n'
}
return csv
}
const generate_table_csv = (table_result, blocks_map, table_index) => {
let rows = get_rows_columns_map(table_result, blocks_map)
// console.log("rows", rows);
tableData[table_index] = rows
}
get_table_csv_results()
// KEY VALUE PAIR LOGIC
const get_kv_map = () => {
try {
var key_map = {}
var value_map = {}
var block_map = {}
textData.forEach(element => {
let block_id = element['Id']
block_map[block_id] = element
if (element['BlockType'] == "KEY_VALUE_SET") {
if (element['EntityTypes'] == "KEY") {
key_map[block_id] = element
} else {
value_map[block_id] = element
}
}
});
} catch (error) {
// console.log("Error", error);
}
return { key_map, value_map, block_map }
}
const get_kv_relationship = (key_map, value_map, block_map) => {
let kvs = {}
for (const key in key_map) {
try {
if (Object.hasOwnProperty.call(key_map, key)) {
const element = key_map[key];
let value_block = find_value_block(element, value_map)
let key_1 = get_text(key_map[key], block_map)
let val = get_text(value_block, block_map)
kvs[key_1] = val
// console.log("value_block", value_block);
}
} catch (error) {
// console.log("Error", error);
}
}
return kvs
}
const find_value_block = (key_block, value_map) => {
for (const relationship of key_block['Relationships']) {
if (relationship['Type'] == 'VALUE') {
for (const value_id of relationship['Ids']) {
var value_block = value_map[value_id]
return value_block
}
}
}
}
const { key_map, value_map, block_map } = get_kv_map()
const kvData = get_kv_relationship(key_map, value_map, block_map)
var rawData = ''
for (const page of doc.iterPages()) {
// (In Textract's output order...)
for (const line of page.iterLines()) {
for (const word of line.iterWords()) {
rawData = rawData + " " + word.text
}
}
}
return { kvData, tableData, rawData, queryresult }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment