// Assumed imports (the original gist omits them); the client region is illustrative.
const { TextractClient, StartDocumentAnalysisCommand, GetDocumentAnalysisCommand } = require('@aws-sdk/client-textract');
const TextractPraser = require('./textractPraser'); // hypothetical path to the parser class defined further down in this gist
const textract = new TextractClient({ region: 'us-east-1' }); // assumed region

// Hypothetical wrapper class name; the original gist shows only the static methods.
module.exports = class TextractReader {
    // Start an asynchronous Textract analysis job for a document stored in S3
    // and return the JobId used to poll for results.
    static async startreadTextAsync(s3_file) {
        try {
            const params = {
                DocumentLocation: {
                    S3Object: {
                        Bucket: s3_file.Bucket,
                        Name: s3_file.Key,
                    },
                },
                // Textract only returns block types for the features requested,
                // so add 'TABLES' here if the table parsing below is needed.
                FeatureTypes: ['FORMS', 'QUERIES'],
                OutputConfig: {
                    S3Bucket: 'serial_number',
                    S3Prefix: 'output'
                },
                QueriesConfig: {
                    Queries: [
                        {
                            Text: 'what is serial number ?',
                            Alias: 'SERIAL_NUMBER',
                            Pages: ['1']
                        }
                    ]
                }
            };
            const command = new StartDocumentAnalysisCommand(params);
            const data = await textract.send(command);
            return data.JobId;
        } catch (error) {
            console.log('Error in Read Text Async', error);
            throw new Error(error);
        }
    }
    // Poll the analysis job. Once it has SUCCEEDED, page through every result
    // via NextToken, parse each response and collect both the parsed output
    // and the raw Textract blocks; otherwise just report the current status.
    static async getreadTextAsync(jobid) {
        try {
            let params = { JobId: jobid };
            const res = [];
            const raw_textract_data = [];
            let command = new GetDocumentAnalysisCommand(params);
            let document_text = await textract.send(command);
            raw_textract_data.push(document_text.Blocks);
            if (document_text.JobStatus === 'SUCCEEDED') {
                let temp_analyze = await TextractPraser.praseTextractData(document_text);
                res.push(temp_analyze);
                // Keep fetching while Textract reports another page of results.
                while (document_text.NextToken) {
                    params = { JobId: jobid, NextToken: document_text.NextToken };
                    command = new GetDocumentAnalysisCommand(params);
                    document_text = await textract.send(command);
                    raw_textract_data.push(document_text.Blocks);
                    temp_analyze = await TextractPraser.praseTextractData(document_text);
                    res.push(temp_analyze);
                }
                return { res, raw_textract_data, status: document_text.JobStatus };
            }
            return { status: document_text.JobStatus };
        } catch (error) {
            console.log('Error in Read Text Async', error);
            throw new Error(error);
        }
    }
};
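
// Hedged usage sketch (not in the original gist): a hypothetical caller script
// that starts the analysis job for an S3 object and polls getreadTextAsync
// until Textract is done. The require path, bucket/key names and the 5-second
// delay are assumptions for illustration only.
const TextractReader = require('./textractReader');

(async () => {
    const jobId = await TextractReader.startreadTextAsync({ Bucket: 'my-bucket', Key: 'invoice.pdf' });
    let result = await TextractReader.getreadTextAsync(jobId);
    while (result.status === 'IN_PROGRESS') {
        await new Promise((resolve) => setTimeout(resolve, 5000)); // wait before polling again
        result = await TextractReader.getreadTextAsync(jobId);
    }
    console.log(result.status, result.res);
})();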
// ── Parser module (a separate file in the original gist) ──
const { TextractDocument } = require("amazon-textract-response-parser");
const _ = require('lodash');

module.exports = class TextractPraser {
    // Turn one GetDocumentAnalysis response into key/value pairs, table rows,
    // raw text and query answers.
    static async praseTextractData(data) {
        const doc = new TextractDocument(data);
        const tableData = {};
        const textData = data['Blocks'];
        const queryresult = [];

        // Collect QUERY blocks (the questions) and QUERY_RESULT blocks (the
        // answers), then join them through the ANSWER relationship.
        const getQueries = () => {
            const query_block = {};
            const query_result_block = {};
            for (let index = 0; index < textData.length; index++) {
                const element = textData[index];
                if (element['BlockType'] == 'QUERY') {
                    if (element.Relationships) {
                        if (element.Relationships[0].Type == 'ANSWER') {
                            query_block[element.Id] = { rel: element.Relationships[0], alias: element.Query.Alias, question: element.Query.Text };
                        }
                    } else {
                        query_block[element.Id] = { rel: null, alias: element.Query.Alias, question: element.Query.Text };
                    }
                }
                if (element['BlockType'] == 'QUERY_RESULT') {
                    query_result_block[element.Id] = element['Text'];
                }
            }
            for (const property in query_block) {
                try {
                    // The ANSWER relationship carries an array of Ids; take the first one.
                    const answerId = query_block[property].rel.Ids[0];
                    if (query_result_block[answerId]) {
                        queryresult.push({ alias: query_block[property].alias, question: query_block[property].question, result: query_result_block[answerId] });
                    }
                } catch (error) {
                    // No answer relationship: record the query with an undefined result.
                    queryresult.push({ alias: query_block[property].alias, question: query_block[property].question, result: undefined });
                }
            }
        };
        try {
            getQueries();
        } catch (error) {
            console.log('error in query', error);
        }
        // Concatenate the WORD children of a block (KEY/VALUE/CELL) into a
        // single string; selected checkboxes are rendered as 'X '.
        const get_text = (result, block_map) => {
            let text = '';
            if (result['Relationships']) {
                for (const relationship of result['Relationships']) {
                    if (relationship['Type'] == 'CHILD') {
                        for (const child_id of relationship['Ids']) {
                            const word = block_map[child_id];
                            if (word) {
                                if (word['BlockType'] == 'WORD') {
                                    text = text + word['Text'] + ' ';
                                }
                                if (word['BlockType'] == 'SELECTION_ELEMENT' && word['SelectionStatus'] == 'SELECTED') {
                                    text += 'X ';
                                }
                            }
                        }
                    }
                }
            }
            return text;
        };
        // TABLE DATA LOGIC
        // Map a TABLE block to an array of rows; each cell's text is resolved
        // through its CELL block's CHILD relationships.
        function get_rows_columns_map(table_result, blocks_map) {
            const rows = {};
            const tempArry = {};
            for (const relationship of table_result['Relationships']) {
                if (relationship['Type'] === 'CHILD') {
                    for (const child_id of relationship['Ids']) {
                        const cell = blocks_map[child_id];
                        try {
                            if (cell['BlockType'] === 'CELL') {
                                const row_index = cell['RowIndex'];
                                const col_index = cell['ColumnIndex'];
                                if (!(row_index in rows)) {
                                    rows[row_index] = {};
                                    tempArry[row_index] = [];
                                }
                                tempArry[row_index][col_index] = get_text(cell, blocks_map);
                                rows[row_index][col_index] = get_text(cell, blocks_map);
                            }
                        } catch (error) {
                            // Ignore children that cannot be resolved to cells.
                        }
                    }
                }
            }
            // Drop the empty slots left by 1-based column indexes and return
            // the rows as a compact array of arrays.
            const compactArr = [];
            for (const item in tempArry) {
                compactArr.push(_.compact(tempArry[item]));
            }
            return compactArr;
        }
        // Collect TABLE blocks plus an Id→block map, then store each table's
        // rows in tableData keyed by table number.
        const get_table_csv_results = () => {
            const blocks_map = {};
            const table_blocks = [];
            for (let index = 0; index < textData.length; index++) {
                const block = textData[index];
                blocks_map[block['Id']] = block;
                if (block['BlockType'] == 'TABLE') {
                    table_blocks.push(block);
                }
            }
            if (table_blocks.length <= 0) {
                return '<b> No Table FOUND </b>';
            }
            for (const [index, element] of table_blocks.entries()) {
                generate_table_csv(element, blocks_map, index + 1);
            }
        };
        // Despite the name, this stores the extracted rows in tableData rather
        // than returning CSV text.
        const generate_table_csv = (table_result, blocks_map, table_index) => {
            tableData[table_index] = get_rows_columns_map(table_result, blocks_map);
        };
        get_table_csv_results();
        // KEY VALUE PAIR LOGIC
        // Split KEY_VALUE_SET blocks into key blocks and value blocks, and
        // build an Id→block map for resolving child words later.
        const get_kv_map = () => {
            const key_map = {};
            const value_map = {};
            const block_map = {};
            try {
                textData.forEach(element => {
                    const block_id = element['Id'];
                    block_map[block_id] = element;
                    if (element['BlockType'] == 'KEY_VALUE_SET') {
                        if (element['EntityTypes'].includes('KEY')) {
                            key_map[block_id] = element;
                        } else {
                            value_map[block_id] = element;
                        }
                    }
                });
            } catch (error) {
                // Keep whatever was collected so far if a block is malformed.
            }
            return { key_map, value_map, block_map };
        };
        // For every key block, find its VALUE block and turn both into text,
        // giving a plain { key: value } map.
        const get_kv_relationship = (key_map, value_map, block_map) => {
            const kvs = {};
            for (const key in key_map) {
                try {
                    if (Object.hasOwnProperty.call(key_map, key)) {
                        const element = key_map[key];
                        const value_block = find_value_block(element, value_map);
                        const key_1 = get_text(key_map[key], block_map);
                        kvs[key_1] = get_text(value_block, block_map);
                    }
                } catch (error) {
                    // Skip keys whose value block cannot be resolved.
                }
            }
            return kvs;
        };
        // Return the first value block linked to a key block via its VALUE relationship.
        const find_value_block = (key_block, value_map) => {
            for (const relationship of key_block['Relationships']) {
                if (relationship['Type'] == 'VALUE') {
                    for (const value_id of relationship['Ids']) {
                        return value_map[value_id];
                    }
                }
            }
        };
        const { key_map, value_map, block_map } = get_kv_map();
        const kvData = get_kv_relationship(key_map, value_map, block_map);
        // Concatenate every word of every line, page by page, as raw text
        // (in Textract's output order).
        let rawData = '';
        for (const page of doc.iterPages()) {
            for (const line of page.iterLines()) {
                for (const word of line.iterWords()) {
                    rawData = rawData + ' ' + word.text;
                }
            }
        }
        return { kvData, tableData, rawData, queryresult };
    }
};
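
// Hedged usage note (not in the original gist): praseTextractData resolves to
// { kvData, tableData, rawData, queryresult }. A small helper like this could
// pull one query answer out of queryresult; the helper name is illustrative.
const getQueryAnswer = (parsed, alias) => {
    const match = parsed.queryresult.find((q) => q.alias === alias);
    return match ? match.result : undefined; // undefined when Textract returned no answer
};
// e.g. getQueryAnswer(await TextractPraser.praseTextractData(document_text), 'SERIAL_NUMBER')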