Read Table in Document
This code example demonstrates how to use the IronTesseract OCR engine to extract text and table data from a PDF document.
- An instance of the IronTesseract OCR engine is created.
- An `OcrInput` object is initialized, and a PDF file ("table.pdf") is loaded using the `LoadPdf` method.
- The OCR engine processes the document using the `ReadDocumentAdvanced` method, which returns a more detailed `OcrResult` object.
- The first table found in the document is accessed using `result.Tables.First()`, and the cell information for that table is extracted with `CellInfos`.
- The list of cell data (`cellList`) now contains the table's cells, including the text content and other details (e.g., cell position, size).
- This method is useful for extracting structured data like tables from PDFs, allowing the text within each table cell to be programmatically accessed and processed.
using IronOcr; // Include the IronOcr namespace for OCR functionality
using System.Linq; // Include System.Linq for using LINQ methods
class Program
{
    /// <summary>
    /// Runs OCR over "table.pdf" and prints the text, bounds and size of every
    /// cell in the first detected table. Prints nothing when no table is found.
    /// </summary>
    static void Main()
    {
        var engine = new IronTesseract();

        // The using declaration disposes the input when Main exits,
        // including via the early return below.
        using var pdfInput = new OcrInput();
        pdfInput.LoadPdf("table.pdf");

        var ocrResult = engine.ReadDocumentAdvanced(pdfInput);

        // Guard: First() would throw on an empty sequence, so bail out early.
        if (!ocrResult.Tables.Any())
        {
            return;
        }

        // Report every cell of the first detected table.
        foreach (var cell in ocrResult.Tables.First().CellInfos)
        {
            Console.WriteLine($"Cell Text: {cell.Text}");
            Console.WriteLine($"Position: {cell.Bounds}");
            Console.WriteLine($"Size: {cell.Size}");
        }
    }
}
using IronOcr; // Include the IronOcr namespace for OCR functionality
using System.Linq; // Include System.Linq for using LINQ methods
class Program
{
    /// <summary>
    /// Runs OCR over "table.pdf" and prints the text, bounds and size of every
    /// cell in the first detected table. Prints nothing when no table is found.
    /// </summary>
    static void Main()
    {
        var engine = new IronTesseract();

        // The using declaration disposes the input when Main exits,
        // including via the early return below.
        using var pdfInput = new OcrInput();
        pdfInput.LoadPdf("table.pdf");

        var ocrResult = engine.ReadDocumentAdvanced(pdfInput);

        // Guard: First() would throw on an empty sequence, so bail out early.
        if (!ocrResult.Tables.Any())
        {
            return;
        }

        // Report every cell of the first detected table.
        foreach (var cell in ocrResult.Tables.First().CellInfos)
        {
            Console.WriteLine($"Cell Text: {cell.Text}");
            Console.WriteLine($"Position: {cell.Bounds}");
            Console.WriteLine($"Size: {cell.Size}");
        }
    }
}
Imports IronOcr ' Include the IronOcr namespace for OCR functionality
Imports System.Linq ' Include System.Linq for using LINQ methods
Friend Class Program
	''' <summary>
	''' Runs OCR over "table.pdf" and prints the text, bounds and size of every
	''' cell in the first detected table. Prints nothing when no table is found.
	''' </summary>
	Shared Sub Main()
		' Create an instance of the IronTesseract OCR engine
		Dim Ocr = New IronTesseract()
		' Initialize an OcrInput object inside a Using block so it is disposed
		' when the block exits, even if loading or OCR throws
		Using Input As New OcrInput()
			' Load the PDF file
			Input.LoadPdf("table.pdf")
			' Process the document to obtain a detailed OcrResult object
			Dim Result = Ocr.ReadDocumentAdvanced(Input)
			' Access the first table found in the document
			If Result.Tables.Any() Then ' Ensure there's at least one table detected
				Dim firstTable = Result.Tables.First()
				' Extract the cell information from the first table
				Dim cellList = firstTable.CellInfos
				' Iterate over each cell and display its text content and details
				For Each cell In cellList
					Console.WriteLine($"Cell Text: {cell.Text}")
					Console.WriteLine($"Position: {cell.Bounds}")
					Console.WriteLine($"Size: {cell.Size}")
				Next cell
			End If
		End Using
	End Sub
End Class