diff --git a/README.md b/README.md index 4f676cf..650869f 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,85 @@ # windows_media_ocr_cli -> 🔎 OCR CLI that outputs structured data with bounding rects using local Windows OCR API + +> 🔎 Fast OCR CLI for Windows: outputs structured data (bounding rects, text) using the local Windows OCR API ![image](https://github.com/user-attachments/assets/3a832c94-5030-41d8-9454-6869ec7cfcc1) -## How to install +--- + +## Features + +- OCR image files or image data from stdin +- Outputs results as JSON (with bounding boxes) or plain text +- Supports multiple languages (default: en-US) +- Fast, local processing (no cloud required) +- Simple CLI interface + +## Requirements + +- Windows 10/11 +- .NET Framework 4.8.1 or later +- Windows OCR API support (built-in on modern Windows) + +## Installation Download the latest executable from [Releases](https://github.com/Akronae/windows_media_ocr_cli/releases) -## How to use +Or build from source: -```bash +```sh +git clone https://github.com/Akronae/windows_media_ocr_cli.git +cd windows_media_ocr_cli +dotnet build +``` + +## Usage + +### Basic examples + +```sh +# OCR from file windows_media_ocr_cli.exe --file image.png + +# OCR from stdin (pipe image data) +type image.png | windows_media_ocr_cli.exe --stdin ``` -To see all options -```bash -windows_media_ocr_cli.exe -h + +### All options + +```sh +windows_media_ocr_cli.exe --file <path> [--language <lang>] [--mode <json|text>] +windows_media_ocr_cli.exe --stdin [--language <lang>] [--mode <json|text>] ``` + +| Option | Description | Default | +| ---------- | ------------------------------------------------- | ------- | +| --file | Path to image file | | +| --stdin | Read image data from stdin | false | +| --language | OCR language (e.g. en-US, fr-FR, zh-CN) | en-US | +| --mode | Output format: json (with bounding boxes) or text | json | + +### Output formats + +- **json**: Full OCR result, including bounding rectangles and lines/words. 
+- **text**: Plain text output (lines joined, no structure). + +## Troubleshooting + +- Make sure you are running on Windows 10/11 with .NET Framework 4.8.1+ installed. +- If you see errors about missing OCR API, update your Windows system. +- For large images, prefer file input over stdin for performance. + +## FAQ + +**Q: Can I use this on Linux or macOS?** +A: No, this tool relies on the Windows OCR API. + +**Q: How do I specify a different language?** +A: Use `--language <lang>`, e.g. `--language fr-FR`. + +**Q: What image formats are supported?** +A: Any format supported by Windows Imaging APIs (PNG, JPEG, BMP, etc.). + +## License + +MIT diff --git a/src/Program.cs b/src/Program.cs index f5888fb..1dd5835 100644 --- a/src/Program.cs +++ b/src/Program.cs @@ -1,23 +1,28 @@ using System; +using System.CommandLine; +using System.CommandLine.Completions; using System.IO; +using System.Runtime.InteropServices.WindowsRuntime; using System.Text; +using System.Text.Json; using System.Threading.Tasks; +using Windows.Globalization; using Windows.Graphics.Imaging; +using Windows.Media.Ocr; using Windows.Storage; using Windows.Storage.Streams; -using System.Text.Json; -using Windows.Media.Ocr; -using Windows.Globalization; -using System.CommandLine; -using System.CommandLine.Completions; - var fileOption = new Option( name: "--file", description: "The file to read and display on the console." 
-) { - IsRequired = true -}; +); + +var stdinOption = new Option( + name: "--stdin", + description: "Read image data from stdin.", + getDefaultValue: () => false +); + var languageOption = new Option( name: "--language", description: "The language that should be used during OCR.", @@ -29,21 +34,59 @@ getDefaultValue: () => OcrOutputMode.json ); - var rootCommand = new RootCommand("Start an OCR analysis using Windows local OcrEngine.") { fileOption, + stdinOption, languageOption, - modeOption + modeOption, }; -rootCommand.SetHandler(Handler, fileOption, languageOption, modeOption); -return await rootCommand.InvokeAsync(args); +rootCommand.AddValidator(cmdResult => +{ + var file = cmdResult.GetValueForOption(fileOption); + var stdin = cmdResult.GetValueForOption(stdinOption); + if (string.IsNullOrEmpty(file) && !stdin) + { + cmdResult.ErrorMessage = "Either --file or --stdin must be provided."; + } +}); +rootCommand.SetHandler(Handler, fileOption, stdinOption, languageOption, modeOption); + +return await rootCommand.InvokeAsync(args); -static async Task Handler(string filepath, string language, OcrOutputMode mode) +static async Task Handler(string filepath, bool useStdin, string language, OcrOutputMode mode) { - var result = await RecognizeAsync(filepath, language); + OcrResult result; + + if (useStdin) + { + using var memoryStream = new MemoryStream(); + await Console.OpenStandardInput().CopyToAsync(memoryStream); + memoryStream.Position = 0; + + using var randomAccessStream = new InMemoryRandomAccessStream(); + await randomAccessStream.WriteAsync(memoryStream.ToArray().AsBuffer()); + randomAccessStream.Seek(0); + + result = await RecognizeAsync(randomAccessStream, language); + } + else if (!string.IsNullOrEmpty(filepath)) + { + var path = Path.GetFullPath(filepath); + var storageFile = await StorageFile.GetFileFromPathAsync(path); + using var randomAccessStream = await storageFile.OpenReadAsync(); + result = await RecognizeAsync(randomAccessStream, language); + } + 
else + { + // This should be unreachable due to command-line validation + throw new InvalidOperationException( + "Unreachable code: either --file or --stdin must be provided." + ); + } + var txt = ""; if (mode == OcrOutputMode.json) @@ -74,12 +117,8 @@ static async Task Handler(string filepath, string language, OcrOutputMode mode) Console.WriteLine(txt); } - -static async Task RecognizeAsync(string filepath, string language) +static async Task RecognizeAsync(IRandomAccessStream randomAccessStream, string language) { - var path = Path.GetFullPath(filepath); - var storageFile = await StorageFile.GetFileFromPathAsync(path); - using var randomAccessStream = await storageFile.OpenReadAsync(); var decoder = await BitmapDecoder.CreateAsync(randomAccessStream); using var softwareBitmap = await decoder.GetSoftwareBitmapAsync( BitmapPixelFormat.Bgra8, @@ -107,5 +146,5 @@ static async Task RecognizeAsync(string filepath, string language) enum OcrOutputMode { json, - text + text, } diff --git a/windows_media_ocr_cli.csproj b/windows_media_ocr_cli.csproj index 5abcb91..b6a9995 100644 --- a/windows_media_ocr_cli.csproj +++ b/windows_media_ocr_cli.csproj @@ -8,7 +8,6 @@ 0.0.1 true -