Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 74 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,85 @@
# windows_media_ocr_cli
> 🔎 OCR CLI that outputs structured data with bounding rects using local Windows OCR API

> 🔎 Fast OCR CLI for Windows: outputs structured data (bounding rects, text) using the local Windows OCR API

![image](https://github.com/user-attachments/assets/3a832c94-5030-41d8-9454-6869ec7cfcc1)

## How to install
---

## Features

- OCR image files or image data from stdin
- Outputs results as JSON (with bounding boxes) or plain text
- Supports multiple languages (default: en-US)
- Fast, local processing (no cloud required)
- Simple CLI interface

## Requirements

- Windows 10/11
- .NET Framework 4.8.1 or later
- Windows OCR API support (built-in on modern Windows)

## Installation

Download the latest executable from [Releases](https://github.com/Akronae/windows_media_ocr_cli/releases)

## How to use
Or build from source:

```sh
git clone https://github.com/Akronae/windows_media_ocr_cli.git
cd windows_media_ocr_cli
dotnet build
```

## Usage

### Basic examples

```sh
# OCR from file
windows_media_ocr_cli.exe --file image.png

# OCR from stdin (pipe image data)
type image.png | windows_media_ocr_cli.exe --stdin
```
To see all options:
```bash
windows_media_ocr_cli.exe -h
```

### All options

```sh
windows_media_ocr_cli.exe --file <image> [--language <lang>] [--mode <json|text>]
windows_media_ocr_cli.exe --stdin [--language <lang>] [--mode <json|text>]
```

| Option | Description | Default |
| ---------- | ------------------------------------------------- | ------- |
| --file | Path to image file | |
| --stdin | Read image data from stdin | false |
| --language | OCR language (e.g. en-US, fr-FR, zh-CN) | en-US |
| --mode | Output format: json (with bounding boxes) or text | json |

### Output formats

- **json**: Full OCR result, including bounding rectangles and lines/words.
- **text**: Plain text output (lines joined, no structure).

## Troubleshooting

- Make sure you are running on Windows 10/11 with .NET Framework 4.8.1+ installed.
- If you see errors about missing OCR API, update your Windows system.
- For large images, prefer file input over stdin for performance.

## FAQ

**Q: Can I use this on Linux or macOS?**
A: No, this tool relies on the Windows OCR API.

**Q: How do I specify a different language?**
A: Use `--language <lang>`, e.g. `--language fr-FR`.

**Q: What image formats are supported?**
A: Any format supported by Windows Imaging APIs (PNG, JPEG, BMP, etc.).

## License

MIT
81 changes: 60 additions & 21 deletions src/Program.cs
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
using System;
using System.CommandLine;
using System.CommandLine.Completions;
using System.IO;
using System.Runtime.InteropServices.WindowsRuntime;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using Windows.Globalization;
using Windows.Graphics.Imaging;
using Windows.Media.Ocr;
using Windows.Storage;
using Windows.Storage.Streams;
using System.Text.Json;
using Windows.Media.Ocr;
using Windows.Globalization;
using System.CommandLine;
using System.CommandLine.Completions;


var fileOption = new Option<string>(
name: "--file",
description: "The file to read and display on the console."
) {
IsRequired = true
};
);

var stdinOption = new Option<bool>(
name: "--stdin",
description: "Read image data from stdin.",
getDefaultValue: () => false
);

var languageOption = new Option<string>(
name: "--language",
description: "The language that should be used during OCR.",
Expand All @@ -29,21 +34,59 @@
getDefaultValue: () => OcrOutputMode.json
);


var rootCommand = new RootCommand("Start an OCR analysis using Windows local OcrEngine.")
{
fileOption,
stdinOption,
languageOption,
modeOption
modeOption,
};
rootCommand.SetHandler(Handler, fileOption, languageOption, modeOption);

return await rootCommand.InvokeAsync(args);
rootCommand.AddValidator(cmdResult =>
{
var file = cmdResult.GetValueForOption(fileOption);
var stdin = cmdResult.GetValueForOption(stdinOption);
if (string.IsNullOrEmpty(file) && !stdin)
{
cmdResult.ErrorMessage = "Either --file or --stdin must be provided.";
}
});

rootCommand.SetHandler(Handler, fileOption, stdinOption, languageOption, modeOption);

return await rootCommand.InvokeAsync(args);

static async Task Handler(string filepath, string language, OcrOutputMode mode)
static async Task Handler(string filepath, bool useStdin, string language, OcrOutputMode mode)
{
var result = await RecognizeAsync(filepath, language);
OcrResult result;

if (useStdin)
{
using var memoryStream = new MemoryStream();
await Console.OpenStandardInput().CopyToAsync(memoryStream);
memoryStream.Position = 0;

using var randomAccessStream = new InMemoryRandomAccessStream();
await randomAccessStream.WriteAsync(memoryStream.ToArray().AsBuffer());
randomAccessStream.Seek(0);

result = await RecognizeAsync(randomAccessStream, language);
}
else if (!string.IsNullOrEmpty(filepath))
{
var path = Path.GetFullPath(filepath);
var storageFile = await StorageFile.GetFileFromPathAsync(path);
using var randomAccessStream = await storageFile.OpenReadAsync();
result = await RecognizeAsync(randomAccessStream, language);
}
else
{
// This should be unreachable due to command-line validation
throw new InvalidOperationException(
"Unreachable code: either --file or --stdin must be provided."
);
}

var txt = "";

if (mode == OcrOutputMode.json)
Expand Down Expand Up @@ -74,12 +117,8 @@ static async Task Handler(string filepath, string language, OcrOutputMode mode)
Console.WriteLine(txt);
}


static async Task<OcrResult> RecognizeAsync(string filepath, string language)
static async Task<OcrResult> RecognizeAsync(IRandomAccessStream randomAccessStream, string language)
{
var path = Path.GetFullPath(filepath);
var storageFile = await StorageFile.GetFileFromPathAsync(path);
using var randomAccessStream = await storageFile.OpenReadAsync();
var decoder = await BitmapDecoder.CreateAsync(randomAccessStream);
using var softwareBitmap = await decoder.GetSoftwareBitmapAsync(
BitmapPixelFormat.Bgra8,
Expand Down Expand Up @@ -107,5 +146,5 @@ static async Task<OcrResult> RecognizeAsync(string filepath, string language)
enum OcrOutputMode
{
json,
text
text,
}
1 change: 0 additions & 1 deletion windows_media_ocr_cli.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
<Version>0.0.1</Version>
<RestorePackagesWithLockFile>true</RestorePackagesWithLockFile>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.Windows.SDK.Contracts" Version="10.0.26100.1742" />
<PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
Expand Down