Instructions to use mat50013/Qianfan-OCR-MLX-BF16 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use mat50013/Qianfan-OCR-MLX-BF16 with MLX:
# Download the model from the Hub
pip install "huggingface_hub[hf_xet]"
huggingface-cli download --local-dir Qianfan-OCR-MLX-BF16 mat50013/Qianfan-OCR-MLX-BF16
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
Qianfan-OCR — MLX BF16
MLX conversion of baidu/Qianfan-OCR for Apple Silicon.
| Property | Value |
|---|---|
| Base model | baidu/Qianfan-OCR |
| Format | MLX BF16 |
Usage
from PIL import Image
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
# Three-component mean / standard-deviation constants (presumably the usual
# per-channel ImageNet normalisation statistics -- confirm before relying on
# channel order). Not referenced by the code shown in this snippet.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the candidate grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Aspect ratio of the source image (the caller passes
            width / height).
        target_ratios: Iterable of two-element candidates; each candidate's
            ratio is ``candidate[0] / candidate[1]``. Candidates are visited
            in iteration order, which matters for tie-breaking.
        width: Source image width.
        height: Source image height.
        image_size: Side length of one square tile.

    Returns:
        The selected candidate, or ``(1, 1)`` when ``target_ratios`` is empty.
    """
    best_ratio_diff = float("inf")
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif (
            ratio_diff == best_ratio_diff
            and area > 0.5 * image_size * image_size * ratio[0] * ratio[1]
        ):
            # Tie on aspect ratio: take this candidate only if the source
            # image covers more than half of the area its tiles would span.
            best_ratio = ratio
    return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto a tile grid and cut it into square tiles.

    The grid is chosen by ``find_closest_aspect_ratio`` from every
    ``(cols, rows)`` pair whose tile count lies in ``[min_num, max_num]``.

    Args:
        image: Image object exposing ``size`` (width, height), ``resize`` and
            ``crop`` (e.g. a PIL image).
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length, in pixels, of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to ``image_size`` x ``image_size``.

    Returns:
        List of tiles in row-major order, optionally followed by the
        thumbnail.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Enumerate every grid whose tile count is within the allowed range.
    target_ratios = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    # Order candidates by tile count, smallest grids first.
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )
    cols, rows = target_aspect_ratio[0], target_aspect_ratio[1]
    target_width = image_size * cols
    target_height = image_size * rows
    blocks = cols * rows

    resized_img = image.resize((target_width, target_height))

    processed_images = []
    for i in range(blocks):
        # Row-major walk over the grid: i -> (row, col).
        row, col = divmod(i, cols)
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        processed_images.append(resized_img.crop(box))

    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images
def load_image_tiles(image_file, input_size=448, max_num=12):
    """Open an image, convert it to RGB and split it into tiles.

    Args:
        image_file: Anything accepted by ``Image.open``.
        input_size: Side length, in pixels, of each square tile.
        max_num: Maximum number of grid tiles (a thumbnail may be appended
            on top of these).

    Returns:
        The list produced by ``dynamic_preprocess`` with
        ``use_thumbnail=True``.
    """
    # convert() yields an independent image, so the opened source can be
    # released as soon as the conversion is done.
    with Image.open(image_file) as source:
        image = source.convert("RGB")
    return dynamic_preprocess(
        image,
        image_size=input_size,
        use_thumbnail=True,
        max_num=max_num,
    )
# Model identifier handed to load() and the path of the document image.
MODEL_PATH = "mat50013/Qianfan-OCR-MLX-BF16"
IMAGE_PATH = "./documents/pic.jpeg"

model, processor = load(MODEL_PATH)

# Tile the document; the number of tiles is passed to the chat template.
images = load_image_tiles(IMAGE_PATH, input_size=448, max_num=12)

prompt = "Parse this document to Markdown."
formatted_prompt = apply_chat_template(
    processor, model.config, prompt, num_images=len(images)
)

result = generate(
    model, processor, formatted_prompt, image=images, max_tokens=16384, verbose=False
)

# Print the ``text`` attribute when the result has one, else the result itself.
print(getattr(result, "text", result))
- Downloads last month
- 117
Model size
5B params
Tensor type
BF16
·
Hardware compatibility
Log In to add your hardware
Quantized
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support
Model tree for mat50013/Qianfan-OCR-MLX-BF16
Base model
baidu/Qianfan-OCR