Text Generation
Transformers
Safetensors
French
English
binaryllm
binary-level
bit-level
causal-lm
tokenizer-free
base2
binary
TinyTransformerLM
custom_code
Instructions to use PhysiQuanty/Binary-LLM-POC with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use PhysiQuanty/Binary-LLM-POC with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="PhysiQuanty/Binary-LLM-POC", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("PhysiQuanty/Binary-LLM-POC", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use PhysiQuanty/Binary-LLM-POC with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "PhysiQuanty/Binary-LLM-POC"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "PhysiQuanty/Binary-LLM-POC",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
Use Docker
docker model run hf.co/PhysiQuanty/Binary-LLM-POC
- SGLang
How to use PhysiQuanty/Binary-LLM-POC with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "PhysiQuanty/Binary-LLM-POC" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "PhysiQuanty/Binary-LLM-POC",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "PhysiQuanty/Binary-LLM-POC" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "PhysiQuanty/Binary-LLM-POC", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use PhysiQuanty/Binary-LLM-POC with Docker Model Runner:
docker model run hf.co/PhysiQuanty/Binary-LLM-POC
| import math | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from transformers import PreTrainedModel | |
| from transformers.modeling_outputs import CausalLMOutput | |
| from .configuration_binaryllm import BinaryLLMConfig | |
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal positional encoding to a batch-first input.

    The table is computed once in float32 and kept as a non-persistent
    buffer (it is not written to the state dict). On every forward pass the
    needed slice is cast to the device and dtype of the input.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        max_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model: int, max_len: int) -> None:
        super().__init__()
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-torch.log(torch.tensor(10000.0)) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so drop the surplus frequency for the cosine
        # half. For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, seq, d_model); ``x.size(1)`` is expected
        not to exceed ``max_len``.
        """
        t = x.size(1)
        pe = self.pe[:, :t, :]
        pe = pe.to(device=x.device, dtype=x.dtype)
        return x + pe
@dataclass
class _InnerCfg:
    """Hyper-parameters consumed by ``TinyTransformerLM``.

    Declared as a dataclass so it can be built with keyword arguments, which
    is how ``BinaryLLMForCausalLM.__init__`` constructs it.
    """

    # Maximum sequence length (sizes the positional table and causal mask).
    block_size: int
    # Width of the token embeddings and of the transformer layers.
    embed_dim: int
    # Number of distinct token ids; also the width of the output logits.
    vocab_size: int
    # Attention heads per transformer layer.
    num_heads: int
    # Number of stacked transformer layers.
    num_layers: int
    # Hidden width of each layer's feed-forward sub-block.
    ff_hidden_dim: int
    # Dropout probability passed to the transformer layers.
    dropout: float
    # Optional width of the final LayerNorm; falls back to embed_dim.
    layernorm_dim: Optional[int] = None
    # Optional input width of the output head; falls back to the LayerNorm width.
    head_dim: Optional[int] = None
class TinyTransformerLM(nn.Module):
    """Small causal language model built on ``nn.TransformerEncoder``.

    Pipeline: token embedding -> sinusoidal positions -> encoder stack under
    a causal mask -> optional projection -> LayerNorm -> optional projection
    -> bias-free linear head producing ``vocab_size`` logits per position.
    """

    def __init__(self, cfg: _InnerCfg) -> None:
        super().__init__()
        self.cfg = cfg
        width = cfg.embed_dim
        n_tokens = cfg.vocab_size

        self.tok_embed = nn.Embedding(n_tokens, width)
        self.pos_encoding = PositionalEncoding(width, cfg.block_size)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=width,
                nhead=cfg.num_heads,
                dim_feedforward=cfg.ff_hidden_dim,
                dropout=cfg.dropout,
                activation="gelu",
                batch_first=True,
            ),
            num_layers=cfg.num_layers,
        )

        # Unset (falsy) optional widths fall back to the previous stage's width.
        norm_width = cfg.layernorm_dim or width
        head_width = cfg.head_dim or norm_width

        # Projections exist only where consecutive stages differ in width.
        needs_norm_proj = norm_width != width
        self.pre_ln_proj: Optional[nn.Linear] = (
            nn.Linear(width, norm_width) if needs_norm_proj else None
        )
        self.ln = nn.LayerNorm(norm_width)
        needs_head_proj = head_width != norm_width
        self.head_pre: Optional[nn.Linear] = (
            nn.Linear(norm_width, head_width) if needs_head_proj else None
        )
        self.head = nn.Linear(head_width, n_tokens, bias=False)

        # Tie the output head to the embedding table only when the widths
        # line up exactly (no intermediate projection at all).
        no_projection = self.pre_ln_proj is None and self.head_pre is None
        if no_projection and head_width == width:
            self.head.weight = self.tok_embed.weight

        # True strictly above the diagonal: future positions are masked out.
        future = torch.ones(cfg.block_size, cfg.block_size, dtype=torch.bool)
        self.register_buffer("causal_mask", torch.triu(future, diagonal=1), persistent=False)

    def forward(self, tokens: torch.Tensor, padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Return next-token logits for a (batch, seq) tensor of token ids.

        Args:
            tokens: Token ids, indexed as (batch, seq).
            padding_mask: Optional mask, cropped to the sequence length and
                cast to bool, forwarded as ``src_key_padding_mask``.

        Returns:
            The output of the final linear head for every position.
        """
        n = tokens.size(1)
        device = tokens.device

        hidden = self.pos_encoding(self.tok_embed(tokens))
        causal = self.causal_mask[:n, :n].to(device=device)
        if padding_mask is not None:
            padding_mask = padding_mask[:, :n].to(device=device, dtype=torch.bool)
        hidden = self.encoder(hidden, mask=causal, src_key_padding_mask=padding_mask)

        if self.pre_ln_proj is not None:
            hidden = self.pre_ln_proj(hidden)
        hidden = self.ln(hidden)
        if self.head_pre is not None:
            hidden = self.head_pre(hidden)
        return self.head(hidden)
class BinaryLLMForCausalLM(PreTrainedModel):
    """Hugging Face wrapper exposing ``TinyTransformerLM`` as a causal LM."""

    config_class = BinaryLLMConfig
    main_input_name = "input_ids"

    def __init__(self, config: BinaryLLMConfig):
        """Build the inner model from the fields of ``config``."""
        super().__init__(config)
        self.model = TinyTransformerLM(
            _InnerCfg(
                block_size=int(config.max_position_embeddings),
                embed_dim=int(config.hidden_size),
                vocab_size=int(config.vocab_size),
                num_heads=int(config.num_attention_heads),
                num_layers=int(config.num_hidden_layers),
                ff_hidden_dim=int(config.intermediate_size),
                # ``dropout`` is optional on the config; default to none.
                dropout=float(getattr(config, "dropout", 0.0)),
                layernorm_dim=None,
                head_dim=None,
            )
        )
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> CausalLMOutput:
        """Compute logits and, when ``labels`` is given, the shifted LM loss.

        Args:
            input_ids: Token ids passed to the inner model.
            attention_mask: Optional mask whose truthy entries mark positions
                to keep; it is inverted before being passed on.
            labels: Optional targets; entries equal to -100 are ignored.
            **kwargs: Accepted and ignored.

        Returns:
            ``CausalLMOutput`` with ``logits`` and ``loss`` (``None`` when no
            labels were supplied).
        """
        # The inner model expects True = ignore, the inverse of attention_mask.
        padding_mask = (
            None if attention_mask is None else ~attention_mask.to(torch.bool)
        )
        logits = self.model(input_ids, padding_mask=padding_mask)

        if labels is None:
            return CausalLMOutput(loss=None, logits=logits)

        # Position i predicts token i + 1: drop the last logit and first label.
        predictions = logits[:, :-1, :].contiguous()
        targets = labels[:, 1:].contiguous()
        loss = F.cross_entropy(
            predictions.view(-1, self.config.vocab_size),
            targets.view(-1),
            ignore_index=-100,
        )
        return CausalLMOutput(loss=loss, logits=logits)