danieldk HF Staff committed on
Commit
84cf777
·
verified ·
1 Parent(s): ea80dd0

Build uploaded using `kernels`.

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ build/torch210-metal-aarch64-darwin/_bitsandbytes_mps_c31f916.abi3.so filter=lfs diff=lfs merge=lfs -text
37
+ build/torch29-metal-aarch64-darwin/_bitsandbytes_mps_c31f916.abi3.so filter=lfs diff=lfs merge=lfs -text
build/torch210-metal-aarch64-darwin/__init__.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+ # Quant type constants (match bitsandbytes DataType_t)
8
+ FP4 = 1
9
+ NF4 = 2
10
+
11
+
12
def quantize_4bit(
    input: torch.Tensor,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize a tensor to packed 4-bit codes with one scale per block.

    Args:
        input: Tensor on the MPS device (float16, bfloat16, or float32).
        blocksize: Elements per quantization block (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).

    Returns:
        A ``(packed, absmax)`` pair: ``packed`` is a uint8 tensor holding two
        4-bit codes per byte (numel/2 entries), and ``absmax`` is a float32
        tensor of per-block maximum absolute values.
    """
    packed_and_scales = ops.bnb_quantize_4bit(input, blocksize, quant_type)
    return packed_and_scales
30
+
31
+
32
def dequantize_4bit(
    packed: torch.Tensor,
    absmax: torch.Tensor,
    blocksize: int = 64,
    quant_type: int = NF4,
    numel: int = -1,
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Reconstruct a tensor from packed 4-bit codes and per-block scales.

    Args:
        packed: uint8 tensor holding two 4-bit codes per byte.
        absmax: float32 tensor of per-block maximum absolute values.
        blocksize: Elements per quantization block (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).
        numel: Element count of the original tensor; any negative value
            means "infer as ``packed.numel() * 2``".
        output_dtype: Scalar type of the dequantized result.

    Returns:
        The dequantized tensor.
    """
    # Two 4-bit codes live in each packed byte, hence the factor of 2.
    total_elements = packed.numel() * 2 if numel < 0 else numel
    return ops.bnb_dequantize_4bit(
        packed, absmax, blocksize, quant_type, total_elements, output_dtype
    )
59
+
60
+
61
def gemv_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> torch.Tensor:
    """Fused matrix-vector product against 4-bit quantized weights.

    Evaluates ``y = dequant(W) @ x`` where ``W`` is blockwise NF4/FP4
    quantized; the dequantization happens inside the kernel.

    Args:
        x: Input vector [..., K] on the MPS device.
        w: Packed weight matrix [N, K/2] (uint8) on the MPS device.
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: Number of output features (N).
        blocksize: Quantization block size (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).

    Returns:
        Output tensor [..., N].
    """
    result = ops.bnb_gemv_4bit(x, w, absmax, blocksize, quant_type, output_features)
    return result
85
+
86
+
87
def gemm_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> torch.Tensor:
    """Fused matrix-matrix product against 4-bit quantized, transposed weights.

    Evaluates ``Y = X @ dequant(W).T`` where ``W`` is blockwise NF4/FP4
    quantized; the dequantization happens inside the kernel.

    Args:
        x: Input matrix [..., M, K] on the MPS device.
        w: Packed weight matrix [N, K/2] (uint8) on the MPS device.
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: Number of output features (N).
        blocksize: Quantization block size (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).

    Returns:
        Output tensor [..., M, N].
    """
    result = ops.bnb_gemm_4bit(x, w, absmax, blocksize, quant_type, output_features)
    return result
111
+
112
+
113
def linear_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Apply a 4-bit quantized linear layer, auto-selecting GEMV or GEMM.

    A 1-D input, or an input whose second-to-last dimension is 1, is routed
    through the fused matrix-vector kernel; every other shape goes through
    the GEMM kernel.

    Args:
        x: Input tensor on the MPS device.
        w: Packed weight [N, K/2] (uint8).
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: N.
        blocksize: 64 or 128.
        quant_type: FP4 (1) or NF4 (2).
        bias: Optional bias [N].

    Returns:
        Output tensor.
    """
    if x.dim() == 1:
        # Pure vector input: run GEMV, then drop the leading unit dimension.
        y = gemv_4bit(
            x.view(x.size(-1)), w, absmax, output_features, blocksize, quant_type
        )
        y = y.squeeze(0)
    elif x.dim() >= 2 and x.size(-2) == 1:
        # Single-row matrix input: flatten the row dim for GEMV, restore after.
        y = gemv_4bit(
            x.squeeze(-2), w, absmax, output_features, blocksize, quant_type
        )
        y = y.unsqueeze(-2)
    else:
        y = gemm_4bit(x, w, absmax, output_features, blocksize, quant_type)

    return y if bias is None else y + bias
158
+
159
# Public API of this build; mirrors the functions defined above.
__all__ = [
    "quantize_4bit",
    "dequantize_4bit",
    "gemv_4bit",
    "gemm_4bit",
    "linear_4bit",
]
build/torch210-metal-aarch64-darwin/_bitsandbytes_mps_c31f916.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03e84d69f0649570e560d12267b9e1ef8cd187f8bb13a737f0a28d40af567259
3
+ size 845120
build/torch210-metal-aarch64-darwin/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _bitsandbytes_mps_c31f916
3
+ ops = torch.ops._bitsandbytes_mps_c31f916
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Qualify *op_name* with this extension's torch op namespace."""
    return "_bitsandbytes_mps_c31f916::" + op_name
build/torch210-metal-aarch64-darwin/bitsandbytes_mps/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load and execute a Python source file as a uniquely-named module.

    We cannot use the file's own name as the module name: once registered in
    ``sys.modules`` it would shadow other imports of the same name. Instead
    the module name is the hex-encoded hash of the absolute path, which is
    unique per location.

    Args:
        file_path: Path of the Python source file to load.

    Returns:
        The executed module object (also registered in ``sys.modules``).

    Raises:
        ImportError: If no spec or module can be created for the file.
    """
    # Bug fix: a bare `import importlib` does not guarantee that the
    # `importlib.util` submodule attribute is set; import it explicitly so
    # the calls below cannot fail with AttributeError.
    import importlib.util

    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch210-metal-aarch64-darwin/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "python-depends": []
3
+ }
build/torch29-metal-aarch64-darwin/__init__.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple
2
+
3
+ import torch
4
+
5
+ from ._ops import ops
6
+
7
+ # Quant type constants (match bitsandbytes DataType_t)
8
+ FP4 = 1
9
+ NF4 = 2
10
+
11
+
12
def quantize_4bit(
    input: torch.Tensor,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Quantize a tensor to packed 4-bit codes with one scale per block.

    Args:
        input: Tensor on the MPS device (float16, bfloat16, or float32).
        blocksize: Elements per quantization block (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).

    Returns:
        A ``(packed, absmax)`` pair: ``packed`` is a uint8 tensor holding two
        4-bit codes per byte (numel/2 entries), and ``absmax`` is a float32
        tensor of per-block maximum absolute values.
    """
    packed_and_scales = ops.bnb_quantize_4bit(input, blocksize, quant_type)
    return packed_and_scales
30
+
31
+
32
def dequantize_4bit(
    packed: torch.Tensor,
    absmax: torch.Tensor,
    blocksize: int = 64,
    quant_type: int = NF4,
    numel: int = -1,
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Reconstruct a tensor from packed 4-bit codes and per-block scales.

    Args:
        packed: uint8 tensor holding two 4-bit codes per byte.
        absmax: float32 tensor of per-block maximum absolute values.
        blocksize: Elements per quantization block (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).
        numel: Element count of the original tensor; any negative value
            means "infer as ``packed.numel() * 2``".
        output_dtype: Scalar type of the dequantized result.

    Returns:
        The dequantized tensor.
    """
    # Two 4-bit codes live in each packed byte, hence the factor of 2.
    total_elements = packed.numel() * 2 if numel < 0 else numel
    return ops.bnb_dequantize_4bit(
        packed, absmax, blocksize, quant_type, total_elements, output_dtype
    )
59
+
60
+
61
def gemv_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> torch.Tensor:
    """Fused matrix-vector product against 4-bit quantized weights.

    Evaluates ``y = dequant(W) @ x`` where ``W`` is blockwise NF4/FP4
    quantized; the dequantization happens inside the kernel.

    Args:
        x: Input vector [..., K] on the MPS device.
        w: Packed weight matrix [N, K/2] (uint8) on the MPS device.
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: Number of output features (N).
        blocksize: Quantization block size (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).

    Returns:
        Output tensor [..., N].
    """
    result = ops.bnb_gemv_4bit(x, w, absmax, blocksize, quant_type, output_features)
    return result
85
+
86
+
87
def gemm_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
) -> torch.Tensor:
    """Fused matrix-matrix product against 4-bit quantized, transposed weights.

    Evaluates ``Y = X @ dequant(W).T`` where ``W`` is blockwise NF4/FP4
    quantized; the dequantization happens inside the kernel.

    Args:
        x: Input matrix [..., M, K] on the MPS device.
        w: Packed weight matrix [N, K/2] (uint8) on the MPS device.
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: Number of output features (N).
        blocksize: Quantization block size (64 or 128).
        quant_type: Codebook selector — FP4 (1) or NF4 (2).

    Returns:
        Output tensor [..., M, N].
    """
    result = ops.bnb_gemm_4bit(x, w, absmax, blocksize, quant_type, output_features)
    return result
111
+
112
+
113
def linear_4bit(
    x: torch.Tensor,
    w: torch.Tensor,
    absmax: torch.Tensor,
    output_features: int,
    blocksize: int = 64,
    quant_type: int = NF4,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Apply a 4-bit quantized linear layer, auto-selecting GEMV or GEMM.

    A 1-D input, or an input whose second-to-last dimension is 1, is routed
    through the fused matrix-vector kernel; every other shape goes through
    the GEMM kernel.

    Args:
        x: Input tensor on the MPS device.
        w: Packed weight [N, K/2] (uint8).
        absmax: Per-block scales [N, ceil(K/blocksize)] (float32).
        output_features: N.
        blocksize: 64 or 128.
        quant_type: FP4 (1) or NF4 (2).
        bias: Optional bias [N].

    Returns:
        Output tensor.
    """
    if x.dim() == 1:
        # Pure vector input: run GEMV, then drop the leading unit dimension.
        y = gemv_4bit(
            x.view(x.size(-1)), w, absmax, output_features, blocksize, quant_type
        )
        y = y.squeeze(0)
    elif x.dim() >= 2 and x.size(-2) == 1:
        # Single-row matrix input: flatten the row dim for GEMV, restore after.
        y = gemv_4bit(
            x.squeeze(-2), w, absmax, output_features, blocksize, quant_type
        )
        y = y.unsqueeze(-2)
    else:
        y = gemm_4bit(x, w, absmax, output_features, blocksize, quant_type)

    return y if bias is None else y + bias
158
+
159
# Public API of this build; mirrors the functions defined above.
__all__ = [
    "quantize_4bit",
    "dequantize_4bit",
    "gemv_4bit",
    "gemm_4bit",
    "linear_4bit",
]
build/torch29-metal-aarch64-darwin/_bitsandbytes_mps_c31f916.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ab4506b4b2d6581d5a13e3824b8df1a49da98ee95166e3b85f059f51256e41
3
+ size 844464
build/torch29-metal-aarch64-darwin/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _bitsandbytes_mps_c31f916
3
+ ops = torch.ops._bitsandbytes_mps_c31f916
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Qualify *op_name* with this extension's torch op namespace."""
    return "_bitsandbytes_mps_c31f916::" + op_name
build/torch29-metal-aarch64-darwin/bitsandbytes_mps/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load and execute a Python source file as a uniquely-named module.

    We cannot use the file's own name as the module name: once registered in
    ``sys.modules`` it would shadow other imports of the same name. Instead
    the module name is the hex-encoded hash of the absolute path, which is
    unique per location.

    Args:
        file_path: Path of the Python source file to load.

    Returns:
        The executed module object (also registered in ``sys.modules``).

    Raises:
        ImportError: If no spec or module can be created for the file.
    """
    # Bug fix: a bare `import importlib` does not guarantee that the
    # `importlib.util` submodule attribute is set; import it explicitly so
    # the calls below cannot fail with AttributeError.
    import importlib.util

    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch29-metal-aarch64-darwin/metadata.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "python-depends": []
3
+ }