From 8cf19d60dc93809db8e51fedc811595eed9134c5 Mon Sep 17 00:00:00 2001 From: Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Date: Fri, 20 Oct 2023 06:19:40 -0500 Subject: [PATCH] gguf : support big endian platform (#3552) * check whether platform is 390x if yes->do not import immintrin.h * support s390x big endian * support --bigendian option for s390x 1. verified with baichuan7b-chat with float 16 on s390x 2. verified with baichuan7b-chat 3. verified with chinese-alpaca-2-13b-f16 * update format based on editor-config checker result * Update convert-baichuan-hf-to-gguf.py * 1. check in ggml.c if endianess is not match 2. update GGUF version 3. change get_pack_prefix to property 4. update information log * always use "GGUF" as beginng of GGUF file * Compare "GGUF" with file header char by char 1. Set GGUF_MAGIC to "GGUF" string instead of int value 2. Compare "GGUF" char by char to ensure its byte order 3. Move bytes swap code from convert.py to gguf.py write_tensor_data --------- Co-authored-by: Georgi Gerganov --- convert-baichuan-hf-to-gguf.py | 8 +- convert.py | 20 +++-- .../convert-llama2c-to-ggml.cpp | 2 +- ggml.c | 19 +++-- ggml.h | 5 +- gguf-py/gguf/gguf.py | 73 ++++++++++++------- gguf-py/pyproject.toml | 2 +- k_quants.c | 2 +- tests/test-double-float.cpp | 2 + 9 files changed, 84 insertions(+), 49 deletions(-) diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py index 513a7516a..a1783f71f 100755 --- a/convert-baichuan-hf-to-gguf.py +++ b/convert-baichuan-hf-to-gguf.py @@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace: "ftype", type=int, choices=[0, 1], default=1, nargs='?', help="output format - use 0 for float32, 1 for float16", ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") return parser.parse_args() args = parse_args() @@ -86,6 +87,11 @@ if not dir_model.is_dir(): print(f'Error: {args.model} is not a directory', file = sys.stderr) sys.exit(1) +endianess = gguf.GGUFEndian.LITTLE +if args.bigendian: + endianess = gguf.GGUFEndian.BIG +endianess_str = "Big Endian" if args.bigendian else "Little Endian" +print(f"gguf: Conversion Endianess {endianess}") # possible tensor data types # ftype == 0 -> float32 # ftype == 1 -> float16 @@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM": num_parts = count_model_parts(dir_model) print(f"num_parts:{num_parts}\n") ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) print("gguf: get model metadata") diff --git a/convert.py b/convert.py index e9b08d344..24da25efc 100755 --- a/convert.py +++ b/convert.py @@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: class OutputFile: - def __init__(self, fname_out: Path) -> None: - self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -875,10 +875,10 @@ class OutputFile: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -903,10 +903,10 @@ class OutputFile: return dt.quantize(arr) @staticmethod - def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: + def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) - of = OutputFile(fname_out) + of = OutputFile(fname_out, endianess=endianess) # meta data of.add_meta_arch(params) @@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) - args = parser.parse_args(args_in) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + args = parser.parse_args(args_in) if args.dump_single: model_plus = lazy_load_file(args.model) do_dump_model(model_plus) @@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None: if args.dump: do_dump_model(model_plus) return + endianess = gguf.GGUFEndian.LITTLE + if args.bigendian: + endianess = gguf.GGUFEndian.BIG params = Params.load(model_plus) if params.n_ctx == -1: @@ -1185,7 +1189,7 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess) print(f"Wrote {outfile}") diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index c291f0adf..cae3bf3c3 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) { if (file.size < 4) { return false; } - uint32_t magic = file.read_u32(); + std::string magic = file.read_string(4); return magic == GGUF_MAGIC; } diff --git a/ggml.c b/ggml.c index ed157aab0..49f3b7aba 100644 --- a/ggml.c +++ b/ggml.c @@ -20845,7 +20845,7 @@ struct gguf_kv { }; struct gguf_header { - uint32_t magic; + char magic[4]; uint32_t version; uint64_t n_tensors; // GGUFv2 uint64_t n_kv; // GGUFv2 @@ -20915,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) struct gguf_context * gguf_init_empty(void) { struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); - ctx->header.magic = GGUF_MAGIC; + memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; ctx->header.n_tensors = 0; ctx->header.n_kv = 0; @@ -20941,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // offset from start of file size_t offset = 0; - uint32_t magic = 0; + char magic[4]; // check the magic before making allocations { gguf_fread_el(file, &magic, sizeof(magic), &offset); - if (magic != GGUF_MAGIC) { - fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); - fclose(file); - return NULL; + for (uint32_t i = 0; i < sizeof(magic); i++) { + if (magic[i] != GGUF_MAGIC[i]) { + fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic); + fclose(file); + return NULL; + } } } @@ -20960,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the header { - ctx->header.magic = magic; + strncpy(ctx->header.magic, magic, 4); + ctx->kv = NULL; ctx->infos = NULL; diff --git a/ggml.h b/ggml.h index 6e35888e9..16aaf169e 100644 --- a/ggml.h +++ b/ggml.h @@ -231,8 +231,9 @@ #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 -#define GGUF_MAGIC 0x46554747 // "GGUF" -#define GGUF_VERSION 2 +#define GGUF_MAGIC "GGUF" + +#define GGUF_VERSION 3 #define GGUF_DEFAULT_ALIGNMENT 32 diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 557ce7ac0..072c839c4 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -19,9 +19,10 @@ import numpy as np # GGUF_MAGIC = 0x46554747 -GGUF_VERSION = 2 +GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 + # general KEY_GENERAL_ARCHITECTURE = "general.architecture" KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" @@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum): Q6_K = 14 Q8_K = 15 +class GGUFEndian(IntEnum): + LITTLE = 0 + BIG = 1 + class GGUFValueType(IntEnum): UINT8 = 0 @@ -644,18 +649,41 @@ class GGUFWriter: temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None tensors: list[tuple[np.ndarray[Any, Any], int]] - def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True): + @property + def pack_prefix(self): + if self.endianess==GGUFEndian.LITTLE: + return "<" + else: + return ">" + + def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE): self.fout = open(path, "wb") self.arch = arch + self.endianess = endianess + self._simple_value_packing = { + GGUFValueType.UINT8: f"{self.pack_prefix}B", + GGUFValueType.INT8: f"{self.pack_prefix}b", + GGUFValueType.UINT16: f"{self.pack_prefix}H", + GGUFValueType.INT16: f"{self.pack_prefix}h", + GGUFValueType.UINT32: f"{self.pack_prefix}I", + GGUFValueType.INT32: f"{self.pack_prefix}i", + GGUFValueType.FLOAT32: f"{self.pack_prefix}f", + GGUFValueType.UINT64: f"{self.pack_prefix}Q", + GGUFValueType.INT64: f"{self.pack_prefix}q", + GGUFValueType.FLOAT64: f"{self.pack_prefix}d", + GGUFValueType.BOOL: "?" , + } self.add_architecture() self.use_temp_file = use_temp_file self.tensors = [] + endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian" + print(f"This gguf file is for {endianess_str} only") def write_header_to_file(self): self.fout.write(struct.pack(" 0: ltype = GGUFValueType.get_type(val[0]) if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): raise ValueError("All items in a GGUF array should be of the same type") - self.kv_data += struct.pack(""] packages = [ diff --git a/k_quants.c b/k_quants.c index e168a87bb..801941fbe 100644 --- a/k_quants.c +++ b/k_quants.c @@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) { #if defined(_MSC_VER) || defined(__MINGW32__) #include #else -#if !defined(__riscv) +#if !defined(__riscv) && !defined(__s390__) #include #endif #endif diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp index b506f273f..afd7bf77f 100644 --- a/tests/test-double-float.cpp +++ b/tests/test-double-float.cpp @@ -4,7 +4,9 @@ #undef NDEBUG #include +#if !defined(__riscv) && !defined(__s390__) #include +#endif #include #include #include