#!/usr/bin/env python3
# train-text-from-scratch checkpoint --> gguf conversion

import argparse
import os
import struct
import sys
import numpy as np
from pathlib import Path

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
import gguf

# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"

LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"

LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"

LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE            = "training.type"
LLM_KV_TRAINING_FILE_VERSION    = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT    = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT     = "training.token_count"

class Tensor:
    def __init__(self, dtype='f', ne=None):
        if ne is None:
            ne = []
        self.dtype = dtype
        self.ne = ne
        self.nbytes = 0
        if self.dtype == 'f':
            if len(self.ne) == 0:
                self.nbytes = 0
            else:
                self.nbytes = int(np.prod(self.ne)) * 4
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")

    def load(self, data, offset):
        # header: number of dims, name length, ggml dtype
        nd      = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        dtype   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4

        assert nd == len(self.ne)
        ne = []
        for d in range(nd):
            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
            ne.append(n)
        assert tuple(ne) == tuple(self.ne)

        if self.dtype == 'f':
            assert dtype == 0  # GGML_TYPE_F32
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")

        self.name = bytes(data[offset:offset + namelen]); offset += namelen
        # tensor data is padded to 32-byte alignment
        offset += (0 - offset) & 31
        self.data = data[offset:offset + self.nbytes]; offset += self.nbytes
        return offset

    def max_storage_size(self):
        result = 0
        result += 4               # nd
        result += 4               # namelen
        result += 4               # dtype
        result += len(self.ne)*4  # ne
        result += 48              # name (upper bound)
        result += 31              # 32-byte alignment, worst case
        result += self.nbytes
        return result

    def save_gguf(self, gguf_writer, name):
        gguf_writer.add_tensor(
            name=name,
            tensor=self.data,
            raw_shape=np.flip(self.ne),
            raw_dtype=gguf.GGMLQuantizationType.F32)
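# Example (illustrative, not part of the conversion flow): reading a single
# serialized f32 tensor of shape [8] from a raw checkpoint buffer. The file
# name is a placeholder; real checkpoints are parsed by the classes below.
#
#   data = np.memmap('checkpoint.bin', mode='r')
#   t = Tensor('f', [8])
#   offset = t.load(data, 0)  # returns the offset just past the tensor data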
class OptimizationParamsV0:
    # version-0 checkpoints contain a raw dump of the C struct ggml_opt_params;
    # the field order and padding below mirror that struct's layout at the time
    # (reconstructed, since this portion of the format is not self-describing)
    def __init__(self):
        pass

    def load(self, data, offset):
        self.type                 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_threads            = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.past                 = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.delta                = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.max_no_improvement   = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.print_forward_graph  = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 1
        self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 1
        offset += 2  # struct padding to 4-byte alignment
        self.adam_n_iter          = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_sched           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_decay           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_alpha           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_beta1           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_beta2           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps_f           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps_g           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_m              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_n_iter         = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_eps            = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_ftol           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_wolfe          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_min_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_max_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_linesearch     = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset

class OptimizationContext:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
        offset += 4

        if self.version == 0:
            params = OptimizationParamsV0()
            offset = params.load(data, offset)
            self.past    = params.past
            self.lbfgs_m = params.lbfgs_m
            self.nx   = struct.unpack('<Q', bytes(data[offset:offset + 8]))[0]; offset += 8
            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
            self.type = params.type

            self.adam_m  = Tensor('f', [self.nx])
            self.adam_v  = Tensor('f', [self.nx])
            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

            self.lbfgs_x    = Tensor('f', [self.nx])
            self.lbfgs_xp   = Tensor('f', [self.nx])
            self.lbfgs_g    = Tensor('f', [self.nx])
            self.lbfgs_gp   = Tensor('f', [self.nx])
            self.lbfgs_d    = Tensor('f', [self.nx])
            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])

            if self.type == 0:
                # these tensors are stored, but we don't need their data
                x  = Tensor('f', [self.nx])
                g  = Tensor('f', [self.nx])
                g2 = Tensor('f', [self.nx])
                mh = Tensor('f', [self.nx])
                vh = Tensor('f', [self.nx])

                offset = x.load(data, offset)
                offset = g.load(data, offset)
                offset = g2.load(data, offset)
                offset = self.adam_m.load(data, offset)
                offset = self.adam_v.load(data, offset)
                offset = mh.load(data, offset)
                offset = vh.load(data, offset)
                offset = self.adam_pf.load(data, offset)

                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            elif self.type == 1:
                offset = self.lbfgs_x.load(data, offset)
                offset = self.lbfgs_xp.load(data, offset)
                offset = self.lbfgs_g.load(data, offset)
                offset = self.lbfgs_gp.load(data, offset)
                offset = self.lbfgs_d.load(data, offset)
                offset = self.lbfgs_pf.load(data, offset)
                offset = self.lbfgs_lmal.load(data, offset)
                offset = self.lbfgs_lmys.load(data, offset)
                offset = self.lbfgs_lms.load(data, offset)
                offset = self.lbfgs_lmy.load(data, offset)

                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            else:
                raise ValueError('Invalid optimizer type')
        elif self.version == 1:
            self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.nx      = struct.unpack('<Q', bytes(data[offset:offset + 8]))[0]; offset += 8
            self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4

            self.adam_m  = Tensor('f', [self.nx])
            self.adam_v  = Tensor('f', [self.nx])
            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

            self.lbfgs_x    = Tensor('f', [self.nx])
            self.lbfgs_xp   = Tensor('f', [self.nx])
            self.lbfgs_g    = Tensor('f', [self.nx])
            self.lbfgs_gp   = Tensor('f', [self.nx])
            self.lbfgs_d    = Tensor('f', [self.nx])
            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])

            # forgot to save type in version 1:
            # guess self.type from number of remaining bytes
            size_type_0 = 12 + sum([t.max_storage_size() for t in
                                    [self.adam_m, self.adam_v]
                                    + ([self.adam_pf] if (self.past > 0) else [])])
            size_type_1 = 24 + sum([t.max_storage_size() for t in
                                    [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
                                     self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
                                     self.lbfgs_lmal, self.lbfgs_lmys,
                                     self.lbfgs_lms, self.lbfgs_lmy]
                                    + ([self.lbfgs_pf] if (self.past > 0) else [])])
            # due to alignment padding the size might not be exact,
            # but the difference in size for both types is significant,
            # so we can just use whichever is closest
            remaining = len(data) - offset
            if abs(remaining - size_type_0) < abs(remaining - size_type_1):
                self.type = 0
            else:
                self.type = 1

            if self.type == 0:
                offset = self.adam_m.load(data, offset)
                offset = self.adam_v.load(data, offset)
                offset = self.adam_pf.load(data, offset)

                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            elif self.type == 1:
                offset = self.lbfgs_x.load(data, offset)
                offset = self.lbfgs_xp.load(data, offset)
                offset = self.lbfgs_g.load(data, offset)
                offset = self.lbfgs_gp.load(data, offset)
                offset = self.lbfgs_d.load(data, offset)
                offset = self.lbfgs_pf.load(data, offset)
                offset = self.lbfgs_lmal.load(data, offset)
                offset = self.lbfgs_lmys.load(data, offset)
                offset = self.lbfgs_lms.load(data, offset)
                offset = self.lbfgs_lmy.load(data, offset)

                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        else:
            raise ValueError('Invalid version of checkpoint file')

        return offset
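    # Why the closest-match guess above is safe (illustrative numbers, not from
    # the source): the L-BFGS state includes two [nx, lbfgs_m] f32 matrices
    # (lms, lmy), so for e.g. nx = 1_000_000 and lbfgs_m = 16, size_type_1
    # exceeds size_type_0 by roughly 8*nx*m = 128 MB, while alignment padding
    # can only account for a few hundred bytes of error.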
    def save_gguf(self, gguf_writer):
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)

        if self.type == 0:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)

            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
            if self.past > 0:
                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
        elif self.type == 1:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)

            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
            if self.past > 0:
                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
        else:
            raise ValueError('Unknown optimizer type')

class ModelParams:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4