make-ggml.py : compatibility with more models and GGUF (#3290)

* Resync my fork with new llama.cpp commits

* examples : rename to use dash instead of underscore

* New model conversions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Richard Roberson 2023-09-27 10:25:12 -06:00 committed by GitHub
parent 20c7e1e804
commit ac43576124
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -1,22 +1,25 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
This script converts Hugging Face llama models to GGML and quantizes them. This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
Usage: Usage:
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)] python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
Arguments: Arguments:
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub. - model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used. - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used. - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'. - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created. - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
Quant types: Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended - Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M - Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss - Q3_K_S: very small, very high quality loss
@@ -40,9 +43,7 @@ import argparse
import os import os
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
def main(model, outname, outdir, quants, keep_fp16): def main(model, model_type, outname, outdir, quants, keep_fp16):
ggml_version = "v3"
if not os.path.isdir(model): if not os.path.isdir(model):
print(f"Model not found at {model}. Downloading...") print(f"Model not found at {model}. Downloading...")
try: try:
@@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16):
print("Building llama.cpp") print("Building llama.cpp")
subprocess.run(f"cd .. && make quantize", shell=True, check=True) subprocess.run(f"cd .. && make quantize", shell=True, check=True)
fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin" fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
print(f"Making unquantised GGML at {fp16}") print(f"Making unquantised GGUF at {fp16}")
if not os.path.isfile(fp16): if not os.path.isfile(fp16):
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True) if model_type != "llama":
subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
else:
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
else: else:
print(f"Unquantised GGML already exists at: {fp16}") print(f"Unquantised GGML already exists at: {fp16}")
print("Making quants") print("Making quants")
for type in quants: for type in quants:
outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin" outfile = f"{outdir}/{outname}.gguf.{type}.bin"
print(f"Making {type} : {outfile}") print(f"Making {type} : {outfile}")
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True) subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
@@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16):
os.remove(fp16) os.remove(fp16)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.') parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name') parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
parser.add_argument('--outname', default=None, help='Output model(s) name') parser.add_argument('--outname', default=None, help='Output model(s) name')
parser.add_argument('--outdir', default=None, help='Output directory') parser.add_argument('--outdir', default=None, help='Output directory')
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types') parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
@@ -90,4 +95,4 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16) main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)