diff --git a/README.md b/README.md index cefcfb7ca..07066cd81 100644 --- a/README.md +++ b/README.md @@ -155,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece # convert the 7B model to ggml FP16 format python3 convert-pth-to-ggml.py models/7B/ 1 -# quantize the model to 4-bits -python3 quantize.py 7B +# quantize the model to 4-bits (using method 2 = q4_0) +./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2 # run the inference ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 diff --git a/quantize.py b/quantize.py deleted file mode 100644 index 641df8dda..000000000 --- a/quantize.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 - -"""Script to execute the "quantize" script on a given set of models.""" - -import subprocess -import argparse -import glob -import sys -import os - - -def main(): - """Update the quantize binary name depending on the platform and parse - the command line arguments and execute the script. - """ - - if "linux" in sys.platform or "darwin" in sys.platform: - quantize_script_binary = "quantize" - - elif "win32" in sys.platform or "cygwin" in sys.platform: - quantize_script_binary = "quantize.exe" - - else: - print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n") - quantize_script_binary = "quantize" - - parser = argparse.ArgumentParser( - prog='python3 quantize.py', - description='This script quantizes the given models by applying the ' - f'"{quantize_script_binary}" script on them.' - ) - parser.add_argument( - 'models', nargs='+', choices=('7B', '13B', '30B', '65B'), - help='The models to quantize.' - ) - parser.add_argument( - '-r', '--remove-16', action='store_true', dest='remove_f16', - help='Remove the f16 model after quantizing it.' - ) - parser.add_argument( - '-m', '--models-path', dest='models_path', - default=os.path.join(os.getcwd(), "models"), - help='Specify the directory where the models are located.' - ) - parser.add_argument( - '-q', '--quantize-script-path', dest='quantize_script_path', - default=os.path.join(os.getcwd(), quantize_script_binary), - help='Specify the path to the "quantize" script.' - ) - - # TODO: Revise this code - # parser.add_argument( - # '-t', '--threads', dest='threads', type='int', - # default=os.cpu_count(), - # help='Specify the number of threads to use to quantize many models at ' - # 'once. Defaults to os.cpu_count().' - # ) - - args = parser.parse_args() - args.models_path = os.path.abspath(args.models_path) - - if not os.path.isfile(args.quantize_script_path): - print( - f'The "{quantize_script_binary}" script was not found in the ' - "current location.\nIf you want to use it from another location, " - "set the --quantize-script-path argument from the command line." - ) - sys.exit(1) - - for model in args.models: - # The model is separated in various parts - # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...) - f16_model_path_base = os.path.join( - args.models_path, model, "ggml-model-f16.bin" - ) - - if not os.path.isfile(f16_model_path_base): - print(f'The file %s was not found' % f16_model_path_base) - sys.exit(1) - - f16_model_parts_paths = map( - lambda filename: os.path.join(f16_model_path_base, filename), - glob.glob(f"{f16_model_path_base}*") - ) - - for f16_model_part_path in f16_model_parts_paths: - if not os.path.isfile(f16_model_part_path): - print( - f"The f16 model {os.path.basename(f16_model_part_path)} " - f"was not found in {args.models_path}{os.path.sep}{model}" - ". If you want to use it from another location, set the " - "--models-path argument from the command line." - ) - sys.exit(1) - - __run_quantize_script( - args.quantize_script_path, f16_model_part_path - ) - - if args.remove_f16: - os.remove(f16_model_part_path) - - -# This was extracted to a top-level function for parallelization, if -# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406 - -def __run_quantize_script(script_path, f16_model_part_path): - """Run the quantize script specifying the path to it and the path to the - f16 model to quantize. - """ - - new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0") - subprocess.run( - [script_path, f16_model_part_path, new_quantized_model_path, "2"], - check=True - ) - - -if __name__ == "__main__": - try: - main() - - except subprocess.CalledProcessError: - print("\nAn error ocurred while trying to quantize the models.") - sys.exit(1) - - except KeyboardInterrupt: - sys.exit(0) - - else: - print("\nSuccesfully quantized all models.")