py : cleanup the code

- use f-strings where possible
- drop first param of encode/decode functions since "utf-8" is the default
Pavol Rusnak 2023-03-29 21:31:24 +02:00
parent 9733104be5
commit cbef542879
6 changed files with 27 additions and 29 deletions
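For reference, a minimal sketch of the two cleanups applied throughout this commit (not part of the diff; variable names below are illustrative only): str.encode() and bytes.decode() already default to "utf-8", so the explicit argument is redundant, and f-strings replace "+" concatenation and "%" formatting.

# Illustrative only - not taken from the diff below.
name = "tok_embeddings.weight"

# encode()/decode() default to "utf-8", so the explicit argument is redundant:
assert name.encode() == name.encode("utf-8")
assert b"\xff".decode(errors="replace") == b"\xff".decode("utf-8", "replace")

# f-strings replace "+" concatenation and "%" formatting:
part_id, magic = 1, 0x67676d66
assert f"{name}.{part_id}" == "%s.%d" % (name, part_id)
assert f"magic: {magic:#x}" == "magic: %#x" % magic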


@@ -27,9 +27,9 @@ def read_tokens(fin, vocab_size):
         text_len = struct.unpack("i", fin.read(4))[0]
         text_bytes = fin.read(text_len)
         try:
-            text = text_bytes.decode("utf-8")
+            text = text_bytes.decode()
         except UnicodeDecodeError:
-            text = text_bytes.decode("utf-8", "replace")
+            text = text_bytes.decode(errors="replace")
         score = struct.unpack("f", fin.read(4))[0]
         tokens.append((text, score))
     return tokens
@@ -82,7 +82,7 @@ def read_variables(fin):
         shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
         shape = shape[::-1]
-        name = fin.read(name_length).decode("utf-8")
+        name = fin.read(name_length).decode()
 
         # ensure tensor data is aligned
         tensor_data_offset = fin.tell()
@@ -207,11 +207,11 @@ AI: Hello! How can I assist you today?
     print(ctx.rstrip("\n"))
     while True:
         print("-" * 60)
-        prompt = input(f"User: ")
+        prompt = input("User: ")
         if ctx != "":
-            ctx = ctx + "User: " + prompt + "\n"
+            ctx = f"{ctx}User: {prompt}\n"
         else:
-            ctx = prompt + "\nAI:"
+            ctx = f"{prompt}\nAI:"
 
         ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
@@ -236,7 +236,7 @@ AI: Hello! How can I assist you today?
         )
         s = generation_output.sequences[0]
         decoded = tokenizer.decode(s)
-        ctx = decoded + "\n"
+        ctx = f"{decoded}\n"
 
 def main():


@@ -49,7 +49,7 @@ def write_header(f_out, header):
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
         elif tokenizer.is_control(i):
             text = b""
         elif tokenizer.is_byte(i):
@@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
             byte_value = int(piece[3:-1], 16)
             text = struct.pack("B", byte_value)
         else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
     # TODO: GPT4All - add extra <pad> token
-    text = "<pad>".encode("utf-8")
+    text = "<pad>".encode()
     fout.write(struct.pack("i", len(text)))
     fout.write(text)
     fout.write(struct.pack("f", 0.0))


@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
     if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode("utf-8")
+        text = " \u2047 ".encode()
     elif tokenizer.is_control(i):
         text = b""
     elif tokenizer.is_byte(i):
@@ -61,13 +61,13 @@ for i in range(tokenizer.vocab_size()):
         byte_value = int(piece[3:-1], 16)
         text = struct.pack("B", byte_value)
     else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
     fout.write(struct.pack("i", len(text)))
     fout.write(text)
     fout.write(struct.pack("f", tokenizer.get_score(i)))
 
 def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode()
     fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
     fout.write(struct.pack("i" * len(shape), *shape[::-1]))
     fout.write(sname)
@@ -80,7 +80,7 @@ def write_header(shape, dst_name, ftype_cur):
 def convert_non_q4(src_name, dst_name):
     v = model[src_name]
     shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
     if len(shape) == 1:
         print(" Converting to float32")
         v = v.to(torch.float32)
@@ -105,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
     # Each int32 item is actually 8 int4 items packed together, and it's transposed.
     shape = (qweight.shape[0], qweight.shape[1] * 8)
-    print("Processing Q4 variable: " + src_name + " with shape: ", shape)
+    print(f"Processing Q4 variable: {src_name} with shape: {shape}")
 
     # The output format has the int4 weights in groups of 32 rather than 8.
     # It looks like this:
@@ -168,5 +168,5 @@ for i in range(n_layer):
 fout.close()
 
-print("Done. Output file: " + fname_out)
-print("")
+print(f"Done. Output file: {fname_out}")
+print()


@@ -120,7 +120,7 @@ def write_header(fout, hparams, ftype):
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
         elif tokenizer.is_control(i):
             text = b""
         elif tokenizer.is_byte(i):
@@ -131,7 +131,7 @@ def write_tokens(fout, tokenizer):
             byte_value = int(piece[3:-1], 16)
             text = struct.pack("B", byte_value)
         else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
@@ -191,7 +191,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
         fullshape = list(partshape)
         if n_dims > 1:
             fullshape[split_dim] *= n_parts
-        sname = name.encode('utf-8')
+        sname = name.encode()
         fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
         for dim in reversed(fullshape):
             fout.write(struct.pack("i", dim))


@@ -44,7 +44,7 @@ def write_header(f_out, header):
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
         elif tokenizer.is_control(i):
             text = b""
         elif tokenizer.is_byte(i):
@@ -55,7 +55,7 @@ def write_tokens(fout, tokenizer):
             byte_value = int(piece[3:-1], 16)
             text = struct.pack("B", byte_value)
         else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))


@@ -272,13 +272,11 @@ def main():
     tokens = read_tokens(fin, hparams)
 
     if hparams['magic'] == 0x67676a74: # ggjt
-        print("%s: input ggml has already been converted to 'ggjt' magic\n" %
-              (args.fin_path))
+        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
         sys.exit(1)
 
     if hparams['magic'] != 0x67676d66: # ggmf
-        print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
-              (args.fin_path, hparams['magic']))
+        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
         sys.exit(1)
 
     hparams['magic'] = 0x67676a74 # ggjt
@@ -286,7 +284,7 @@ def main():
     # count number of multipart files by convention
     n_parts = 1
     while True:
-        if os.path.exists("%s.%d" % (args.fin_path, n_parts)):
+        if os.path.exists(f"{args.fin_path}.{n_parts}"):
             n_parts += 1
         else:
             break
@@ -302,7 +300,7 @@ def main():
         print(f"Processing part {part_id+1} of {n_parts}\n")
         fin_path = args.fin_path
         if part_id > 0:
-            fin_path += ".%d" % (part_id)
+            fin_path += f".{part_id}"
         with open(fin_path, "rb") as fin:
             read_tokens(fin, read_hparams(fin))
             copy_tensors(fin, fout, part_id, n_parts)