From 5e47e223bd74af8a16cd4f2900cfbcfb92cd1f18 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 15 Apr 2023 13:21:27 +0300 Subject: [PATCH] whisper : add Core ML support (#566) * coreml : use Core ML encoder inference * coreml : simplify whisper_encode + log messages * whisper : resolve rebase conflicts * coreml : add scripts for CoreML model generation * bench-all : recognize COREML flag --- .gitignore | 5 +- CMakeLists.txt | 68 +++++- Makefile | 48 ++-- coreml/whisper-decoder-impl.h | 146 ++++++++++++ coreml/whisper-decoder-impl.m | 201 +++++++++++++++++ coreml/whisper-encoder-impl.h | 142 ++++++++++++ coreml/whisper-encoder-impl.m | 197 ++++++++++++++++ coreml/whisper-encoder.h | 22 ++ coreml/whisper-encoder.mm | 67 ++++++ extra/bench-all.sh | 4 + models/convert-whisper-to-coreml.py | 334 ++++++++++++++++++++++++++++ models/download-coreml-model.sh | 82 +++++++ models/generate-coreml-interface.sh | 29 +++ models/generate-coreml-model.sh | 25 +++ whisper.cpp | 60 +++++ 15 files changed, 1404 insertions(+), 26 deletions(-) create mode 100644 coreml/whisper-decoder-impl.h create mode 100644 coreml/whisper-decoder-impl.m create mode 100644 coreml/whisper-encoder-impl.h create mode 100644 coreml/whisper-encoder-impl.m create mode 100644 coreml/whisper-encoder.h create mode 100644 coreml/whisper-encoder.mm create mode 100644 models/convert-whisper-to-coreml.py create mode 100755 models/download-coreml-model.sh create mode 100755 models/generate-coreml-interface.sh create mode 100755 models/generate-coreml-model.sh diff --git a/.gitignore b/.gitignore index 3d51e0b..67ec7c3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.o *.a .cache/ +.coreml/ .test/ .vs/ .vscode/ @@ -35,4 +36,6 @@ examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata extra/bench-gg.txt -*.mlmodel* +models/*.mlmodel +models/*.mlmodelc +models/*.mlpackage diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b6d4b7..3736423 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -58,6 +58,8 @@ if (APPLE) option(WHISPER_NO_AVX "whisper: disable AVX" OFF) option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF) option(WHISPER_NO_FMA "whisper: disable FMA" OFF) + + option(WHISPER_COREML "whisper: enable Core ML framework" OFF) else() option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF) endif() @@ -90,16 +92,33 @@ endif() find_package(Threads REQUIRED) -# on APPLE - include Accelerate framework -if (APPLE AND NOT WHISPER_NO_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") +# on APPLE +if (APPLE) + # include Accelerate framework + if (NOT WHISPER_NO_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) - set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) - set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) - else() - message(WARNING "Accelerate framework not found") + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) + else() + message(WARNING "Accelerate framework not found") + endif() + endif() + + if (WHISPER_COREML) + find_library(FOUNDATION_FRAMEWORK Foundation) + find_library(COREML_FRAMEWORK CoreML) + + if (COREML_FRAMEWORK) + message(STATUS "CoreML framework found") + + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML) + else() + message(WARNING "CoreML framework not found") + endif() endif() endif() @@ -187,6 +206,33 @@ if (WHISPER_PERF) set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF) endif() +# +# whisper.coreml - Core ML support +# + +if (WHISPER_COREML) + set(TARGET whisper.coreml) + + add_library(${TARGET} + coreml/whisper-encoder.h + coreml/whisper-encoder.mm + coreml/whisper-encoder-impl.h + coreml/whisper-encoder-impl.m + ) + + include(DefaultTargetOptions) + + 
target_include_directories(${TARGET} PUBLIC + . + ) + + target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK}) + + set_target_properties(${TARGET} PROPERTIES + COMPILE_FLAGS "-fobjc-arc" + ) +endif() + # # whisper - this is the main library of the project # @@ -206,6 +252,10 @@ target_include_directories(${TARGET} PUBLIC . ) +if (WHISPER_COREML) + target_link_libraries(${TARGET} PRIVATE whisper.coreml) +endif() + if (MSVC) target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/Makefile b/Makefile index 9454b3a..c452b88 100644 --- a/Makefile +++ b/Makefile @@ -140,6 +140,10 @@ ifndef WHISPER_NO_ACCELERATE LDFLAGS += -framework Accelerate endif endif +ifdef WHISPER_COREML + CXXFLAGS += -DWHISPER_USE_COREML + LDFLAGS += -framework Foundation -framework CoreML +endif ifdef WHISPER_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas LDFLAGS += -lopenblas @@ -195,11 +199,23 @@ ggml.o: ggml.c ggml.h whisper.o: whisper.cpp whisper.h ggml.h $(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o -libwhisper.a: ggml.o whisper.o - $(AR) rcs libwhisper.a ggml.o whisper.o +ifndef WHISPER_COREML +WHISPER_OBJ = whisper.o +else +whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h + $(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o -libwhisper.so: ggml.o whisper.o - $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS) +whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h + $(CXX) -O3 -I . 
-fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o + +WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o +endif + +libwhisper.a: ggml.o $(WHISPER_OBJ) + $(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ) + +libwhisper.so: ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS) clean: rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so @@ -213,24 +229,24 @@ CC_SDL=`sdl2-config --cflags --libs` SRC_COMMON = examples/common.cpp SRC_COMMON_SDL = examples/common-sdl.cpp -main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS) +main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS) ./main -h -bench: examples/bench/bench.cpp ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS) +bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS) -stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS) +stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS) -command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS) +command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) 
$(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS) -talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS) +talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS) -talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk-llama $(CC_SDL) $(LDFLAGS) +talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS) # # Audio samples diff --git a/coreml/whisper-decoder-impl.h b/coreml/whisper-decoder-impl.h new file mode 100644 index 0000000..c6f2e85 --- /dev/null +++ b/coreml/whisper-decoder-impl.h @@ -0,0 +1,146 @@ +// +// whisper-decoder-impl.h +// +// This file was automatically generated and should not be edited. 
+// + +#import +#import +#include +#include + +NS_ASSUME_NONNULL_BEGIN + + +/// Model Prediction Input Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface whisper_decoder_implInput : NSObject + +/// token_data as 1 by 1 matrix of 32-bit integers +@property (readwrite, nonatomic, strong) MLMultiArray * token_data; + +/// audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * audio_data; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data NS_DESIGNATED_INITIALIZER; + +@end + + +/// Model Prediction Output Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface whisper_decoder_implOutput : NSObject + +/// var_1346 as multidimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * var_1346; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER; + +@end + + +/// Class for model loading and prediction +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface whisper_decoder_impl : NSObject +@property (readonly, nonatomic, nullable) MLModel * model; + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle; + +/** + Initialize whisper_decoder_impl instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. 
+*/ +- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER; + +/** + Initialize whisper_decoder_impl instance with the model in this bundle. +*/ +- (nullable instancetype)init; + +/** + Initialize whisper_decoder_impl instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize whisper_decoder_impl instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize whisper_decoder_impl instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Construct whisper_decoder_impl instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. 
+ + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler; + +/** + Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object. +*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler; + +/** + Make a prediction using the standard interface + @param input an instance of whisper_decoder_implInput to predict from + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as whisper_decoder_implOutput +*/ +- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the standard interface + @param input an instance of whisper_decoder_implInput to predict from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. 
If you are not interested in possible errors, pass in NULL. + @return the prediction as whisper_decoder_implOutput +*/ +- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the convenience interface + @param token_data as 1 by 1 matrix of 32-bit integers: + @param audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats: + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as whisper_decoder_implOutput +*/ +- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Batch prediction + @param inputArray array of whisper_decoder_implInput instances to obtain predictions from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the predictions as NSArray +*/ +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; +@end + +NS_ASSUME_NONNULL_END diff --git a/coreml/whisper-decoder-impl.m b/coreml/whisper-decoder-impl.m new file mode 100644 index 0000000..34060e4 --- /dev/null +++ b/coreml/whisper-decoder-impl.m @@ -0,0 +1,201 @@ +// +// whisper-decoder-impl.m +// +// This file was automatically generated and should not be edited. 
+// + +#if !__has_feature(objc_arc) +#error This file must be compiled with automatic reference counting enabled (-fobjc-arc) +#endif + +#import "whisper-decoder-impl.h" + +@implementation whisper_decoder_implInput + +- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data { + self = [super init]; + if (self) { + _token_data = token_data; + _audio_data = audio_data; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"token_data", @"audio_data"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"token_data"]) { + return [MLFeatureValue featureValueWithMultiArray:self.token_data]; + } + if ([featureName isEqualToString:@"audio_data"]) { + return [MLFeatureValue featureValueWithMultiArray:self.audio_data]; + } + return nil; +} + +@end + +@implementation whisper_decoder_implOutput + +- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 { + self = [super init]; + if (self) { + _var_1346 = var_1346; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"var_1346"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"var_1346"]) { + return [MLFeatureValue featureValueWithMultiArray:self.var_1346]; + } + return nil; +} + +@end + +@implementation whisper_decoder_impl + + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle { + NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_decoder_impl" ofType:@"mlmodelc"]; + if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-decoder-impl.mlmodelc in the bundle resource"); return nil; } + return [NSURL fileURLWithPath:assetPath]; +} + + +/** + Initialize whisper_decoder_impl instance from an existing MLModel object. 
+ + Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. +*/ +- (instancetype)initWithMLModel:(MLModel *)model { + self = [super init]; + if (!self) { return nil; } + _model = model; + if (_model == nil) { return nil; } + return self; +} + + +/** + Initialize whisper_decoder_impl instance with the model in this bundle. +*/ +- (nullable instancetype)init { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil]; +} + + +/** + Initialize whisper_decoder_impl instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error]; +} + + +/** + Initialize whisper_decoder_impl instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Initialize whisper_decoder_impl instance from the model URL. 
+ + @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Construct whisper_decoder_impl instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler { + [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle] + configuration:configuration + completionHandler:handler]; +} + + +/** + Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. 
+ @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object. +*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler { + [MLModel loadContentsOfURL:modelURL + configuration:configuration + completionHandler:^(MLModel *model, NSError *error) { + if (model != nil) { + whisper_decoder_impl *typedModel = [[whisper_decoder_impl alloc] initWithMLModel:model]; + handler(typedModel, nil); + } else { + handler(nil, error); + } + }]; +} + +- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error]; +} + +- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id outFeatures = [self.model predictionFromFeatures:input options:options error:error]; + if (!outFeatures) { return nil; } + return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue]; +} + +- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error { + whisper_decoder_implInput *input_ = [[whisper_decoder_implInput alloc] initWithToken_data:token_data audio_data:audio_data]; + return [self predictionFromFeatures:input_ error:error]; +} + +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * 
_Nullable __autoreleasing * _Nullable)error { + id inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray]; + id outBatch = [self.model predictionsFromBatch:inBatch options:options error:error]; + if (!outBatch) { return nil; } + NSMutableArray *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count]; + for (NSInteger i = 0; i < outBatch.count; i++) { + id resultProvider = [outBatch featuresAtIndex:i]; + whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue]; + [results addObject:result]; + } + return results; +} + +@end diff --git a/coreml/whisper-encoder-impl.h b/coreml/whisper-encoder-impl.h new file mode 100644 index 0000000..ecb6155 --- /dev/null +++ b/coreml/whisper-encoder-impl.h @@ -0,0 +1,142 @@ +// +// whisper-encoder-impl.h +// +// This file was automatically generated and should not be edited. +// + +#import +#import +#include +#include + +NS_ASSUME_NONNULL_BEGIN + + +/// Model Prediction Input Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface whisper_encoder_implInput : NSObject + +/// logmel_data as 1 × 80 × 3000 3-dimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * logmel_data; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data NS_DESIGNATED_INITIALIZER; + +@end + + +/// Model Prediction Output Type +API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface whisper_encoder_implOutput : NSObject + +/// output as multidimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * output; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER; + +@end + + +/// Class for model loading and prediction 
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden"))) +@interface whisper_encoder_impl : NSObject +@property (readonly, nonatomic, nullable) MLModel * model; + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle; + +/** + Initialize whisper_encoder_impl instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. +*/ +- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER; + +/** + Initialize whisper_encoder_impl instance with the model in this bundle. +*/ +- (nullable instancetype)init; + +/** + Initialize whisper_encoder_impl instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize whisper_encoder_impl instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize whisper_encoder_impl instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl. 
+ @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Construct whisper_encoder_impl instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler; + +/** + Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object. 
+*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler; + +/** + Make a prediction using the standard interface + @param input an instance of whisper_encoder_implInput to predict from + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as whisper_encoder_implOutput +*/ +- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the standard interface + @param input an instance of whisper_encoder_implInput to predict from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as whisper_encoder_implOutput +*/ +- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the convenience interface + @param logmel_data as 1 × 80 × 3000 3-dimensional array of floats: + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. 
+ @return the prediction as whisper_encoder_implOutput +*/ +- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Batch prediction + @param inputArray array of whisper_encoder_implInput instances to obtain predictions from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the predictions as NSArray +*/ +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; +@end + +NS_ASSUME_NONNULL_END diff --git a/coreml/whisper-encoder-impl.m b/coreml/whisper-encoder-impl.m new file mode 100644 index 0000000..ee8e506 --- /dev/null +++ b/coreml/whisper-encoder-impl.m @@ -0,0 +1,197 @@ +// +// whisper-encoder-impl.m +// +// This file was automatically generated and should not be edited. 
+// + +#if !__has_feature(objc_arc) +#error This file must be compiled with automatic reference counting enabled (-fobjc-arc) +#endif + +#import "whisper-encoder-impl.h" + +@implementation whisper_encoder_implInput + +- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data { + self = [super init]; + if (self) { + _logmel_data = logmel_data; + } + return self; +} + +- (NSSet<NSString *> *)featureNames { + return [NSSet setWithArray:@[@"logmel_data"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"logmel_data"]) { + return [MLFeatureValue featureValueWithMultiArray:self.logmel_data]; + } + return nil; +} + +@end + +@implementation whisper_encoder_implOutput + +- (instancetype)initWithOutput:(MLMultiArray *)output { + self = [super init]; + if (self) { + _output = output; + } + return self; +} + +- (NSSet<NSString *> *)featureNames { + return [NSSet setWithArray:@[@"output"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"output"]) { + return [MLFeatureValue featureValueWithMultiArray:self.output]; + } + return nil; +} + +@end + +@implementation whisper_encoder_impl + + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle { + NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_encoder_impl" ofType:@"mlmodelc"]; + if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-encoder-impl.mlmodelc in the bundle resource"); return nil; } + return [NSURL fileURLWithPath:assetPath]; +} + + +/** + Initialize whisper_encoder_impl instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/ +- (instancetype)initWithMLModel:(MLModel *)model { + self = [super init]; + if (!self) { return nil; } + _model = model; + if (_model == nil) { return nil; } + return self; +} + + +/** + Initialize whisper_encoder_impl instance with the model in this bundle. +*/ +- (nullable instancetype)init { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil]; +} + + +/** + Initialize whisper_encoder_impl instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error]; +} + + +/** + Initialize whisper_encoder_impl instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Initialize whisper_encoder_impl instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. 
+*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Construct whisper_encoder_impl instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler { + [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle] + configuration:configuration + completionHandler:handler]; +} + + +/** + Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object. 
+*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler { + [MLModel loadContentsOfURL:modelURL + configuration:configuration + completionHandler:^(MLModel *model, NSError *error) { + if (model != nil) { + whisper_encoder_impl *typedModel = [[whisper_encoder_impl alloc] initWithMLModel:model]; + handler(typedModel, nil); + } else { + handler(nil, error); + } + }]; +} + +- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error]; +} + +- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error]; + if (!outFeatures) { return nil; } + return [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue]; +} + +- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error { + whisper_encoder_implInput *input_ = [[whisper_encoder_implInput alloc] initWithLogmel_data:logmel_data]; + return [self predictionFromFeatures:input_ error:error]; +} + +- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray]; + id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error]; + if (!outBatch) { return nil; } + NSMutableArray<whisper_encoder_implOutput*> *results =
[NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count]; + for (NSInteger i = 0; i < outBatch.count; i++) { + id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i]; + whisper_encoder_implOutput * result = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue]; + [results addObject:result]; + } + return results; +} + +@end diff --git a/coreml/whisper-encoder.h b/coreml/whisper-encoder.h new file mode 100644 index 0000000..84bbe41 --- /dev/null +++ b/coreml/whisper-encoder.h @@ -0,0 +1,22 @@ +// Wrapper of the Core ML Whisper Encoder model +// +// Code is derived from the work of Github user @wangchou +// ref: https://github.com/wangchou/callCoreMLFromCpp + +#if __cplusplus +extern "C" { +#endif + +struct whisper_coreml_context; + +struct whisper_coreml_context * whisper_coreml_init(const char * path_model); +void whisper_coreml_free(struct whisper_coreml_context * ctx); + +void whisper_coreml_encode( + const struct whisper_coreml_context * ctx, + float * mel, + float * out); + +#if __cplusplus +} +#endif diff --git a/coreml/whisper-encoder.mm b/coreml/whisper-encoder.mm new file mode 100644 index 0000000..dd08f0f --- /dev/null +++ b/coreml/whisper-encoder.mm @@ -0,0 +1,67 @@ +#import "coreml/whisper-encoder.h" +#import "coreml/whisper-encoder-impl.h" + +#import <CoreML/CoreML.h> + +#include <stdlib.h> + +#if __cplusplus +extern "C" { +#endif + +struct whisper_coreml_context { + const void * data; +}; + +struct whisper_coreml_context * whisper_coreml_init(const char * path_model) { + NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model]; + + NSURL * url_model = [NSURL fileURLWithPath: path_model_str]; + + const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]); + + if (data == NULL) { + return NULL; + } + + whisper_coreml_context * ctx = new whisper_coreml_context; + + ctx->data = data; + + return ctx; +} + +void whisper_coreml_free(struct
whisper_coreml_context * ctx) { + if (ctx != NULL) { CFRelease(ctx->data); } + delete ctx; +} + +void whisper_coreml_encode( + const whisper_coreml_context * ctx, + float * mel, + float * out) { + MLMultiArray * inMultiArray = [ + [MLMultiArray alloc] initWithDataPointer: mel + shape: @[@1, @80, @3000] + dataType: MLMultiArrayDataTypeFloat32 + strides: @[@(240000), @(3000), @1] + deallocator: nil + error: nil + ]; + + whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil]; + + MLMultiArray * outMA = outCoreML.output; + + //NSArray<NSNumber *> * shape = outMA.shape; + //NSArray<NSNumber *> * strides = outMA.strides; + + //printf("shape: %ld %ld %ld %ld\n", [shape[0] longValue], [shape[1] longValue], [shape[2] longValue], [shape[3] longValue]); + //printf("strides: %ld %ld %ld %ld\n", [strides[0] longValue], [strides[1] longValue], [strides[2] longValue], [strides[3] longValue]); + + memcpy(out, outMA.dataPointer, outMA.count * sizeof(float)); +} + +#if __cplusplus +} +#endif diff --git a/extra/bench-all.sh b/extra/bench-all.sh index fbfc877..9297378 100755 --- a/extra/bench-all.sh +++ b/extra/bench-all.sh @@ -64,6 +64,10 @@ for model in "${models[@]}"; do config="$config BLAS" fi + if [[ $system_info == *"COREML = 1"* ]]; then + config="$config COREML" + fi + commit=$(git rev-parse --short HEAD) printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n" diff --git a/models/convert-whisper-to-coreml.py b/models/convert-whisper-to-coreml.py new file mode 100644 index 0000000..489854e --- /dev/null +++ b/models/convert-whisper-to-coreml.py @@ -0,0 +1,334 @@ +import argparse +import torch +import torch.nn.functional as F +import coremltools as ct + +from torch import Tensor +from torch import nn +from typing import Dict +from typing import Optional +from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase +from coremltools.models.neural_network.quantization_utils import quantize_weights +from
whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions +from whisper import load_model + +# Use for changing dim of input in encoder and decoder embeddings +def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """ + Unsqueeze twice to map nn.Linear weights to nn.Conv2d weights + """ + for k in state_dict: + is_attention = all(substr in k for substr in ['attn', '.weight']) + is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']]) + + if (is_attention or is_mlp) and len(state_dict[k].shape) == 2: + state_dict[k] = state_dict[k][:, :, None, None] + + +def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata, + strict, missing_keys, + unexpected_keys, error_msgs): + state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight'] + return state_dict + +class LayerNormANE(LayerNormANEBase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._register_load_state_dict_pre_hook( + correct_for_bias_scale_order_inversion) + +class MultiHeadAttentionANE(MultiHeadAttention): + def __init__(self, n_state: int, n_head: int): + super().__init__(n_state, n_head) + + setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1)) + setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)) + setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1)) + setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1)) + + def forward(self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None): + + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. 
+ k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. + k = kv_cache[self.key] + v = kv_cache[self.value] + + wv, qk = self.qkv_attention_ane(q, k, v, mask) + + return self.out(wv), qk + + def qkv_attention_ane(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None): + + _, dim, _, seqlen = q.size() + + dim_per_head = dim // self.n_head + + scale = float(dim_per_head)**-0.5 + + q = q * scale + + mh_q = q.split(dim_per_head, dim=1) + mh_k = k.transpose(1,3).split(dim_per_head, dim=3) + mh_v = v.split(dim_per_head, dim=1) + + mh_qk = [ + torch.einsum('bchq,bkhc->bkhq', [qi, ki]) + for qi, ki in zip(mh_q, mh_k) + ] # (batch_size, max_seq_length, 1, max_seq_length) * n_heads + + if mask is not None: + for head_idx in range(self.n_head): + mh_qk[head_idx] = mh_qk[head_idx] + mask[:, :seqlen, :, :seqlen] + + attn_weights = [aw.softmax(dim=1) for aw in mh_qk] # (batch_size, max_seq_length, 1, max_seq_length) * n_heads + attn = [torch.einsum('bkhq,bchk->bchq', wi, vi) for wi, vi in zip(attn_weights, mh_v)] # (batch_size, dim_per_head, 1, max_seq_length) * n_heads + attn = torch.cat(attn, dim=1) # (batch_size, dim, 1, max_seq_length) + + return attn, torch.cat(mh_qk, dim=1).float().detach() + + +class ResidualAttentionBlockANE(ResidualAttentionBlock): + def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): + super().__init__(n_state, n_head, cross_attention) + + setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head)) + setattr(self, 'attn_ln', LayerNormANE(n_state)) + + setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None) + setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None) + + n_mlp = n_state * 4 + setattr(self, 'mlp', nn.Sequential( + nn.Conv2d(n_state, n_mlp, kernel_size=1), + nn.GELU(), + nn.Conv2d(n_mlp, n_state, kernel_size=1) 
+ )) + setattr(self, 'mlp_ln', LayerNormANE(n_state)) + + +class AudioEncoderANE(AudioEncoder): + def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): + super().__init__(n_mels, n_ctx, n_state, n_head, n_layer) + + setattr(self, 'blocks', nn.ModuleList( + [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)] + )) + setattr(self, 'ln_post', LayerNormANE(n_state)) + + def forward(self, x: Tensor): + """ + x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + + assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape" + + # Add positional embedding and add dummy dim for ANE + x = (x + self.positional_embedding.transpose(0,1)).to(x.dtype).unsqueeze(2) + + for block in self.blocks: + x = block(x) + + x = self.ln_post(x) + + # """ + # TODO: + # I think we need to transpose the result here to make it fit whisper.cpp memory order. + # However, even doing this, the results are still wrong. Kind of less wrong compared to + # not transposing, but still wrong. 
+ + # Also, I don't know why the original OpenAI implementation does not need to transpose + + # transpose to (batch_size, n_ctx, n_state) + # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx) + + # """ + # x = x.transpose(1,3) + + return x + +class TextDecoderANE(TextDecoder): + + def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): + super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer) + + setattr(self, 'blocks', nn.ModuleList( + [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)] + )) + setattr(self, 'ln', LayerNormANE(n_state)) + + def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): + """ + x : torch.LongTensor, shape = (batch_size, <= n_ctx) + the text tokens + xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx) + the encoded audio features to be attended on + """ + offset = next(iter(kv_cache.values())).shape[3] if kv_cache else 0 + x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]] + x = x.to(xa.dtype) + + # Reformat for ANE + mask = self.mask[None, None, :, :].permute(0,3,1,2) + x = x.transpose(1,2).unsqueeze(2) + + for block in self.blocks: + x = block(x, xa, mask=mask, kv_cache=kv_cache) + + x = self.ln(x) + + # Reformat back from ANE + x = x.permute(0,2,3,1).squeeze(0) + + # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks + if self.token_embedding.weight.shape[0] == 51865: + # split in 11 chunks - 4715 each + splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0) + logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1) + else: + # split in 12 chunks - 4322 each + assert(self.token_embedding.weight.shape[0] == 51864) + splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//12, dim=0) + logits = 
torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1) + + return logits + +class WhisperANE(Whisper): + def __init__(self, dims: ModelDimensions): + super().__init__(dims) + + setattr(self, 'encoder', AudioEncoderANE( + self.dims.n_mels, + self.dims.n_audio_ctx, + self.dims.n_audio_state, + self.dims.n_audio_head, + self.dims.n_audio_layer, + )) + setattr(self, 'decoder', TextDecoderANE( + self.dims.n_vocab, + self.dims.n_text_ctx, + self.dims.n_text_state, + self.dims.n_text_head, + self.dims.n_text_layer, + )) + + self._register_load_state_dict_pre_hook(linear_to_conv2d_map) + + def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]: + return self.decoder(tokens, self.encoder(mel)) + + def install_kv_cache_hooks(self, cache: Optional[dict] = None): + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache or output.shape[3] > self.decoder.positional_embedding.shape[0]: + cache[module] = output # save as-is, for the first token or cross attention + else: + cache[module] = torch.cat([cache[module], output], dim=3).detach() + return cache[module] + + def install_hooks(layer: nn.Module): + if isinstance(layer, MultiHeadAttentionANE): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + self.decoder.apply(install_hooks) + return cache, hooks + +def convert_encoder(hparams, model, quantize=False): + model.eval() + + input_shape = (1, 80, 3000) + input_data = torch.randn(input_shape) + traced_model = torch.jit.trace(model, input_data) + + model = ct.convert( + traced_model, + convert_to=None if quantize else "mlprogram", # convert will fail if weights are quantized, not sure why + inputs=[ct.TensorType(name="logmel_data", shape=input_shape)], + outputs=[ct.TensorType(name="output")], + compute_units=ct.ComputeUnit.ALL + ) + + if quantize: + model = 
quantize_weights(model, nbits=16) + + return model + +def convert_decoder(hparams, model, quantize=False): + model.eval() + + tokens_shape = (1, 1) + audio_shape = (1, hparams.n_audio_state, 1, 1500) + + audio_data = torch.randn(audio_shape) + token_data = torch.randint(50257, tokens_shape).long() + traced_model = torch.jit.trace(model, (token_data, audio_data)) + + model = ct.convert( + traced_model, + convert_to=None if quantize else "mlprogram", # convert will fail if weights are quantized, not sure why + inputs=[ + ct.TensorType(name="token_data", shape=tokens_shape, dtype=int), + ct.TensorType(name="audio_data", shape=audio_shape) + ] + ) + + if quantize: + model = quantize_weights(model, nbits=16) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large)", required=True) + parser.add_argument("--encoder-only", type=lambda v: v.lower() in ("1", "true", "yes"), help="only convert encoder", default=False) + parser.add_argument("--quantize", type=lambda v: v.lower() in ("1", "true", "yes"), help="quantize weights to F16", default=False) + parser.add_argument("--optimize-ane", type=lambda v: v.lower() in ("1", "true", "yes"), help="optimize for ANE execution (currently broken)", default=False) + args = parser.parse_args() + + if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]: + raise ValueError("Invalid model name") + + whisper = load_model(args.model).cpu() + hparams = whisper.dims + print(hparams) + + if args.optimize_ane: + whisperANE = WhisperANE(hparams).eval() + whisperANE.load_state_dict(whisper.state_dict()) + + encoder = whisperANE.encoder + decoder = whisperANE.decoder + else: + encoder = whisper.encoder + decoder = whisper.decoder + + # Convert encoder + encoder = convert_encoder(hparams, encoder, quantize=args.quantize) + encoder.save(f"models/coreml-encoder-{args.model}.mlpackage") + + if args.encoder_only is False: + # Convert
decoder + decoder = convert_decoder(hparams, decoder, quantize=args.quantize) + decoder.save(f"models/coreml-decoder-{args.model}.mlpackage") + + print("done converting") diff --git a/models/download-coreml-model.sh b/models/download-coreml-model.sh new file mode 100755 index 0000000..d46789d --- /dev/null +++ b/models/download-coreml-model.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# This script downloads Whisper model files that have already been converted to Core ML format. +# This way you don't have to convert them yourself. + +src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml" +pfx="resolve/main/ggml" + +# get the path of this script +function get_script_path() { + if [ -x "$(command -v realpath)" ]; then + echo "$(dirname "$(realpath "$0")")" + else + local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)" + echo "$ret" + fi +} + +models_path="$(get_script_path)" + +# Whisper models +models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 <model>\n" + list_models + + exit 1 +fi + +model=$1 + +if [[ ! " ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download Core ML model + +printf "Downloading Core ML model $model from '$src' ...\n" + +cd "$models_path" || exit 1 + +if [ -f "ggml-$model.mlmodel" ]; then + printf "Model $model already exists. Skipping download.\n" + exit 0 +fi + +if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel +elif [ -x "$(command -v curl)" ]; then + curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel +else + printf "Either wget or curl is required to download models.\n" + exit 1 +fi + + +if [ $?
-ne 0 ]; then + printf "Failed to download Core ML model $model \n" + printf "Please try again later or download the original Whisper model files and convert them yourself.\n" + exit 1 +fi + +printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n" +printf "Run the following command to compile it:\n\n" +printf " $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n" +printf "You can now use it like this:\n\n" +printf " $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n" +printf "\n" diff --git a/models/generate-coreml-interface.sh b/models/generate-coreml-interface.sh new file mode 100755 index 0000000..553d5f6 --- /dev/null +++ b/models/generate-coreml-interface.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# This generates: +# - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m +# - coreml/whisper-decoder-impl.h and coreml/whisper-decoder-impl.m +# + +wd=$(dirname "$0") +cd "$wd/../" + +python3 models/convert-whisper-to-coreml.py --model tiny.en + +mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage +xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/ +mv coreml/whisper_encoder_impl.h coreml/whisper-encoder-impl.h +mv coreml/whisper_encoder_impl.m coreml/whisper-encoder-impl.m +sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.m +sed -i '' 's/whisper_encoder_impl\.m/whisper-encoder-impl.m/g' coreml/whisper-encoder-impl.m +sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.h + +mv -v models/coreml-decoder-tiny.en.mlpackage models/whisper-decoder-impl.mlpackage +xcrun coremlc generate models/whisper-decoder-impl.mlpackage coreml/ +mv coreml/whisper_decoder_impl.h coreml/whisper-decoder-impl.h +mv coreml/whisper_decoder_impl.m coreml/whisper-decoder-impl.m +sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.m +sed -i '' 
's/whisper_decoder_impl\.m/whisper-decoder-impl.m/g' coreml/whisper-decoder-impl.m +sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.h + +rm -rfv models/whisper-encoder-impl.mlpackage models/whisper-decoder-impl.mlpackage diff --git a/models/generate-coreml-model.sh b/models/generate-coreml-model.sh new file mode 100755 index 0000000..29d6b1d --- /dev/null +++ b/models/generate-coreml-model.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Usage: ./generate-coreml-model.sh <model-name> +if [ $# -eq 0 ] + then + echo "No model name supplied" + echo "Usage: ./generate-coreml-model.sh <model-name>" + exit 1 +fi + +mname="$1" + +wd=$(dirname "$0") +cd "$wd/../" + +python3 models/convert-whisper-to-coreml.py --model "$mname" --encoder-only True + +xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/ +rm -rf models/ggml-${mname}-encoder.mlmodelc +mv -v models/coreml-encoder-${mname}.mlmodelc models/ggml-${mname}-encoder.mlmodelc + +# TODO: decoder (sometime in the future maybe) +#xcrun coremlc compile models/whisper-decoder-${mname}.mlpackage models/ +#rm -rf models/ggml-${mname}-decoder.mlmodelc +#mv -v models/coreml_decoder_${mname}.mlmodelc models/ggml-${mname}-decoder.mlmodelc diff --git a/whisper.cpp b/whisper.cpp index 0d67715..178766e 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -1,5 +1,8 @@ #define WHISPER_BUILD #include "whisper.h" +#ifdef WHISPER_USE_COREML +#include "coreml/whisper-encoder.h" +#endif #include "ggml.h" @@ -586,6 +589,11 @@ struct whisper_state { int lang_id = 0; // english by default + std::string path_model; // populated by whisper_init_from_file() +#ifdef WHISPER_USE_COREML + whisper_coreml_context * ctx_coreml = nullptr; +#endif + // [EXPERIMENTAL] token-level timestamps data int64_t t_beg = 0; int64_t t_last = 0; @@ -1376,6 +1384,7 @@ static bool whisper_encode_internal( } } +#ifndef WHISPER_USE_COREML struct ggml_tensor * cur; // convolution + gelu @@ -1683,6 +1692,13 @@ static bool whisper_encode_internal(
//ggml_graph_print(&gf); } +#else + wstate.use_buf(ctx0, -1); + + struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); + + whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data); +#endif // cur //{ @@ -2470,6 +2486,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons // interface implementation // +#ifdef WHISPER_USE_COREML +// replace the file extension (e.g. .bin) with -encoder.mlmodelc; a '.' inside a directory component is not an extension +static std::string whisper_get_coreml_path_encoder(std::string path_bin) { + auto pos = path_bin.rfind('.'); + if (pos != std::string::npos && path_bin.find_first_of("/\\", pos) == std::string::npos) { + path_bin = path_bin.substr(0, pos); + } + + path_bin += "-encoder.mlmodelc"; + + return path_bin; +} +#endif + struct whisper_state * whisper_init_state(whisper_context * ctx) { whisper_state * state = new whisper_state; @@ -2497,6 +2527,21 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } +#ifdef WHISPER_USE_COREML + const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model); + + fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str()); + fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__); + + state->ctx_coreml = whisper_coreml_init(path_coreml.c_str()); + if (!state->ctx_coreml) { + fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str()); + delete state; return nullptr; // do not leak the partially initialized state + } + + fprintf(stderr, "%s: Core ML model loaded\n", __func__); +#endif + state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx); state->logits_id.reserve(ctx->model.hparams.n_vocab); @@ -2531,6 +2576,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model } loader.context = &fin; + loader.read = [](void * ctx, void * output, size_t read_size) { std::ifstream * fin = (std::ifstream*)ctx; fin->read((char *)output, read_size); @@ -2663,6 +2709,11 @@ void
whisper_free_state(struct whisper_state * state) kv_cache_free(state->decoders[i].kv_self); } +#ifdef WHISPER_USE_COREML + whisper_coreml_free(state->ctx_coreml); + state->ctx_coreml = nullptr; +#endif + delete state; } } @@ -3084,6 +3135,14 @@ void whisper_reset_timings(struct whisper_context * ctx) { } } +static int whisper_has_coreml(void) { +#ifdef WHISPER_USE_COREML + return 1; +#else + return 0; +#endif +} + const char * whisper_print_system_info(void) { static std::string s; @@ -3100,6 +3159,7 @@ const char * whisper_print_system_info(void) { s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "COREML = " + std::to_string(whisper_has_coreml()) + " | "; return s.c_str(); }