whisper : add Core ML support (#566)

* coreml : use Core ML encoder inference * coreml : simlpify whisper_encode + log messages * whisper : resolve rebase conflicts * coreml : add scripts for CoreML model generation * bench-all : recognize COREML flag
2023-04-15 13:21:27 +03:00 · 2023-04-15 13:21:27 +03:00 · 5e47e223bd
parent 794ff3074a
commit 5e47e223bd
15 changed files with 1404 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 *.o
 *.a
 .cache/
+.coreml/
 .test/
 .vs/
 .vscode/
@ -35,4 +36,6 @@ examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

 extra/bench-gg.txt

-*.mlmodel*
+models/*.mlmodel
+models/*.mlmodelc
+models/*.mlpackage
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -58,6 +58,8 @@ if (APPLE)
    option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
    option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
    option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
+
+    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
    option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@ -90,16 +92,33 @@ endif()

 find_package(Threads REQUIRED)

-# on APPLE - include Accelerate framework
-if (APPLE AND NOT WHISPER_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
+# on APPLE
+if (APPLE)
+    # include Accelerate framework
+    if (NOT WHISPER_NO_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate)

-        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
+        if (ACCELERATE_FRAMEWORK)
+            message(STATUS "Accelerate framework found")
+
+            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+        else()
+            message(WARNING "Accelerate framework not found")
+        endif()
+    endif()
+
+    if (WHISPER_COREML)
+        find_library(FOUNDATION_FRAMEWORK Foundation)
+        find_library(COREML_FRAMEWORK CoreML)
+
+        if (COREML_FRAMEWORK)
+            message(STATUS "CoreML framework found")
+
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
+        else()
+            message(WARNING "CoreML framework not found")
+        endif()
    endif()
 endif()

@ -187,6 +206,33 @@ if (WHISPER_PERF)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()

+#
+# whisper.coreml - Core ML support
+#
+
+if (WHISPER_COREML)
+    set(TARGET whisper.coreml)
+
+    add_library(${TARGET}
+        coreml/whisper-encoder.h
+        coreml/whisper-encoder.mm
+        coreml/whisper-encoder-impl.h
+        coreml/whisper-encoder-impl.m
+        )
+
+    include(DefaultTargetOptions)
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
+
+    set_target_properties(${TARGET} PROPERTIES
+        COMPILE_FLAGS "-fobjc-arc"
+        )
+endif()
+
 #
 # whisper - this is the main library of the project
 #
@ -206,6 +252,10 @@ target_include_directories(${TARGET} PUBLIC
    .
    )

+if (WHISPER_COREML)
+    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
+endif()
+
 if (MSVC)
    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

--- a/48
+++ b/48
@ -140,6 +140,10 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
+ifdef WHISPER_COREML
+	CXXFLAGS += -DWHISPER_USE_COREML
+	LDFLAGS  += -framework Foundation -framework CoreML
+endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@ -195,11 +199,23 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h ggml.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o

-libwhisper.a: ggml.o whisper.o
-	$(AR) rcs libwhisper.a ggml.o whisper.o
+ifndef WHISPER_COREML
+WHISPER_OBJ = whisper.o
+else
+whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
+	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o

-libwhisper.so: ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
+whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
+	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
+
+WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+endif
+
+libwhisper.a: ggml.o $(WHISPER_OBJ)
+	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
+
+libwhisper.so: ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)

 clean:
 	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
@ -213,24 +229,24 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp

-main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
 	./main -h

-bench: examples/bench/bench.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)

-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)

-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)

-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk-llama $(CC_SDL) $(LDFLAGS)
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)

 #
 # Audio samples
--- a/coreml/whisper-decoder-impl.h
+++ b/coreml/whisper-decoder-impl.h
@ -0,0 +1,146 @@
+//
+// whisper-decoder-impl.h
+//
+// This file was automatically generated and should not be edited.
+//
+
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#include <stdint.h>
+#include <os/log.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+
+/// Model Prediction Input Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_implInput : NSObject<MLFeatureProvider>
+
+/// token_data as 1 by 1 matrix of 32-bit integers
+@property (readwrite, nonatomic, strong) MLMultiArray * token_data;
+
+/// audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * audio_data;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Model Prediction Output Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>
+
+/// var_1346 as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Class for model loading and prediction
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_decoder_impl : NSObject
+@property (readonly, nonatomic, nullable) MLModel * model;
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
+ (nullable NSURL *)URLOfModelInThisBundle;
+
+/**
+    Initialize whisper_decoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init;
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_decoder_implInput to predict from
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_decoder_implInput to predict from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the convenience interface
+    @param token_data as 1 by 1 matrix of 32-bit integers:
+    @param audio_data as 1 × 384 × 1 × 1500 4-dimensional array of floats:
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_decoder_implOutput
+*/
+- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Batch prediction
+    @param inputArray array of whisper_decoder_implInput instances to obtain predictions from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the predictions as NSArray<whisper_decoder_implOutput *>
+*/
+- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+@end
+
+NS_ASSUME_NONNULL_END
--- a/coreml/whisper-decoder-impl.m
+++ b/coreml/whisper-decoder-impl.m
@ -0,0 +1,201 @@
+//
+// whisper-decoder-impl.m
+//
+// This file was automatically generated and should not be edited.
+//
+
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "whisper-decoder-impl.h"
+
+@implementation whisper_decoder_implInput
+
+- (instancetype)initWithToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data {
+    self = [super init];
+    if (self) {
+        _token_data = token_data;
+        _audio_data = audio_data;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"token_data", @"audio_data"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"token_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.token_data];
+    }
+    if ([featureName isEqualToString:@"audio_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.audio_data];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_decoder_implOutput
+
+- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
+    self = [super init];
+    if (self) {
+        _var_1346 = var_1346;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"var_1346"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"var_1346"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_decoder_impl
+
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
+ (nullable NSURL *)URLOfModelInThisBundle {
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_decoder_impl" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-decoder-impl.mlmodelc in the bundle resource"); return nil; }
+    return [NSURL fileURLWithPath:assetPath];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_decoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model {
+    self = [super init];
+    if (!self) { return nil; }
+    _model = model;
+    if (_model == nil) { return nil; }
+    return self;
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Initialize whisper_decoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_decoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
+              configuration:configuration
+          completionHandler:handler];
+}
+
+
+/**
+    Construct whisper_decoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_decoder_impl instance or NSError object.
+*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_decoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [MLModel loadContentsOfURL:modelURL
+                 configuration:configuration
+             completionHandler:^(MLModel *model, NSError *error) {
+        if (model != nil) {
+            whisper_decoder_impl *typedModel = [[whisper_decoder_impl alloc] initWithMLModel:model];
+            handler(typedModel, nil);
+        } else {
+            handler(nil, error);
+        }
+    }];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
+    if (!outFeatures) { return nil; }
+    return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
+}
+
+- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    whisper_decoder_implInput *input_ = [[whisper_decoder_implInput alloc] initWithToken_data:token_data audio_data:audio_data];
+    return [self predictionFromFeatures:input_ error:error];
+}
+
+- (nullable NSArray<whisper_decoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_decoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
+    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
+    if (!outBatch) { return nil; }
+    NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    for (NSInteger i = 0; i < outBatch.count; i++) {
+        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
+        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
+        [results addObject:result];
+    }
+    return results;
+}
+
+@end
--- a/coreml/whisper-encoder-impl.h
+++ b/coreml/whisper-encoder-impl.h
@ -0,0 +1,142 @@
+//
+// whisper-encoder-impl.h
+//
+// This file was automatically generated and should not be edited.
+//
+
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>
+#include <stdint.h>
+#include <os/log.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+
+/// Model Prediction Input Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_implInput : NSObject<MLFeatureProvider>
+
+/// logmel_data as 1 × 80 × 3000 3-dimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * logmel_data;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Model Prediction Output Type
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_implOutput : NSObject<MLFeatureProvider>
+
+/// output as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * output;
+- (instancetype)init NS_UNAVAILABLE;
+- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
+
+@end
+
+
+/// Class for model loading and prediction
+API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
+@interface whisper_encoder_impl : NSObject
+@property (readonly, nonatomic, nullable) MLModel * model;
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
+ (nullable NSURL *)URLOfModelInThisBundle;
+
+/**
+    Initialize whisper_encoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init;
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_encoder_implInput to predict from
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_encoder_implOutput
+*/
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the standard interface
+    @param input an instance of whisper_encoder_implInput to predict from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_encoder_implOutput
+*/
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Make a prediction using the convenience interface
+    @param logmel_data as 1 × 80 × 3000 3-dimensional array of floats:
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the prediction as whisper_encoder_implOutput
+*/
+- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+
+/**
+    Batch prediction
+    @param inputArray array of whisper_encoder_implInput instances to obtain predictions from
+    @param options prediction options
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+    @return the predictions as NSArray<whisper_encoder_implOutput *>
+*/
+- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
+@end
+
+NS_ASSUME_NONNULL_END
--- a/coreml/whisper-encoder-impl.m
+++ b/coreml/whisper-encoder-impl.m
@ -0,0 +1,197 @@
+//
+// whisper-encoder-impl.m
+//
+// This file was automatically generated and should not be edited.
+//
+
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "whisper-encoder-impl.h"
+
+@implementation whisper_encoder_implInput
+
+- (instancetype)initWithLogmel_data:(MLMultiArray *)logmel_data {
+    self = [super init];
+    if (self) {
+        _logmel_data = logmel_data;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"logmel_data"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"logmel_data"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.logmel_data];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_encoder_implOutput
+
+- (instancetype)initWithOutput:(MLMultiArray *)output {
+    self = [super init];
+    if (self) {
+        _output = output;
+    }
+    return self;
+}
+
+- (NSSet<NSString *> *)featureNames {
+    return [NSSet setWithArray:@[@"output"]];
+}
+
+- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
+    if ([featureName isEqualToString:@"output"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.output];
+    }
+    return nil;
+}
+
+@end
+
+@implementation whisper_encoder_impl
+
+
+/**
+    URL of the underlying .mlmodelc directory.
+*/
+ (nullable NSURL *)URLOfModelInThisBundle {
+    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"whisper_encoder_impl" ofType:@"mlmodelc"];
+    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load whisper-encoder-impl.mlmodelc in the bundle resource"); return nil; }
+    return [NSURL fileURLWithPath:assetPath];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance from an existing MLModel object.
+
+    Usually the application does not use this initializer unless it makes a subclass of whisper_encoder_impl.
+    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
+*/
+- (instancetype)initWithMLModel:(MLModel *)model {
+    self = [super init];
+    if (!self) { return nil; }
+    _model = model;
+    if (_model == nil) { return nil; }
+    return self;
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+*/
+- (nullable instancetype)init {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance with the model in this bundle.
+
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Initialize whisper_encoder_impl instance from the model URL.
+
+    @param modelURL URL to the .mlmodelc directory for whisper_encoder_impl.
+    @param configuration The model configuration object
+    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
+*/
+- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
+    if (model == nil) { return nil; }
+    return [self initWithMLModel:model];
+}
+
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with configuration.
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
+              configuration:configuration
+          completionHandler:handler];
+}
+
+
+/**
+    Construct whisper_encoder_impl instance asynchronously with URL of .mlmodelc directory and optional configuration.
+
+    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
+
+    @param modelURL The model URL.
+    @param configuration The model configuration
+    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid whisper_encoder_impl instance or NSError object.
+*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(whisper_encoder_impl * _Nullable model, NSError * _Nullable error))handler {
+    [MLModel loadContentsOfURL:modelURL
+                 configuration:configuration
+             completionHandler:^(MLModel *model, NSError *error) {
+        if (model != nil) {
+            whisper_encoder_impl *typedModel = [[whisper_encoder_impl alloc] initWithMLModel:model];
+            handler(typedModel, nil);
+        } else {
+            handler(nil, error);
+        }
+    }];
+}
+
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
+}
+
+- (nullable whisper_encoder_implOutput *)predictionFromFeatures:(whisper_encoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
+    if (!outFeatures) { return nil; }
+    return [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
+}
+
+- (nullable whisper_encoder_implOutput *)predictionFromLogmel_data:(MLMultiArray *)logmel_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    whisper_encoder_implInput *input_ = [[whisper_encoder_implInput alloc] initWithLogmel_data:logmel_data];
+    return [self predictionFromFeatures:input_ error:error];
+}
+
+- (nullable NSArray<whisper_encoder_implOutput *> *)predictionsFromInputs:(NSArray<whisper_encoder_implInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
+    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
+    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
+    if (!outBatch) { return nil; }
+    NSMutableArray<whisper_encoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
+    for (NSInteger i = 0; i < outBatch.count; i++) {
+        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
+        whisper_encoder_implOutput * result = [[whisper_encoder_implOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
+        [results addObject:result];
+    }
+    return results;
+}
+
+@end
--- a/coreml/whisper-encoder.h
+++ b/coreml/whisper-encoder.h
@ -0,0 +1,22 @@
+// Wrapper of the Core ML Whisper Encoder model
+//
+// Code is derived from the work of Github user @wangchou
+// ref: https://github.com/wangchou/callCoreMLFromCpp
+
+#if __cplusplus
+extern "C" {
+#endif
+
+struct whisper_coreml_context;
+
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
+void whisper_coreml_free(struct whisper_coreml_context * ctx);
+
+void whisper_coreml_encode(
+        const whisper_coreml_context * ctx,
+                               float * mel,
+                               float * out);
+
+#if __cplusplus
+}
+#endif
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -0,0 +1,67 @@
+#import "coreml/whisper-encoder.h"
+#import "coreml/whisper-encoder-impl.h"
+
+#import <CoreML/CoreML.h>
+
+#include <stdlib.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+struct whisper_coreml_context {
+    const void * data;
+};
+
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
+    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
+
+    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
+
+    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);
+
+    if (data == NULL) {
+        return NULL;
+    }
+
+    whisper_coreml_context * ctx = new whisper_coreml_context;
+
+    ctx->data = data;
+
+    return ctx;
+}
+
+void whisper_coreml_free(struct whisper_coreml_context * ctx) {
+    CFRelease(ctx->data);
+    delete ctx;
+}
+
+void whisper_coreml_encode(
+        const whisper_coreml_context * ctx,
+                               float * mel,
+                               float * out) {
+    MLMultiArray * inMultiArray = [
+        [MLMultiArray alloc] initWithDataPointer: mel
+                                           shape: @[@1, @80, @3000]
+                                        dataType: MLMultiArrayDataTypeFloat32
+                                         strides: @[@(240000), @(3000), @1]
+                                     deallocator: nil
+                                           error: nil
+    ];
+
+    whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
+
+    MLMultiArray * outMA = outCoreML.output;
+
+    //NSArray<NSNumber *> * shape = outMA.shape;
+    //NSArray<NSNumber *> * strides = outMA.strides;
+
+    //printf("shape:   %ld %ld %ld %ld\n", [shape[0] longValue], [shape[1] longValue], [shape[2] longValue], [shape[3] longValue]);
+    //printf("strides: %ld %ld %ld %ld\n", [strides[0] longValue], [strides[1] longValue], [strides[2] longValue], [strides[3] longValue]);
+
+    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
+}
+
+#if __cplusplus
+}
+#endif
--- a/extra/bench-all.sh
+++ b/extra/bench-all.sh
@ -64,6 +64,10 @@ for model in "${models[@]}"; do
        config="$config BLAS"
    fi

+    if [[ $system_info == *"COREML = 1"* ]]; then
+        config="$config COREML"
+    fi
+
    commit=$(git rev-parse --short HEAD)

    printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
--- a/models/convert-whisper-to-coreml.py
+++ b/models/convert-whisper-to-coreml.py
@ -0,0 +1,334 @@
+import argparse
+import torch
+import torch.nn.functional as F
+import coremltools as ct
+
+from torch import Tensor
+from torch import nn
+from typing import Dict
+from typing import Optional
+from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
+from coremltools.models.neural_network.quantization_utils import quantize_weights
+from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
+from whisper import load_model
+
+# Use for changing dim of input in encoder and decoder embeddings
+def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
+                         missing_keys, unexpected_keys, error_msgs):
+    """
+    Unsqueeze twice to map nn.Linear weights to nn.Conv2d weights
+    """
+    for k in state_dict:
+        is_attention = all(substr in k for substr in ['attn', '.weight'])
+        is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']])
+
+        if (is_attention or is_mlp) and len(state_dict[k].shape) == 2:
+            state_dict[k] = state_dict[k][:, :, None, None]
+
+
+def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
+                                           strict, missing_keys,
+                                           unexpected_keys, error_msgs):
+    state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
+    return state_dict
+
+class LayerNormANE(LayerNormANEBase):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._register_load_state_dict_pre_hook(
+            correct_for_bias_scale_order_inversion)
+
+class MultiHeadAttentionANE(MultiHeadAttention):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__(n_state, n_head)
+
+        setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1))
+        setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False))
+        setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1))
+        setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))
+
+    def forward(self,
+                x: Tensor,
+                xa: Optional[Tensor] = None,
+                mask: Optional[Tensor] = None,
+                kv_cache: Optional[dict] = None):
+
+        q = self.query(x)
+
+        if kv_cache is None or xa is None or self.key not in kv_cache:
+            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+            # otherwise, perform key/value projections for self- or cross-attention as usual.
+            k = self.key(x if xa is None else xa)
+            v = self.value(x if xa is None else xa)
+
+        else:
+            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+            k = kv_cache[self.key]
+            v = kv_cache[self.value]
+
+        wv, qk = self.qkv_attention_ane(q, k, v, mask)
+
+        return self.out(wv), qk
+
+    def qkv_attention_ane(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None):
+
+        _, dim, _, seqlen = q.size()
+
+        dim_per_head = dim // self.n_head
+
+        scale = float(dim_per_head)**-0.5
+
+        q = q * scale
+
+        mh_q = q.split(dim_per_head, dim=1)
+        mh_k = k.transpose(1,3).split(dim_per_head, dim=3)
+        mh_v = v.split(dim_per_head, dim=1)
+
+        mh_qk = [
+            torch.einsum('bchq,bkhc->bkhq', [qi, ki])
+            for qi, ki in zip(mh_q, mh_k)
+        ]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
+
+        if mask is not None:
+            for head_idx in range(self.n_head):
+                mh_qk[head_idx] = mh_qk[head_idx] + mask[:, :seqlen, :, :seqlen]
+
+        attn_weights = [aw.softmax(dim=1) for aw in mh_qk]  # (batch_size, max_seq_length, 1, max_seq_length) * n_heads
+        attn = [torch.einsum('bkhq,bchk->bchq', wi, vi) for wi, vi in zip(attn_weights, mh_v)]  # (batch_size, dim_per_head, 1, max_seq_length) * n_heads
+        attn = torch.cat(attn, dim=1)  # (batch_size, dim, 1, max_seq_length)
+
+        return attn, torch.cat(mh_qk, dim=1).float().detach()
+
+
+class ResidualAttentionBlockANE(ResidualAttentionBlock):
+    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+        super().__init__(n_state, n_head, cross_attention)
+
+        setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head))
+        setattr(self, 'attn_ln', LayerNormANE(n_state))
+
+        setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
+        setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)
+
+        n_mlp = n_state * 4
+        setattr(self, 'mlp', nn.Sequential(
+            nn.Conv2d(n_state, n_mlp, kernel_size=1),
+            nn.GELU(),
+            nn.Conv2d(n_mlp, n_state, kernel_size=1)
+        ))
+        setattr(self, 'mlp_ln', LayerNormANE(n_state))
+
+
+class AudioEncoderANE(AudioEncoder):
+    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)
+
+        setattr(self, 'blocks', nn.ModuleList(
+            [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
+        ))
+        setattr(self, 'ln_post', LayerNormANE(n_state))
+
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+            the mel spectrogram of the audio
+        """
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+
+        assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape"
+
+        # Add positional embedding and add dummy dim for ANE
+        x = (x + self.positional_embedding.transpose(0,1)).to(x.dtype).unsqueeze(2)
+
+        for block in self.blocks:
+            x = block(x)
+
+        x = self.ln_post(x)
+
+        # """
+        # TODO:
+        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
+        # However, even doing this, the results are still wrong. Kind of less wrong compared to
+        # not transposing, but still wrong.
+
+        # Also, I don't know why the original OpenAI implementation does not need to transpose
+
+        # transpose to (batch_size, n_ctx, n_state)
+        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
+
+        # """
+        # x = x.transpose(1,3)
+
+        return x
+
+class TextDecoderANE(TextDecoder):
+
+    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
+        super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)
+
+        setattr(self, 'blocks', nn.ModuleList(
+            [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
+        ))
+        setattr(self, 'ln', LayerNormANE(n_state))
+
+    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+        """
+        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+            the text tokens
+        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
+            the encoded audio features to be attended on
+        """
+        offset = next(iter(kv_cache.values())).shape[3] if kv_cache else 0
+        x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]]
+        x = x.to(xa.dtype)
+
+        # Reformat for ANE
+        mask = self.mask[None, None, :, :].permute(0,3,1,2)
+        x = x.transpose(1,2).unsqueeze(2)
+
+        for block in self.blocks:
+            x = block(x, xa, mask=mask, kv_cache=kv_cache)
+
+        x = self.ln(x)
+
+        # Reformat back from ANE
+        x = x.permute(0,2,3,1).squeeze(0)
+
+        # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
+        if self.token_embedding.weight.shape[0] == 51865:
+            # split in 11 chunks - 4715 each
+            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
+            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
+        else:
+            # split in 12 chunks - 4322 each
+            assert(self.token_embedding.weight.shape[0] == 51864)
+            splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//12, dim=0)
+            logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
+
+        return logits
+
+class WhisperANE(Whisper):
+    def __init__(self, dims: ModelDimensions):
+        super().__init__(dims)
+
+        setattr(self, 'encoder', AudioEncoderANE(
+            self.dims.n_mels,
+            self.dims.n_audio_ctx,
+            self.dims.n_audio_state,
+            self.dims.n_audio_head,
+            self.dims.n_audio_layer,
+        ))
+        setattr(self, 'decoder', TextDecoderANE(
+            self.dims.n_vocab,
+            self.dims.n_text_ctx,
+            self.dims.n_text_state,
+            self.dims.n_text_head,
+            self.dims.n_text_layer,
+        ))
+
+        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)
+
+    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> Dict[str, torch.Tensor]:
+        return self.decoder(tokens, self.encoder(mel))
+
+    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+        cache = {**cache} if cache is not None else {}
+        hooks = []
+
+        def save_to_cache(module, _, output):
+            if module not in cache or output.shape[3] > self.decoder.positional_embedding.shape[0]:
+                cache[module] = output  # save as-is, for the first token or cross attention
+            else:
+                cache[module] = torch.cat([cache[module], output], dim=3).detach()
+            return cache[module]
+
+        def install_hooks(layer: nn.Module):
+            if isinstance(layer, MultiHeadAttentionANE):
+                hooks.append(layer.key.register_forward_hook(save_to_cache))
+                hooks.append(layer.value.register_forward_hook(save_to_cache))
+
+        self.decoder.apply(install_hooks)
+        return cache, hooks
+
+def convert_encoder(hparams, model, quantize=False):
+    model.eval()
+
+    input_shape = (1, 80, 3000)
+    input_data = torch.randn(input_shape)
+    traced_model = torch.jit.trace(model, input_data)
+
+    model = ct.convert(
+        traced_model,
+        convert_to=None if quantize else "mlprogram", # convert will fail if weights are quantized, not sure why
+        inputs=[ct.TensorType(name="logmel_data", shape=input_shape)],
+        outputs=[ct.TensorType(name="output")],
+        compute_units=ct.ComputeUnit.ALL
+    )
+
+    if quantize:
+        model = quantize_weights(model, nbits=16)
+
+    return model
+
+def convert_decoder(hparams, model, quantize=False):
+    model.eval()
+
+    tokens_shape = (1, 1)
+    audio_shape = (1, hparams.n_audio_state, 1, 1500)
+
+    audio_data = torch.randn(audio_shape)
+    token_data = torch.randint(50257, tokens_shape).long()
+    traced_model = torch.jit.trace(model, (token_data, audio_data))
+
+    model = ct.convert(
+        traced_model,
+        convert_to=None if quantize else "mlprogram", # convert will fail if weights are quantized, not sure why
+        inputs=[
+            ct.TensorType(name="token_data", shape=tokens_shape, dtype=int),
+            ct.TensorType(name="audio_data", shape=audio_shape)
+        ]
+    )
+
+    if quantize:
+        model = quantize_weights(model, nbits=16)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large)", required=True)
+    parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
+    parser.add_argument("--quantize",     type=bool, help="quantize weights to F16", default=False)
+    parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
+    args = parser.parse_args()
+
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"]:
+        raise ValueError("Invalid model name")
+
+    whisper = load_model(args.model).cpu()
+    hparams = whisper.dims
+    print(hparams)
+
+    if args.optimize_ane:
+        whisperANE = WhisperANE(hparams).eval()
+        whisperANE.load_state_dict(whisper.state_dict())
+
+        encoder = whisperANE.encoder
+        decoder = whisperANE.decoder
+    else:
+        encoder = whisper.encoder
+        decoder = whisper.decoder
+
+    # Convert encoder
+    encoder = convert_encoder(hparams, encoder, quantize=args.quantize)
+    encoder.save(f"models/coreml-encoder-{args.model}.mlpackage")
+
+    if args.encoder_only is False:
+        # Convert decoder
+        decoder = convert_decoder(hparams, decoder, quantize=args.quantize)
+        decoder.save(f"models/coreml-decoder-{args.model}.mlpackage")
+
+    print("done converting")
--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@ -0,0 +1,82 @@
+#!/bin/bash
+
+# This script downloads Whisper model files that have already been converted to Core ML format.
+# This way you don't have to convert them yourself.
+
+src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
+pfx="resolve/main/ggml"
+
+# get the path of this script
+function get_script_path() {
+    if [ -x "$(command -v realpath)" ]; then
+        echo "$(dirname $(realpath $0))"
+    else
+        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
+        echo "$ret"
+    fi
+}
+
+models_path="$(get_script_path)"
+
+# Whisper models
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+
+# list available models
+function list_models {
+    printf "\n"
+    printf "  Available models:"
+    for model in "${models[@]}"; do
+        printf " $model"
+    done
+    printf "\n\n"
+}
+
+if [ "$#" -ne 1 ]; then
+    printf "Usage: $0 <model>\n"
+    list_models
+
+    exit 1
+fi
+
+model=$1
+
+if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
+    printf "Invalid model: $model\n"
+    list_models
+
+    exit 1
+fi
+
+# download Core ML model
+
+printf "Downloading Core ML model $model from '$src' ...\n"
+
+cd $models_path
+
+if [ -f "ggml-$model.mlmodel" ]; then
+    printf "Model $model already exists. Skipping download.\n"
+    exit 0
+fi
+
+if [ -x "$(command -v wget)" ]; then
+    wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+elif [ -x "$(command -v curl)" ]; then
+    curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+else
+    printf "Either wget or curl is required to download models.\n"
+    exit 1
+fi
+
+
+if [ $? -ne 0 ]; then
+    printf "Failed to download Core ML model $model \n"
+    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
+    exit 1
+fi
+
+printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
+printf "Run the following command to compile it:\n\n"
+printf "  $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
+printf "You can now use it like this:\n\n"
+printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
+printf "\n"
--- a/models/generate-coreml-interface.sh
+++ b/models/generate-coreml-interface.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# This generates:
+#   - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m
+#   - coreml/whisper-decoder-impl.h and coreml/whisper-decoder-impl.m
+#
+
+wd=$(dirname "$0")
+cd "$wd/../"
+
+python3 models/convert-whisper-to-coreml.py --model tiny.en
+
+mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
+xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/
+mv coreml/whisper_encoder_impl.h coreml/whisper-encoder-impl.h
+mv coreml/whisper_encoder_impl.m coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.m/whisper-encoder-impl.m/g' coreml/whisper-encoder-impl.m
+sed -i '' 's/whisper_encoder_impl\.h/whisper-encoder-impl.h/g' coreml/whisper-encoder-impl.h
+
+mv -v models/coreml-decoder-tiny.en.mlpackage models/whisper-decoder-impl.mlpackage
+xcrun coremlc generate models/whisper-decoder-impl.mlpackage coreml/
+mv coreml/whisper_decoder_impl.h coreml/whisper-decoder-impl.h
+mv coreml/whisper_decoder_impl.m coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.m/whisper-decoder-impl.m/g' coreml/whisper-decoder-impl.m
+sed -i '' 's/whisper_decoder_impl\.h/whisper-decoder-impl.h/g' coreml/whisper-decoder-impl.h
+
+rm -rfv models/whisper-encoder-impl.mlpackage models/whisper-decoder-impl.mlpackage
--- a/models/generate-coreml-model.sh
+++ b/models/generate-coreml-model.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Usage: ./generate-coreml-model.sh <model-name>
+if [ $# -eq 0 ]
+  then
+    echo "No model name supplied"
+    echo "Usage: ./generate-coreml-model.sh <model-name>"
+    exit 1
+fi
+
+mname="$1"
+
+wd=$(dirname "$0")
+cd "$wd/../"
+
+python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
+
+xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
+rm -rf models/ggml-${mname}-encoder.mlmodelc
+mv -v models/coreml-encoder-${mname}.mlmodelc models/ggml-${mname}-encoder.mlmodelc
+
+# TODO: decoder (sometime in the future maybe)
+#xcrun coremlc compile models/whisper-decoder-${mname}.mlpackage models/
+#rm -rf models/ggml-${mname}-decoder.mlmodelc
+#mv -v models/coreml_decoder_${mname}.mlmodelc models/ggml-${mname}-decoder.mlmodelc
--- a/whisper.cpp
+++ b/whisper.cpp
@ -1,5 +1,8 @@
 #define WHISPER_BUILD
 #include "whisper.h"
+#if WHISPER_USE_COREML
+#include "coreml/whisper-encoder.h"
+#endif

 #include "ggml.h"

@ -586,6 +589,11 @@ struct whisper_state {

    int lang_id = 0; // english by default

+    std::string path_model; // populated by whisper_init_from_file()
+#ifdef WHISPER_USE_COREML
+    whisper_coreml_context * ctx_coreml;
+#endif
+
    // [EXPERIMENTAL] token-level timestamps data
    int64_t t_beg = 0;
    int64_t t_last = 0;
@ -1376,6 +1384,7 @@ static bool whisper_encode_internal(
        }
    }

+#ifndef WHISPER_USE_COREML
    struct ggml_tensor * cur;

    // convolution + gelu
@ -1683,6 +1692,13 @@ static bool whisper_encode_internal(

        //ggml_graph_print(&gf);
    }
+#else
+    wstate.use_buf(ctx0, -1);
+
+    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+
+    whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+#endif

    // cur
    //{
@ -2470,6 +2486,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //

+#ifdef WHISPER_USE_COREML
+// replace .bin with -encoder.mlmodelc
+static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
+    auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += "-encoder.mlmodelc";
+
+    return path_bin;
+}
+#endif
+
 struct whisper_state * whisper_init_state(whisper_context * ctx) {
    whisper_state * state = new whisper_state;

@ -2497,6 +2527,21 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
        fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
    }

+#ifdef WHISPER_USE_COREML
+    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
+
+    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+
+    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
+    if (!state->ctx_coreml) {
+        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+        return nullptr;
+    }
+
+    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+#endif
+
    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);

    state->logits_id.reserve(ctx->model.hparams.n_vocab);
@ -2531,6 +2576,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
    }

    loader.context = &fin;
+
    loader.read = [](void * ctx, void * output, size_t read_size) {
        std::ifstream * fin = (std::ifstream*)ctx;
        fin->read((char *)output, read_size);
@ -2663,6 +2709,11 @@ void whisper_free_state(struct whisper_state * state)
            kv_cache_free(state->decoders[i].kv_self);
        }

+#ifdef WHISPER_USE_COREML
+        whisper_coreml_free(state->ctx_coreml);
+        state->ctx_coreml = nullptr;
+#endif
+
        delete state;
    }
 }
@ -3084,6 +3135,14 @@ void whisper_reset_timings(struct whisper_context * ctx) {
    }
 }

+static int whisper_has_coreml(void) {
+#ifdef WHISPER_USE_COREML
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 const char * whisper_print_system_info(void) {
    static std::string s;

@ -3100,6 +3159,7 @@ const char * whisper_print_system_info(void) {
    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
+    s += "COREML = "    + std::to_string(whisper_has_coreml())     + " | ";

    return s.c_str();
 }