From 86870114eb14c459533034a1dd6f9b722cea0f9f Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Tue, 31 Dec 2024 09:29:41 +0800
Subject: [PATCH] [CoreML] support coreml model cache (#23065)

### Description
Refactor compute plan profiling.
Support caching the compiled CoreML model to speed up session initialization. Caching is only enabled when the user provides a cache directory, and the user is responsible for managing the cached files.
With the cache, session initialization time can be reduced by 50% or more:

|model| before| after|
|--|--|--|
|yolo11.onnx| 0.6s|0.1s|
|yolo11-fp16.onnx|1.8s|0.1s|

### Motivation and Context

---------

Co-authored-by: wejoncy
Co-authored-by: Scott McKay
---
 .../coreml/coreml_provider_factory.h          |  16 ++
 onnxruntime/core/platform/env.h               |   2 +
 onnxruntime/core/platform/posix/env.cc        |   8 +
 onnxruntime/core/platform/windows/env.cc      |  10 +
 onnxruntime/core/platform/windows/env.h       |   2 +
 .../coreml/builders/model_builder.cc          |  92 ++++++-
 .../providers/coreml/builders/model_builder.h |   6 +-
 .../coreml/coreml_execution_provider.cc       |  30 ++-
 .../core/providers/coreml/coreml_options.cc   |   4 +
 .../core/providers/coreml/coreml_options.h    |   8 +
 .../core/providers/coreml/model/model.mm      | 225 ++++++++++++------
 .../test/perftest/command_args_parser.cc      |   1 +
 onnxruntime/test/perftest/ort_test_session.cc |   4 +-
 .../providers/coreml/coreml_basic_test.cc     | 113 ++++++++-
 14 files changed, 434 insertions(+), 87 deletions(-)

diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index d035fd34bd072..351eafc2a4675 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -61,6 +61,22 @@ static const char* const kCoremlProviderOption_SpecializationStrategy = "Special
 static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
 // please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
 static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
+// Specify the directory in which to cache any CoreML models created from the ONNX model.
+// The CoreML EP converts each ONNX subgraph to a CoreML model and saves it to disk.
+// If this path is not specified, the model is saved to a temporary directory and deleted after the session is closed.
+// Otherwise, the model is saved to the specified path and the user is responsible for deleting it.
+//
+// We do NOT detect whether the ONNX model has changed and no longer matches the cached model.
+// The user should carefully manage the cache when modifying or replacing a model.
+// The cache key is generated from:
+// 1. The user-provided key in metadata_props, if found (preferred)
+// 2. A hash of the model URL the inference session was created with
+// 3. A hash of the input/output names of the model
+// See the onnxruntime API documentation for how to set metadata_props: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#configuration-options
+static const char* const kCoremlProviderOption_ModelCacheDirectory = "ModelCacheDirectory";
+
+// User-provided cache key in metadata_props.
+static const char* const kCOREML_CACHE_KEY = "COREML_CACHE_KEY"; #ifdef __cplusplus extern "C" { diff --git a/onnxruntime/core/platform/env.h b/onnxruntime/core/platform/env.h index c42b31e64d129..7dbc3fe82db47 100644 --- a/onnxruntime/core/platform/env.h +++ b/onnxruntime/core/platform/env.h @@ -197,6 +197,7 @@ class Env { #ifdef _WIN32 /// \brief Returns true if the directory exists. virtual bool FolderExists(const std::wstring& path) const = 0; + virtual bool FileExists(const std::wstring& path) const = 0; /// \brief Recursively creates the directory, if it doesn't exist. virtual common::Status CreateFolder(const std::wstring& path) const = 0; // Mainly for use with protobuf library @@ -206,6 +207,7 @@ class Env { #endif /// \brief Returns true if the directory exists. virtual bool FolderExists(const std::string& path) const = 0; + virtual bool FileExists(const std::string& path) const = 0; /// \brief Recursively creates the directory, if it doesn't exist. virtual common::Status CreateFolder(const std::string& path) const = 0; // Recursively deletes the directory and its contents. diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index 04cf5ff6a3329..94aadf3df4d7e 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -471,6 +471,14 @@ class PosixEnv : public Env { return S_ISDIR(sb.st_mode); } + bool FileExists(const std::string& path) const override { + struct stat sb; + if (stat(path.c_str(), &sb)) { + return false; + } + return S_ISREG(sb.st_mode); + } + common::Status CreateFolder(const std::string& path) const override { size_t pos = 0; do { diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 73319cd9c9b1c..4fccad6dfeb37 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -483,6 +483,16 @@ bool WindowsEnv::FolderExists(const std::string& path) const { return (attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY); } +bool WindowsEnv::FileExists(const std::wstring& path) const { + DWORD attributes = GetFileAttributesW(path.c_str()); + return (attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_NORMAL); +} + +bool WindowsEnv::FileExists(const std::string& path) const { + DWORD attributes = GetFileAttributesA(path.c_str()); + return (attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_NORMAL); +} + common::Status WindowsEnv::CreateFolder(const std::wstring& path) const { size_t pos = 0; do { diff --git a/onnxruntime/core/platform/windows/env.h b/onnxruntime/core/platform/windows/env.h index 395aface1d809..05b92bb6a21eb 100644 --- a/onnxruntime/core/platform/windows/env.h +++ b/onnxruntime/core/platform/windows/env.h @@ -68,6 +68,8 @@ class WindowsEnv : public Env { MappedMemoryPtr& mapped_memory) const override; bool FolderExists(const std::wstring& path) const override; bool FolderExists(const std::string& path) const override; + bool FileExists(const std::wstring& path) const override; + bool FileExists(const std::string& path) const override; common::Status CreateFolder(const std::wstring& path) const override; common::Status CreateFolder(const std::string& path) const override; common::Status DeleteFolder(const PathString& path) const override; diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc index 6486942199df7..f8952301d59a9 100644 --- 
a/onnxruntime/core/providers/coreml/builders/model_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc @@ -390,13 +390,66 @@ void CreateEmptyFile(const std::string& filename) { #endif // defined(COREML_ENABLE_MLPROGRAM) -std::string GetModelOutputPath(bool create_ml_program) { - // path is used to create the ML Package directory for ML Program, and for the model directly otherwise. - auto path = util::GetTemporaryFilePath(); - if (!create_ml_program) { - path += ".model.mlmodel"; - } +std::string GetModelOutputPath(const CoreMLOptions& coreml_options, + const GraphViewer& graph_viewer, + const logging::Logger& logger) { + const std::string& subgraph_name = graph_viewer.Name(); + std::string path; + if (coreml_options.ModelCacheDirectory().empty()) { + // path is used to create the ML Package directory for ML Program, and for the model directly otherwise. + path = util::GetTemporaryFilePath(); + if (!coreml_options.CreateMLProgram()) { + path += ".model.mlmodel"; + } + } else { + // subgraph_name is uniquely generated by + // onnxruntime/core/providers/coreml/coreml_execution_provider.cc::gen_metadef_name + // int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); + // MakeString(user_provide_key, "_", COREML, "_", model_hash, "_", metadef_id); + std::string_view cache_key = std::string_view(subgraph_name) + .substr(0, subgraph_name.find_first_of("_")); + // subgraph_short_name is metadef_id + std::string_view subgraph_short_name = std::string_view(subgraph_name) + .substr(subgraph_name.find_last_of("_") + 1); + path = MakeString(std::string(coreml_options.ModelCacheDirectory()), "/", cache_key); + + std::string model_file_path = path + "/model.txt"; + + path = MakeString(path, "/", subgraph_short_name); + // Set the model cache path with setting of RequireStaticShape and ModelFormat + if (coreml_options.RequireStaticShape()) { + path += "_static"; + } else { + path += "_dynamic"; + } + if (coreml_options.CreateMLProgram()) { + path += "_mlprogram"; + } else { + path += "_nn"; + } + if (!Env::Default().CreateFolder(path).IsOK()) { + LOGS(logger, ERROR) << "Failed to create cache directory `" << path << "`. Model caching is disabled."; + coreml_options.DisableModelCache(); + return GetModelOutputPath(coreml_options, graph_viewer, logger); + } + path += "/model"; + // Write the model path to a file in the cache directory. + // This is for developers to know what the cached model is as we used a hash for the directory name. 
+ if (!Env::Default().FileExists(ToPathString(model_file_path))) { + const Graph* main_graph = &graph_viewer.GetGraph(); + while (main_graph->IsSubgraph()) { + main_graph = main_graph->ParentGraph(); + } + std::ofstream file(model_file_path); + if (!file.is_open()) { + LOGS(logger, ERROR) << "Failed to open file " << model_file_path; + } else { + file << main_graph->ModelPath().string(); + file.close(); + } + } + } return path; } } // namespace @@ -410,10 +463,21 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge coreml_version_(coreml_version), coreml_options_(coreml_options), create_ml_program_(coreml_options.CreateMLProgram()), - model_output_path_(GetModelOutputPath(create_ml_program_)), + model_output_path_(GetModelOutputPath(coreml_options_, graph_viewer, logger)), // coreml_options_ must be set before this onnx_input_names_(std::move(onnx_input_names)), onnx_output_names_(std::move(onnx_output_names)), coreml_model_(std::make_unique()) { + // GetTemporaryFilePath() always produce a unique path for the model and this is not existed + // Mlprogram will create a folder while NN create a file + if (Env::Default().FolderExists(ToPathString(model_output_path_)) || + Env::Default().FileExists(ToPathString(model_output_path_))) { + is_model_cached_ = true; + LOGS(logger, INFO) << "Model is already cached in " << model_output_path_ + << " and will be reused. If you want to update the model or hit other issues, " + << "please consider to clear the cache and retry."; + return; + } + if (create_ml_program_) { #if defined(COREML_ENABLE_MLPROGRAM) coreml_model_->set_specificationversion(CoreMLSpecVersion()); @@ -847,6 +911,10 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i input_output_info_.emplace(name, OnnxTensorInfo{data_type, shape}); + if (IsModelCached()) { + return Status::OK(); + } + #if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { if (is_input) { @@ -1056,8 +1124,14 @@ Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logge ModelBuilder builder(graph_viewer, logger, coreml_version, coreml_options, std::move(onnx_input_names), std::move(onnx_output_names)); - ORT_RETURN_IF_ERROR(builder.CreateModel()); - ORT_RETURN_IF_ERROR(builder.SaveModel()); + if (!builder.IsModelCached()) { + ORT_RETURN_IF_ERROR(builder.CreateModel()); + ORT_RETURN_IF_ERROR(builder.SaveModel()); + } else { + // runtime requires the input/output names to be passed + ORT_RETURN_IF_ERROR(builder.RegisterModelInputs()); + ORT_RETURN_IF_ERROR(builder.RegisterModelOutputs()); + } return builder.LoadModel(model); } diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index e19597cf0dc2e..28c7dc42da581 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -54,6 +54,7 @@ class ModelBuilder { // We only support CoreML 3 and later so the spec version is always version + 1. 
int32_t CoreMLVersion() const { return coreml_version_; } int32_t CoreMLSpecVersion() const { return coreml_version_ + 1; } + bool IsModelCached() const { return is_model_cached_; } // Returns true if we are creating an ML Program bool CreateMLProgram() const { @@ -218,8 +219,9 @@ class ModelBuilder { const logging::Logger& logger_; const int32_t coreml_version_; CoreMLOptions coreml_options_; - const bool create_ml_program_; // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old) - const std::string model_output_path_; // create_ml_program_ ? dir for mlpackage : filename for mlmodel + const bool create_ml_program_; // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old) + std::string model_output_path_; // create_ml_program_ ? dir for mlpackage : filename for mlmodel + bool is_model_cached_{false}; std::vector onnx_input_names_; std::vector onnx_output_names_; diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index 5a2867e5524e4..b6bb4f2c1d66a 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -18,6 +18,7 @@ #include "core/providers/coreml/model/host_utils.h" #include "core/providers/coreml/model/model.h" #include "core/providers/coreml/shape_utils.h" +#include "core/graph/model.h" namespace onnxruntime { @@ -52,12 +53,37 @@ CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie const auto builder_params = coreml::MakeOpBuilderParams(graph_viewer, coreml_version_, coreml_options_.RequireStaticShape(), coreml_options_.CreateMLProgram()); const auto supported_nodes = coreml::GetSupportedNodes(graph_viewer, builder_params, logger); - + const Graph* main_graph = &graph_viewer.GetGraph(); + while (main_graph->IsSubgraph()) { + main_graph = main_graph->ParentGraph(); + } + const auto& metadata = main_graph->GetModel().MetaData(); + + std::string user_provided_key = metadata.count(kCOREML_CACHE_KEY) > 0 + ? metadata.at(kCOREML_CACHE_KEY) + : ""; + if (user_provided_key.size() > 64 || + std::any_of(user_provided_key.begin(), user_provided_key.end(), + [](unsigned char c) { return !std::isalnum(c); })) { + LOGS(logger, ERROR) << "[" << kCOREML_CACHE_KEY << ":" << user_provided_key << "] is not a valid cache key." + << " It should be alphanumeric and less than 64 characters."; + user_provided_key = ""; + } const auto gen_metadef_name = [&]() { HashValue model_hash; int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); - return MakeString(COREML, "_", model_hash, "_", metadef_id); + // use model_hash as the key if user doesn't provide one + if (user_provided_key.empty()) { + // user passed a empty string + // model_hash is a 64-bit hash value of model_path if model_path is not empty, + // otherwise it hashes the graph input names and all the node output names. + // it can't guarantee the uniqueness of the key, so user should manager the key for the best. + user_provided_key = std::to_string(model_hash); + } + // The string format is used by onnxruntime/core/providers/coreml/builders/model_builder.cc::GetModelOutputPath + // If the format changes, the function should be updated accordingly. 
+ return MakeString(user_provided_key, "_", COREML, "_", model_hash, "_", metadef_id); }; result = utils::CreateSupportedPartitions(graph_viewer, supported_nodes, {}, diff --git a/onnxruntime/core/providers/coreml/coreml_options.cc b/onnxruntime/core/providers/coreml/coreml_options.cc index 4ec780208e528..14ae55de9266b 100644 --- a/onnxruntime/core/providers/coreml/coreml_options.cc +++ b/onnxruntime/core/providers/coreml/coreml_options.cc @@ -5,6 +5,7 @@ #include "core/providers/coreml/coreml_provider_factory.h" // defines flags #include "core/providers/coreml/model/host_utils.h" #include "core/providers/coreml/builders/helper.h" +#include "core/platform/env.h" namespace onnxruntime { @@ -71,6 +72,7 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option kCoremlProviderOption_SpecializationStrategy, kCoremlProviderOption_ProfileComputePlan, kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU, + kCoremlProviderOption_ModelCacheDirectory, }; // Validate the options for (const auto& option : options) { @@ -103,6 +105,8 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option profile_compute_plan_ = option.second == "1"; } else if (kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU == option.first) { allow_low_precision_accumulation_on_gpu_ = option.second == "1"; + } else if (kCoremlProviderOption_ModelCacheDirectory == option.first) { + model_cache_directory_ = option.second; } } } diff --git a/onnxruntime/core/providers/coreml/coreml_options.h b/onnxruntime/core/providers/coreml/coreml_options.h index fd05c96927bd1..d7ee04b3f8a79 100644 --- a/onnxruntime/core/providers/coreml/coreml_options.h +++ b/onnxruntime/core/providers/coreml/coreml_options.h @@ -17,6 +17,9 @@ class CoreMLOptions { std::string strategy_; bool profile_compute_plan_{false}; bool allow_low_precision_accumulation_on_gpu_{false}; + // path to store the converted coreml model + // we may run DisableModelCache() to disable model caching + mutable std::string model_cache_directory_; public: explicit CoreMLOptions(uint32_t coreml_flags); @@ -32,6 +35,11 @@ class CoreMLOptions { bool UseStrategy(std::string_view strategy) const { return strategy_ == strategy; } bool ProfileComputePlan() const { return profile_compute_plan_ && create_mlprogram_; } + std::string_view ModelCacheDirectory() const { return model_cache_directory_; } + // The options specified by the user are const, but if there's an error setting up caching we disable it + // so that the EP can still be used. The error is logged for the user to investigate. + void DisableModelCache() const { model_cache_directory_.clear(); } + private: void ValidateAndParseProviderOption(const ProviderOptions& options); }; diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 755dbfbd6e68c..5211b89ec17c6 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -301,53 +301,144 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array, return Status::OK(); } -// since __clang_major__ >= 15, MLComputePlan is introduced in -// We are actually ensure the MacOS/IOS version and Xcode version is greater than `macOS 14.4, iOS 17.4`. -// The macro API_AVAILABLE should also be fine. +// since macos(14.4), ios(17.4), MLComputePlan is introduced in // Otherwise, the compiler will complain `MLComputePlan` is not defined. 
-// we define __clang_analyzer__ here is for bypass static analysis +#if __has_include() +#define HAS_COREMLPLAN 1 +#else +#define HAS_COREMLPLAN 0 +#endif + +#if HAS_COREMLPLAN +API_AVAILABLE(macos(14.4), ios(17.4), tvos(17.4), watchos(10.4)) +void ProfileBlock(MLComputePlan* _Nullable computePlan, MLModelStructureProgramBlock* block) { + for (MLModelStructureProgramOperation* operation in block.operations) { + for (size_t i = 0; i < operation.blocks.count; ++i) { + ProfileBlock(computePlan, operation.blocks[i]); + } + // Get the compute device usage for the operation. + MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation]; + id preferredDevice = computeDeviceUsage.preferredComputeDevice; + // Get the estimated cost of executing the operation. + MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation]; + if (![operation.operatorName isEqualToString:@"const"]) { + NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight); + } + } +} +#endif + +// since macos(14.4), ios(17.4), MLComputePlan is introduced in +// Otherwise, the compiler will complain `MLComputePlan` is not defined. +API_AVAILABLE(macos(14.4), ios(17.4), tvos(17.4), watchos(10.4)) void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) { -#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__) - if (@available(macOS 14.4, iOS 17.4, *)) { - [MLComputePlan loadContentsOfURL:compileUrl - configuration:config - completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) { - if (!computePlan) { - NSLog(@"Error loading compute plan: %@", error); - // Handle error. - return; - } - MLModelStructureProgram* program = computePlan.modelStructure.program; - if (!program) { - NSLog(@"Error loading program from compute plan., this is not a mlprogram model"); - return; - } - - MLModelStructureProgramFunction* mainFunction = program.functions[@"main"]; - if (!mainFunction) { - NSLog(@"Error loading main function from program"); - return; - } - - NSArray* operations = mainFunction.block.operations; - NSLog(@"Number of operations, 'const' node is included. : %lu", operations.count); - for (MLModelStructureProgramOperation* operation in operations) { - // Get the compute device usage for the operation. - MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation]; - id preferredDevice = computeDeviceUsage.preferredComputeDevice; - // Get the estimated cost of executing the operation. - MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation]; - if (![operation.operatorName isEqualToString:@"const"]) { - NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight); - } - } +#if HAS_COREMLPLAN + dispatch_semaphore_t fd_sema = dispatch_semaphore_create(0); + [MLComputePlan loadContentsOfURL:compileUrl + configuration:config + completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) { + if (!computePlan) { + NSLog(@"Error loading compute plan: %@", error); + // Handle error. 
+ return; + } + MLModelStructureProgram* program = computePlan.modelStructure.program; + if (!program) { + NSLog(@"Error loading program from compute plan., this is not a mlprogram model"); + return; + } + + [computePlan.modelStructure.program.functions enumerateKeysAndObjectsUsingBlock:^(NSString* function_name, + MLModelStructureProgramFunction* function, + BOOL* _Nonnull __unused stop) { + NSLog(@"profile function : %@", function_name); + ProfileBlock(computePlan, function.block); + dispatch_semaphore_signal(fd_sema); }]; + }]; + long status = dispatch_semaphore_wait(fd_sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(5 * 60 * NSEC_PER_SEC))); + if (status != 0) { + NSLog(@"profile function : timeout"); + } +#endif +} + +#if __has_include() +#define HAS_COREMLOPTIMIZATIONHINT 1 +#else +#define HAS_COREMLOPTIMIZATIONHINT 0 +#endif + +API_AVAILABLE_COREML8 +void ConfigureOptimizationHints(MLModelConfiguration* config, const CoreMLOptions& coreml_options) { +#if HAS_COREMLOPTIMIZATIONHINT + MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init]; + if (coreml_options.UseStrategy("FastPrediction")) { + optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction; + config.optimizationHints = optimizationHints; + } else if (coreml_options.UseStrategy("Default")) { + optimizationHints.specializationStrategy = MLSpecializationStrategyDefault; + config.optimizationHints = optimizationHints; } else { - NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API"); + // not set } #endif } +Status CompileOrReadCachedModel(NSURL* modelUrl, const CoreMLOptions& coreml_options, + NSMutableString* compiled_model_path) { + NSURL* cached_model_base_url = modelUrl; + if (!coreml_options.CreateMLProgram()) { + cached_model_base_url = [cached_model_base_url URLByDeletingLastPathComponent]; + } + + NSURL* cached_model_url = [cached_model_base_url URLByAppendingPathComponent:@"compiled_model.mlmodelc"]; + // if cached_model_url is existed, just return + NSError* error = nil; + NSString* cached_model_path = [cached_model_url path]; + // to pass clang-tidy static analyzer + if (compiled_model_path == nil || cached_model_path == nil) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error creating cached model URL"); + } + if ([[NSFileManager defaultManager] fileExistsAtPath:cached_model_path]) { + [compiled_model_path appendString:cached_model_path]; + return Status::OK(); + } + + // TODO: Update this to version with callback handler as the API used here is deprecated. + // https://developer.apple.com/documentation/coreml/mlmodel/3929553-compilemodelaturl + // As we call loadModel during EP Compile there shouldn't be an issue letting the actual compile run in the + // background. We will have to check for completion in `predict` and block until it is done. 
+ NSURL* compiled_model_url = [MLModel compileModelAtURL:modelUrl error:&error]; + if (error != nil) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error compiling model: ", + [[error localizedDescription] UTF8String]); + } + + // to pass clang-tidy static analyzer + NSString* compiled_model_path_from_url = [compiled_model_url path]; + if (compiled_model_url == nil || cached_model_url == nil || compiled_model_path_from_url == nil) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, " compiled_model_url is nil or cached_model_url is nil"); + } + if (coreml_options.ModelCacheDirectory().empty()) { + [compiled_model_path appendString:compiled_model_path_from_url]; + return Status::OK(); + } + + // save the compiled model if user has set a cache path + if (![[NSFileManager defaultManager] moveItemAtURL:compiled_model_url toURL:cached_model_url error:&error]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error copying compiled model to cache path: ", + [[cached_model_url path] UTF8String], ", reason: ", [[error localizedDescription] UTF8String]); + } + // clang-tidy + NSString* cached_model_path_from_url = [cached_model_url path]; + if (cached_model_path_from_url == nil) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "cached_model_path_from_url is nil"); + } + [compiled_model_path appendString:cached_model_path_from_url]; + return Status::OK(); +} + // Internal Execution class // This class is part of the model class and handles the calls into CoreML. Specifically, it performs // 1. Compile the model by given path for execution @@ -366,7 +457,7 @@ Status Predict(const std::unordered_map& inputs, private: void cleanup(); NSString* coreml_model_path_{nil}; - NSString* compiled_model_path_{nil}; + NSURL* compiled_model_url_{nil}; const logging::Logger& logger_; CoreMLOptions coreml_options_; MLModel* model_{nil}; @@ -387,14 +478,18 @@ Status Predict(const std::unordered_map& inputs, } void Execution::cleanup() { + // we keep the compiled model if the user has set a cache path + if (coreml_options_.ModelCacheDirectory().size()) { + return; + } + NSString* compiled_model_path = [compiled_model_url_ path]; NSError* error = nil; - if (compiled_model_path_ != nil) { - [[NSFileManager defaultManager] removeItemAtPath:compiled_model_path_ error:&error]; + if (compiled_model_path != nil) { + [[NSFileManager defaultManager] removeItemAtPath:compiled_model_path error:&error]; if (error != nil) { - LOGS(logger_, ERROR) << "Failed cleaning up the compiled model: " << [compiled_model_path_ UTF8String] + LOGS(logger_, ERROR) << "Failed cleaning up the compiled model: " << [compiled_model_path UTF8String] << ", error message: " << [[error localizedDescription] UTF8String]; } - compiled_model_path_ = nil; } #if !defined(NDEBUG) @@ -430,17 +525,10 @@ Status Predict(const std::unordered_map& inputs, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create model URL from path"); } - // TODO: Update this to version with callback handler as the API used here is deprecated. - // https://developer.apple.com/documentation/coreml/mlmodel/3929553-compilemodelaturl - // As we call loadModel during EP Compile there shouldn't be an issue letting the actual compile run in the - // background. We will have to check for completion in `predict` and block until it is done. 
- NSURL* compileUrl = [MLModel compileModelAtURL:modelUrl error:&error]; - if (error != nil) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error compiling model: ", - [[error localizedDescription] UTF8String]); - } - - compiled_model_path_ = [compileUrl path]; + NSMutableString* compiled_model_path = [[NSMutableString alloc] init]; + ORT_RETURN_IF_ERROR(CompileOrReadCachedModel( + [NSURL fileURLWithPath:coreml_model_path_], coreml_options_, compiled_model_path)); + compiled_model_url_ = [NSURL fileURLWithPath:compiled_model_path]; MLModelConfiguration* config = [[MLModelConfiguration alloc] init]; uint32_t coreml_compute_unit = coreml_options_.ComputeUnits(); @@ -458,27 +546,22 @@ Status Predict(const std::unordered_map& inputs, config.allowLowPrecisionAccumulationOnGPU = YES; } -// Set the specialization strategy to FastPrediction for macOS 10.15+ -// since __clang_major__ >= 15, optimizationHints is introduced in -// Same as above comments for why we are checking __clang_major__. -// we define __clang_analyzer__ here is for bypass static analysis -#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__) + // Set the specialization strategy to FastPrediction for macOS 10.15+ if (HAS_COREML8_OR_LATER) { - MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init]; - if (coreml_options_.UseStrategy("FastPrediction")) { - optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction; - config.optimizationHints = optimizationHints; - } else if (coreml_options_.UseStrategy("Default")) { - optimizationHints.specializationStrategy = MLSpecializationStrategyDefault; - config.optimizationHints = optimizationHints; - } + ConfigureOptimizationHints(config, coreml_options_); + } else { + LOGS(logger_, WARNING) << "iOS 17.4+/macOS 14.4+ or later is required to ConfigureOptimizationHints"; } -#endif + if (coreml_options_.ProfileComputePlan()) { - ProfileComputePlan(compileUrl, config); + if (@available(macOS 14.4, iOS 17.4, *)) { + ProfileComputePlan(compiled_model_url_, config); + } else { + LOGS(logger_, WARNING) << "iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API"; + } } - model_ = [MLModel modelWithContentsOfURL:compileUrl configuration:config error:&error]; + model_ = [MLModel modelWithContentsOfURL:compiled_model_url_ configuration:config error:&error]; if (error != nil || model_ == nil) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create MLModel", diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 23c3812ebd025..0b1b2bae6c972 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -138,6 +138,7 @@ namespace perftest { "\t [CoreML only] [SpecializationStrategy]:[Default FastPrediction].\n" "\t [CoreML only] [ProfileComputePlan]:[0 1].\n" "\t [CoreML only] [AllowLowPrecisionAccumulationOnGPU]:[0 1].\n" + "\t [CoreML only] [ModelCacheDirectory]:[path../a/b/c].\n" "\t [Example] [For CoreML EP] -e coreml -i \"ModelFormat|MLProgram MLComputeUnits|CPUAndGPU\"\n" "\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. 
\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index a96028ed3903e..08c2cff8058c2 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -349,7 +349,8 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); kCoremlProviderOption_EnableOnSubgraphs, kCoremlProviderOption_SpecializationStrategy, kCoremlProviderOption_ProfileComputePlan, - kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU}; + kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU, + kCoremlProviderOption_ModelCacheDirectory}; ParseSessionConfigs(ov_string, provider_options, available_keys); std::unordered_map available_options = { @@ -373,6 +374,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); (provider_option.second == "0" || provider_option.second == "1")) { } else if (provider_option.first == kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU && (provider_option.second == "0" || provider_option.second == "1")) { + } else if (provider_option.first == kCoremlProviderOption_ModelCacheDirectory) { } else { ORT_THROW("Invalid value for option ", provider_option.first, ": ", provider_option.second); } diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index a8480e7416de5..302ad57fb88c5 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -15,6 +15,7 @@ #include "test/util/include/inference_session_wrapper.h" #include "test/util/include/test_environment.h" #include "test/util/include/test_utils.h" +#include "onnx/onnx_pb.h" #if !defined(ORT_MINIMAL_BUILD) // if this is a full build we need the provider test utils @@ -31,9 +32,10 @@ namespace onnxruntime { namespace test { static std::unique_ptr MakeCoreMLExecutionProvider( - std::string ModelFormat = "NeuralNetwork", std::string ComputeUnits = "CPUOnly") { + std::string ModelFormat = "NeuralNetwork", std::string ComputeUnits = "CPUOnly", std::string ModelCacheDirectory = "") { std::unordered_map provider_options = {{kCoremlProviderOption_MLComputeUnits, ComputeUnits}, - {kCoremlProviderOption_ModelFormat, ModelFormat}}; + {kCoremlProviderOption_ModelFormat, ModelFormat}, + {kCoremlProviderOption_ModelCacheDirectory, ModelCacheDirectory}}; return CoreMLProviderFactoryCreator::Create(provider_options)->CreateProvider(); } @@ -268,5 +270,112 @@ TEST(CoreMLExecutionProviderTest, TestNameSanitization) { } #endif +TEST(CoreMLExecutionProviderTest, TestModelCache) { + const ORTCHAR_T* model_file_name = ORT_TSTR("testdata/coreml_argmax_cast_test.onnx"); + + onnx::ModelProto model; + { + std::ifstream in(model_file_name, std::ios_base::binary); + model.ParseFromIstream(&in); + in.close(); + } + + std::string out_string; +#if defined(__APPLE__) + std::vector dims_mul_x = {3, 2, 2}; + std::vector values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f}; + OrtValue ml_value_x; + AllocatorPtr allocator = std::make_shared(); + CreateMLValue(allocator, dims_mul_x, values_mul_x, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + std::string subgraph_name; + const std::function graph_verifier = [&subgraph_name](const Graph& graph) { + GraphViewer graph_viewer{graph}; + const auto& node_indices_in_order = graph_viewer.GetNodesInTopologicalOrder(); + const auto* node = graph.GetNode(node_indices_in_order[0]); + 
auto _first = node->Name().find('_') + 1; + auto _second = node->Name().find('_', _first); + subgraph_name = node->Name().substr(_first, _second - _first); + }; + EPVerificationParams verification_params{.graph_verifier = &graph_verifier}; + + auto* metadata_props = model.add_metadata_props(); + metadata_props->set_key(kCOREML_CACHE_KEY); + { // test with valid model cache directory + metadata_props->set_value("legalhash123"); + model.SerializeToString(&out_string); + gsl::span model_data{reinterpret_cast(out_string.data()), out_string.size()}; + RunAndVerifyOutputsWithEP(model_data, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram", "CPUOnly", ORT_TSTR("./tmp/")), + feeds, + verification_params); + ASSERT_EQ(std::filesystem::exists("./tmp/legalhash123"), true); + } + { + // test with invalid model cache directory, only alphanumeric characters are allowed + out_string.clear(); + metadata_props->set_key(kCOREML_CACHE_KEY); + metadata_props->set_value("illegalhash__123"); + model.SerializeToString(&out_string); + gsl::span model_data{reinterpret_cast(out_string.data()), out_string.size()}; + RunAndVerifyOutputsWithEP(model_data, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram", "CPUOnly", ORT_TSTR("./tmp")), + feeds, + verification_params); + ASSERT_EQ(std::filesystem::exists("./tmp/illegalhash__123"), false); + // the cache folder name should be the first part of the subgraph name + ASSERT_EQ(std::filesystem::exists("./tmp/" + subgraph_name), true); + } + { + // test with invalid model cache directory, more than 64 characters + out_string.clear(); + metadata_props->set_key(kCOREML_CACHE_KEY); + metadata_props->set_value("modelhashwithmorethan64charactersmodelhashwithmorethan64charactersmodelhashwithmorethan64characters"); + model.SerializeToString(&out_string); + gsl::span model_data{reinterpret_cast(out_string.data()), out_string.size()}; + RunAndVerifyOutputsWithEP(model_data, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram", "CPUOnly", ORT_TSTR("./tmp")), + feeds, + verification_params); + ASSERT_EQ(std::filesystem::exists("./tmp/modelhashwithmorethan64charactersmodelhashwithmorethan64charactersmodelhashwithmorethan64characters"), false); + // the cache folder name should be the first part of the subgraph name + ASSERT_EQ(std::filesystem::exists("./tmp/" + subgraph_name), true); + } + { + // test with invalid model cache directory, empty + out_string.clear(); + metadata_props->set_key(kCOREML_CACHE_KEY); + metadata_props->set_value(""); + model.SerializeToString(&out_string); + gsl::span model_data{reinterpret_cast(out_string.data()), out_string.size()}; + RunAndVerifyOutputsWithEP(model_data, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram", "CPUOnly", ORT_TSTR("./tmp")), + feeds, + verification_params); + // the cache folder name should be the first part of the subgraph name + ASSERT_EQ(std::filesystem::exists("./tmp/" + subgraph_name), true); + } + { + // test with invalid model cache directory, caching shall be disabled + out_string.clear(); + metadata_props->set_key(kCOREML_CACHE_KEY); + metadata_props->set_value(""); + model.SerializeToString(&out_string); + gsl::span model_data{reinterpret_cast(out_string.data()), out_string.size()}; + RunAndVerifyOutputsWithEP(model_data, CurrentTestName(), + MakeCoreMLExecutionProvider("MLProgram", "CPUOnly", ORT_TSTR("/")), + feeds, + verification_params); + // this folder can't be created + ASSERT_EQ(std::filesystem::exists("/" + subgraph_name), false); + } +#else + 
model.SerializeToString(&out_string); + gsl::span<const std::byte> model_data{reinterpret_cast<const std::byte*>(out_string.data()), out_string.size()}; + TestModelLoad(model_data, MakeCoreMLExecutionProvider(), ExpectedEPNodeAssignment::All); +#endif +} } // namespace test } // namespace onnxruntime
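
Usage note (not part of the patch): the sketch below shows how an application might opt in to the cache this PR adds. It assumes the string-keyed `AppendExecutionProvider` overload of the ONNX Runtime C++ API accepts "CoreML" with these provider options in the ORT version carrying this change, and the paths and model name are hypothetical placeholders. The metadata key mirrors the `kCOREML_CACHE_KEY` ("COREML_CACHE_KEY") constant introduced above, which must be at most 64 alphanumeric characters or it is ignored.

```cpp
// Minimal sketch, assuming an ORT build with the CoreML EP and the
// string-based AppendExecutionProvider("CoreML", ...) registration path.
#include <onnxruntime_cxx_api.h>

#include <string>
#include <unordered_map>

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "coreml_cache_demo"};
  Ort::SessionOptions session_options;

  // The cache directory is user-managed: the EP reuses any compiled model found
  // there and does NOT invalidate the cache if the ONNX model changes.
  std::unordered_map<std::string, std::string> provider_options = {
      {"ModelFormat", "MLProgram"},
      {"MLComputeUnits", "ALL"},
      {"ModelCacheDirectory", "/path/to/coreml_cache"},  // hypothetical path
  };
  session_options.AppendExecutionProvider("CoreML", provider_options);

  // Optional: give the model a stable cache key (<= 64 alphanumeric characters)
  // by adding a metadata_props entry with key "COREML_CACHE_KEY" to the ONNX
  // model before it is saved, as the new coreml_basic_test.cc cases do.
  Ort::Session session{env, "model.onnx", session_options};  // hypothetical model
  return 0;
}
```

Because the EP performs no cache validation against the ONNX model, bump the cache key (or clear the cache directory) whenever the model is updated.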