From 2c71e9b322e1c79fad4ad6e008d4958e937400e9 Mon Sep 17 00:00:00 2001
From: cyyever
Date: Tue, 11 Jun 2024 02:21:54 +0800
Subject: [PATCH] Use std::optional (#7228)

---
 test/cpp/cpp_test_util.cpp | 4 +-
 test/cpp/test_aten_xla_tensor_1.cpp | 14 +-
 test/cpp/test_aten_xla_tensor_2.cpp | 44 ++--
 test/cpp/test_aten_xla_tensor_4.cpp | 8 +-
 test/cpp/test_lazy.cpp | 10 +-
 test/cpp/test_tensor.cpp | 2 +-
 torch_xla/csrc/aten_xla_bridge.cpp | 44 ++--
 torch_xla/csrc/aten_xla_bridge.h | 32 +--
 torch_xla/csrc/aten_xla_type.cpp | 320 ++++++++++++------------
 torch_xla/csrc/convert_ops.cpp | 2 +-
 torch_xla/csrc/convert_ops.h | 2 +-
 torch_xla/csrc/data_ops.cpp | 2 +-
 torch_xla/csrc/elementwise.cpp | 2 +-
 torch_xla/csrc/elementwise.h | 2 +-
 torch_xla/csrc/helpers.cpp | 4 +-
 torch_xla/csrc/helpers.h | 4 +-
 torch_xla/csrc/init_python_bindings.cpp | 4 +-
 torch_xla/csrc/ir_builder.h | 4 +-
 torch_xla/csrc/ops/cast.cpp | 2 +-
 torch_xla/csrc/ops/cast.h | 10 +-
 torch_xla/csrc/ops/count_nonzero.h | 2 +-
 torch_xla/csrc/ops/cumprod.cpp | 6 +-
 torch_xla/csrc/ops/cumprod.h | 6 +-
 torch_xla/csrc/ops/cumsum.cpp | 6 +-
 torch_xla/csrc/ops/cumsum.h | 6 +-
 torch_xla/csrc/ops/index_ops.cpp | 12 +-
 torch_xla/csrc/ops/index_ops.h | 2 +-
 torch_xla/csrc/ops/log_softmax.cpp | 6 +-
 torch_xla/csrc/ops/log_softmax.h | 6 +-
 torch_xla/csrc/ops/mean.cpp | 6 +-
 torch_xla/csrc/ops/mean.h | 6 +-
 torch_xla/csrc/ops/native_dropout.cpp | 2 +-
 torch_xla/csrc/ops/native_dropout.h | 4 +-
 torch_xla/csrc/ops/ops.cpp | 12 +-
 torch_xla/csrc/ops/ops.h | 12 +-
 torch_xla/csrc/ops/ops_xla_shape_fn.cpp | 14 +-
 torch_xla/csrc/ops/ops_xla_shape_fn.h | 12 +-
 torch_xla/csrc/ops/prod.cpp | 6 +-
 torch_xla/csrc/ops/prod.h | 6 +-
 torch_xla/csrc/ops/softmax.cpp | 6 +-
 torch_xla/csrc/ops/softmax.h | 6 +-
 torch_xla/csrc/ops/sum.cpp | 6 +-
 torch_xla/csrc/ops/sum.h | 6 +-
 torch_xla/csrc/tensor.cpp | 42 ++--
 torch_xla/csrc/tensor.h | 30 +--
 torch_xla/csrc/tensor_methods.cpp | 116 ++++-----
 torch_xla/csrc/tensor_methods.h | 84 +++----
 torch_xla/csrc/tensor_ops.cpp | 2 +-
 torch_xla/csrc/tensor_ops.h | 2 +-
 torch_xla/csrc/torch_util.h | 4 +-
 torch_xla/csrc/xla_backend_impl.cpp | 2 +-
 torch_xla/csrc/xla_graph_executor.cpp | 12 +-
 torch_xla/csrc/xla_graph_executor.h | 4 +-
 torch_xla/csrc/xla_lower_util.cpp | 2 +-
 torch_xla/csrc/xla_lower_util.h | 2 +-
 55 files changed, 487 insertions(+), 487 deletions(-)

diff --git a/test/cpp/cpp_test_util.cpp b/test/cpp/cpp_test_util.cpp
index f0a2d374d91..15c06d0714e 100644
--- a/test/cpp/cpp_test_util.cpp
+++ b/test/cpp/cpp_test_util.cpp
@@ -379,11 +379,11 @@ void TestBackward(
     // Calculating higher order derivative requires create_graph=true
     bool create_graph = d != derivative_level;
     outs = torch::autograd::grad({sum}, inputs_w_grad, /*grad_outputs=*/{},
-                                 /*retain_graph=*/c10::nullopt,
+                                 /*retain_graph=*/std::nullopt,
                                  /*create_graph=*/create_graph,
                                  /*allow_unused=*/true);
     xouts = torch::autograd::grad({xsum}, xinputs_w_grad, /*grad_outputs=*/{},
-                                  /*retain_graph=*/c10::nullopt,
+                                  /*retain_graph=*/std::nullopt,
                                   /*create_graph=*/create_graph,
                                   /*allow_unused=*/true);
     for (size_t i = 0; i < outs.size(); ++i) {
diff --git a/test/cpp/test_aten_xla_tensor_1.cpp b/test/cpp/test_aten_xla_tensor_1.cpp
index 5991c10d67e..d204b344808 100644
--- a/test/cpp/test_aten_xla_tensor_1.cpp
+++ b/test/cpp/test_aten_xla_tensor_1.cpp
@@ -1080,10 +1080,10 @@ TEST_F(AtenXlaTensorTest, TestUpsampleNearest2DWithScale) {
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor xla_input = CopyToDevice(input, device);
     torch::Tensor result =
torch::upsample_nearest2d( - input, c10::nullopt, + input, std::nullopt, at::ArrayRef{img_info.scale_h, img_info.scale_w}); torch::Tensor xla_result = torch::upsample_nearest2d( - xla_input, c10::nullopt, + xla_input, std::nullopt, at::ArrayRef{img_info.scale_h, img_info.scale_w}); AllClose(result, xla_result); }); @@ -1116,7 +1116,7 @@ TEST_F(AtenXlaTensorTest, TestUpsampleNearest2DBackwardWithScale) { auto testfn = [&](const std::vector& inputs) -> torch::Tensor { return torch::upsample_nearest2d( - inputs[0], c10::nullopt, + inputs[0], std::nullopt, at::ArrayRef{img_info.scale_h, img_info.scale_w}); }; ForEachDevice([&](const torch::Device& device) { @@ -1208,10 +1208,10 @@ TEST_F(AtenXlaTensorTest, TestUpsampleBilinear2DWithScale) { ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_input = CopyToDevice(input, device); torch::Tensor result = torch::upsample_bilinear2d( - input, c10::nullopt, align_corners, + input, std::nullopt, align_corners, at::ArrayRef{img_info.scale_h, img_info.scale_w}); torch::Tensor xla_result = torch::upsample_bilinear2d( - xla_input, c10::nullopt, align_corners, + xla_input, std::nullopt, align_corners, at::ArrayRef{img_info.scale_h, img_info.scale_w}); AllClose(result, xla_result, /*rtol=*/1e-4, /*atol=*/1e-4); }); @@ -1266,7 +1266,7 @@ TEST_F(AtenXlaTensorTest, TestUpsampleBilinear2DBackwardWithScale) { auto testfn = [&](const std::vector& inputs) -> torch::Tensor { return torch::upsample_bilinear2d( - inputs[0], c10::nullopt, align_corners, + inputs[0], std::nullopt, align_corners, at::ArrayRef{img_info.scale_h, img_info.scale_w}); }; ForEachDevice([&](const torch::Device& device) { @@ -2389,7 +2389,7 @@ TEST_F(AtenXlaTensorTest, TestCount_Nonzero_with_single_dim) { a[0][1] = 1.0; a[0][2] = 1.0; a[2][2] = 1.0; - std::vector> dims = {0, -1}; + std::vector> dims = {0, -1}; for (int i = 0; i < dims.size(); i++) { torch::Tensor b = torch::count_nonzero(a, dims[i]); ForEachDevice([&](const torch::Device& device) { diff --git a/test/cpp/test_aten_xla_tensor_2.cpp b/test/cpp/test_aten_xla_tensor_2.cpp index f190403dd87..bf30e1f0c7b 100755 --- a/test/cpp/test_aten_xla_tensor_2.cpp +++ b/test/cpp/test_aten_xla_tensor_2.cpp @@ -1022,7 +1022,7 @@ TEST_F(AtenXlaTensorTest, TestStdInDim) { TEST_F(AtenXlaTensorTest, TestStdWithCorrection) { torch::Tensor a = torch::rand({4, 3, 4}, torch::TensorOptions(torch::kFloat)); int rank = a.dim(); - c10::optional corrections[] = {1, 2, 1.3, c10::nullopt}; + std::optional corrections[] = {1, 2, 1.3, std::nullopt}; for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { for (const auto& dim : @@ -1041,7 +1041,7 @@ TEST_F(AtenXlaTensorTest, TestStdWithCorrection) { TEST_F(AtenXlaTensorTest, TestStdMeanWithCorrection) { torch::Tensor a = torch::rand({4, 3, 4}, torch::TensorOptions(torch::kFloat)); int rank = a.dim(); - c10::optional corrections[] = {1, 2, 1.3, c10::nullopt}; + std::optional corrections[] = {1, 2, 1.3, std::nullopt}; for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { for (const auto& dim : @@ -1175,7 +1175,7 @@ TEST_F(AtenXlaTensorTest, TestVarWithDim) { TEST_F(AtenXlaTensorTest, TestVarWithCorrection) { torch::Tensor a = torch::rand({4, 3, 4}, torch::TensorOptions(torch::kFloat)); - c10::optional corrections[] = {1, 2, 1.3, c10::nullopt}; + std::optional corrections[] = {1, 2, 1.3, std::nullopt}; for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { for (bool keepDim : {true, false}) { for (const auto& correction : corrections) { @@ -1194,7 
+1194,7 @@ TEST_F(AtenXlaTensorTest, TestVarWithCorrection) { TEST_F(AtenXlaTensorTest, TestVarMeanWithCorrection) { torch::Tensor a = torch::rand({4, 3, 4}, torch::TensorOptions(torch::kFloat)); - c10::optional corrections[] = {1, 2, 1.3, c10::nullopt}; + std::optional corrections[] = {1, 2, 1.3, std::nullopt}; for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { for (const auto& correction : corrections) { for (auto keepdim : {true, false}) { @@ -2076,10 +2076,10 @@ TEST_F(AtenXlaTensorTest, TestCumProdCastLong) { TEST_F(AtenXlaTensorTest, TestArgMin) { torch::Tensor a = torch::rand({4, 4, 4}, torch::TensorOptions(torch::kFloat)); - torch::Tensor b = torch::argmin(a, c10::nullopt, /*keepdim=*/false); + torch::Tensor b = torch::argmin(a, std::nullopt, /*keepdim=*/false); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); - torch::Tensor xla_b = torch::argmin(xla_a, c10::nullopt, /*keepdim=*/false); + torch::Tensor xla_b = torch::argmin(xla_a, std::nullopt, /*keepdim=*/false); AllEqual(b, xla_b); }); @@ -2119,10 +2119,10 @@ TEST_F(AtenXlaTensorTest, TestArgMinDimKeep) { TEST_F(AtenXlaTensorTest, TestArgMinDimKeepNoDim) { torch::Tensor a = torch::rand({4, 4, 4}, torch::TensorOptions(torch::kFloat)); - torch::Tensor b = torch::argmin(a, c10::nullopt, /*keepdim=*/true); + torch::Tensor b = torch::argmin(a, std::nullopt, /*keepdim=*/true); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); - torch::Tensor xla_b = torch::argmin(xla_a, c10::nullopt, /*keepdim=*/true); + torch::Tensor xla_b = torch::argmin(xla_a, std::nullopt, /*keepdim=*/true); AllEqual(b, xla_b); }); @@ -2160,10 +2160,10 @@ TEST_F(AtenXlaTensorTest, TestArgMinWrapper) { TEST_F(AtenXlaTensorTest, TestArgMax) { torch::Tensor a = torch::rand({4, 4, 4}, torch::TensorOptions(torch::kFloat)); - torch::Tensor b = torch::argmax(a, c10::nullopt, /*keepdim=*/false); + torch::Tensor b = torch::argmax(a, std::nullopt, /*keepdim=*/false); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); - torch::Tensor xla_b = torch::argmax(xla_a, c10::nullopt, /*keepdim=*/false); + torch::Tensor xla_b = torch::argmax(xla_a, std::nullopt, /*keepdim=*/false); AllEqual(b, xla_b); }); @@ -2203,10 +2203,10 @@ TEST_F(AtenXlaTensorTest, TestArgMaxDimKeep) { TEST_F(AtenXlaTensorTest, TestArgMaxDimKeepNoDim) { torch::Tensor a = torch::rand({4, 4, 4}, torch::TensorOptions(torch::kFloat)); - torch::Tensor b = torch::argmax(a, c10::nullopt, /*keepdim=*/true); + torch::Tensor b = torch::argmax(a, std::nullopt, /*keepdim=*/true); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); - torch::Tensor xla_b = torch::argmax(xla_a, c10::nullopt, /*keepdim=*/true); + torch::Tensor xla_b = torch::argmax(xla_a, std::nullopt, /*keepdim=*/true); AllEqual(b, xla_b); }); @@ -2216,10 +2216,10 @@ TEST_F(AtenXlaTensorTest, TestArgMaxDimKeepNoDim) { TEST_F(AtenXlaTensorTest, TestArgMaxSameValue) { torch::Tensor a = torch::ones({4, 4, 4}, torch::TensorOptions(torch::kFloat)); - torch::Tensor b = torch::argmax(a, c10::nullopt, /*keepdim=*/false); + torch::Tensor b = torch::argmax(a, std::nullopt, /*keepdim=*/false); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); - torch::Tensor xla_b = torch::argmax(xla_a, c10::nullopt, /*keepdim=*/false); + torch::Tensor xla_b = torch::argmax(xla_a, std::nullopt, /*keepdim=*/false); AllEqual(b, xla_b); }); @@ -2511,10 
+2511,10 @@ TEST_F(AtenXlaTensorTest, TestClampMinMaxTensor) { TEST_F(AtenXlaTensorTest, TestClampMin) { torch::Tensor a = torch::rand({2, 2}, torch::TensorOptions(torch::kFloat)); torch::Scalar min_val(0.311); - torch::Tensor b = torch::clamp(a, min_val, c10::nullopt); + torch::Tensor b = torch::clamp(a, min_val, std::nullopt); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); - torch::Tensor xla_b = torch::clamp(xla_a, min_val, c10::nullopt); + torch::Tensor xla_b = torch::clamp(xla_a, min_val, std::nullopt); AllClose(b, xla_b); }); ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters()); @@ -2525,11 +2525,11 @@ TEST_F(AtenXlaTensorTest, TestClampMinTensor) { torch::Tensor a = torch::rand({2, 2}, torch::TensorOptions(torch::kFloat)); torch::Tensor min_tensor = torch::rand({1, 2}, torch::TensorOptions(torch::kFloat)); - torch::Tensor b = torch::clamp(a, min_tensor, c10::nullopt); + torch::Tensor b = torch::clamp(a, min_tensor, std::nullopt); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); torch::Tensor xla_min_tensor = CopyToDevice(min_tensor, device); - torch::Tensor xla_b = torch::clamp(xla_a, xla_min_tensor, c10::nullopt); + torch::Tensor xla_b = torch::clamp(xla_a, xla_min_tensor, std::nullopt); AllClose(b, xla_b); }); ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters()); @@ -2539,10 +2539,10 @@ TEST_F(AtenXlaTensorTest, TestClampMinTensor) { TEST_F(AtenXlaTensorTest, TestClampMax) { torch::Tensor a = torch::rand({2, 2}, torch::TensorOptions(torch::kFloat)); torch::Scalar max_val(0.409); - torch::Tensor b = torch::clamp(a, c10::nullopt, max_val); + torch::Tensor b = torch::clamp(a, std::nullopt, max_val); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); - torch::Tensor xla_b = torch::clamp(xla_a, c10::nullopt, max_val); + torch::Tensor xla_b = torch::clamp(xla_a, std::nullopt, max_val); AllClose(b, xla_b); }); ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters()); @@ -2553,11 +2553,11 @@ TEST_F(AtenXlaTensorTest, TestClampMaxTensor) { torch::Tensor a = torch::rand({2, 2}, torch::TensorOptions(torch::kFloat)); torch::Tensor max_tensor = torch::rand({2, 1}, torch::TensorOptions(torch::kFloat)); - torch::Tensor b = torch::clamp(a, c10::nullopt, max_tensor); + torch::Tensor b = torch::clamp(a, std::nullopt, max_tensor); ForEachDevice([&](const torch::Device& device) { torch::Tensor xla_a = CopyToDevice(a, device); torch::Tensor xla_max_tensor = CopyToDevice(max_tensor, device); - torch::Tensor xla_b = torch::clamp(xla_a, c10::nullopt, xla_max_tensor); + torch::Tensor xla_b = torch::clamp(xla_a, std::nullopt, xla_max_tensor); AllClose(b, xla_b); }); ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters()); diff --git a/test/cpp/test_aten_xla_tensor_4.cpp b/test/cpp/test_aten_xla_tensor_4.cpp index 7a02a1079a6..6c21ed5f901 100644 --- a/test/cpp/test_aten_xla_tensor_4.cpp +++ b/test/cpp/test_aten_xla_tensor_4.cpp @@ -391,8 +391,8 @@ TEST_F(AtenXlaTensorTest, TestDiv) { } TEST_F(AtenXlaTensorTest, TestDivWithRoundingMode) { - c10::optional rounding_modes[] = {"trunc", "floor", - c10::nullopt}; + std::optional rounding_modes[] = {"trunc", "floor", + std::nullopt}; for (const auto& rounding_mode : rounding_modes) { for (torch::ScalarType scalar_type1 : {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, @@ -453,8 +453,8 @@ TEST_F(AtenXlaTensorTest, TestDivInPlace) { } 
TEST_F(AtenXlaTensorTest, TestDivInPlaceWithRoundingMode) { - c10::optional rounding_modes[] = {"trunc", "floor", - c10::nullopt}; + std::optional rounding_modes[] = {"trunc", "floor", + std::nullopt}; for (const auto& rounding_mode : rounding_modes) { for (torch::ScalarType scalar_type1 : {torch::kFloat}) { torch::Tensor a = diff --git a/test/cpp/test_lazy.cpp b/test/cpp/test_lazy.cpp index e445b93454e..b3907a24007 100644 --- a/test/cpp/test_lazy.cpp +++ b/test/cpp/test_lazy.cpp @@ -24,7 +24,7 @@ TEST_F(LazyTest, TestXlaShapeToLazyWithF64) { torch::lazy::Shape lazy_shape = XlaHelpers::ConvertXlaShapeToLazy(xla_shape); std::vector lazy_dimensions = torch_xla::runtime::util::ToVector(lazy_shape.sizes()); - const c10::optional>& lazy_dynamic_dimensions = + const std::optional>& lazy_dynamic_dimensions = lazy_shape.is_symbolic(); EXPECT_EQ(lazy_shape.scalar_type(), at::ScalarType::Double); EXPECT_EQ(lazy_dimensions, @@ -46,7 +46,7 @@ TEST_F(LazyTest, TestXlaShapeToLazyWithPred) { torch::lazy::Shape lazy_shape = XlaHelpers::ConvertXlaShapeToLazy(xla_shape); std::vector lazy_dimensions = torch_xla::runtime::util::ToVector(lazy_shape.sizes()); - const c10::optional>& lazy_dynamic_dimensions = + const std::optional>& lazy_dynamic_dimensions = lazy_shape.is_symbolic(); EXPECT_EQ(lazy_shape.scalar_type(), at::ScalarType::Bool); EXPECT_EQ(lazy_dimensions, @@ -68,7 +68,7 @@ TEST_F(LazyTest, TestXlaShapeToLazyWithU64) { torch::lazy::Shape lazy_shape = XlaHelpers::ConvertXlaShapeToLazy(xla_shape); std::vector lazy_dimensions = torch_xla::runtime::util::ToVector(lazy_shape.sizes()); - const c10::optional>& lazy_dynamic_dimensions = + const std::optional>& lazy_dynamic_dimensions = lazy_shape.is_symbolic(); EXPECT_EQ(lazy_shape.scalar_type(), at::ScalarType::Long); EXPECT_EQ(lazy_dimensions, @@ -90,7 +90,7 @@ TEST_F(LazyTest, TestXlaShapeToLazyWithMultipleDimensions) { torch::lazy::Shape lazy_shape = XlaHelpers::ConvertXlaShapeToLazy(xla_shape); std::vector lazy_dimensions = torch_xla::runtime::util::ToVector(lazy_shape.sizes()); - const c10::optional>& lazy_dynamic_dimensions = + const std::optional>& lazy_dynamic_dimensions = lazy_shape.is_symbolic(); EXPECT_EQ(lazy_shape.scalar_type(), at::ScalarType::Double); EXPECT_EQ(lazy_dimensions, @@ -112,7 +112,7 @@ TEST_F(LazyTest, TestXlaShapeToLazyWithDynamicDimensions) { torch::lazy::Shape lazy_shape = XlaHelpers::ConvertXlaShapeToLazy(xla_shape); std::vector lazy_dimensions = torch_xla::runtime::util::ToVector(lazy_shape.sizes()); - const c10::optional>& lazy_dynamic_dimensions = + const std::optional>& lazy_dynamic_dimensions = lazy_shape.is_symbolic(); EXPECT_EQ(lazy_shape.scalar_type(), at::ScalarType::Double); EXPECT_EQ(lazy_dimensions, diff --git a/test/cpp/test_tensor.cpp b/test/cpp/test_tensor.cpp index eb3b52676eb..70fc2b60719 100644 --- a/test/cpp/test_tensor.cpp +++ b/test/cpp/test_tensor.cpp @@ -18,7 +18,7 @@ namespace { bool CheckBidirectionalConversion( const at::Tensor& input, at::ScalarType dest_element_type, - c10::optional xla_type = c10::nullopt) { + std::optional xla_type = std::nullopt) { xla::Literal literal = GetTensorLiteral(input, /*shape=*/nullptr, /*device=*/nullptr); if (xla_type) { diff --git a/torch_xla/csrc/aten_xla_bridge.cpp b/torch_xla/csrc/aten_xla_bridge.cpp index d091e616c40..8c94b2d9c5a 100644 --- a/torch_xla/csrc/aten_xla_bridge.cpp +++ b/torch_xla/csrc/aten_xla_bridge.cpp @@ -148,7 +148,7 @@ XLATensorPtr GetOrCreateXlaTensor(const at::Tensor& tensor, return xtensor ? 
xtensor : XLATensor::Create(inner_tensor, device); } -XLATensorPtr GetOrCreateXlaTensor(const c10::optional& tensor, +XLATensorPtr GetOrCreateXlaTensor(const std::optional& tensor, const torch::lazy::BackendDevice& device) { if (!IsDefined(tensor)) { return XLATensorPtr(); @@ -210,9 +210,9 @@ std::vector XlaCreateTensorList(const at::ITensorListRef& tensors) { return aten_xla_tensors; } -std::vector> XlaCreateOptTensorList( - const std::vector>& tensors) { - std::vector> opt_aten_xla_tensors(tensors.size()); +std::vector> XlaCreateOptTensorList( + const std::vector>& tensors) { + std::vector> opt_aten_xla_tensors(tensors.size()); std::vector materialized_tensors; std::vector to_translate(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { @@ -253,24 +253,24 @@ void XlaUpdateTensors(absl::Span dest_xla_tensors, } } -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const at::Tensor& tensor) { auto xtensor = TryGetXlaTensor(tensor); if (!xtensor) { - return c10::nullopt; + return std::nullopt; } return xtensor->GetDevice(); } -c10::optional GetXlaDevice( - const c10::optional& tensor) { +std::optional GetXlaDevice( + const std::optional& tensor) { if (!tensor.has_value()) { - return c10::nullopt; + return std::nullopt; } return GetXlaDevice(*tensor); } -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const at::TensorList& tensors) { for (const auto& tensor : tensors) { auto device = GetXlaDevice(tensor); @@ -278,10 +278,10 @@ c10::optional GetXlaDevice( return device; } } - return c10::nullopt; + return std::nullopt; } -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const std::vector& tensors) { for (const auto& tensor : tensors) { auto device = GetXlaDevice(tensor); @@ -289,29 +289,29 @@ c10::optional GetXlaDevice( return device; } } - return c10::nullopt; + return std::nullopt; } -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const at::TensorOptions& tensor_options) { if (!tensor_options.has_device()) { - return c10::nullopt; + return std::nullopt; } return GetXlaDevice(tensor_options.device()); } -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const c10::Device& device) { if (device.type() != at::kXLA) { - return c10::nullopt; + return std::nullopt; } return AtenDeviceToXlaDevice(device); } -c10::optional GetXlaDevice( - const c10::optional& device) { +std::optional GetXlaDevice( + const std::optional& device) { if (!device) { - return c10::nullopt; + return std::nullopt; } return GetXlaDevice(*device); } @@ -434,7 +434,7 @@ std::vector AtenFromXlaTensors( at::Tensor CreateXlaTensor( at::Tensor tensor, - const c10::optional& device) { + const std::optional& device) { if (tensor.defined() && device) { XLATensorPtr xla_tensor = XLATensor::Create(std::move(tensor), *device); tensor = AtenFromXlaTensor(xla_tensor); @@ -444,7 +444,7 @@ at::Tensor CreateXlaTensor( std::vector CreateXlaTensors( const std::vector& tensors, - const c10::optional& device) { + const std::optional& device) { std::vector xtensors; for (auto& tensor : tensors) { xtensors.push_back(CreateXlaTensor(tensor, device)); diff --git a/torch_xla/csrc/aten_xla_bridge.h b/torch_xla/csrc/aten_xla_bridge.h index 19dd2b81412..b25a4823c3a 100644 --- a/torch_xla/csrc/aten_xla_bridge.h +++ b/torch_xla/csrc/aten_xla_bridge.h @@ -40,7 +40,7 @@ torch_xla::XLATensorPtr GetXlaTensorOrCreateForWrappedNumber( XLATensorPtr GetOrCreateXlaTensor(const at::Tensor& tensor, const torch::lazy::BackendDevice& device); -XLATensorPtr GetOrCreateXlaTensor(const c10::optional& tensor, 
+XLATensorPtr GetOrCreateXlaTensor(const std::optional& tensor, const torch::lazy::BackendDevice& device); std::vector GetOrCreateXlaTensors( @@ -50,10 +50,10 @@ std::vector GetOrCreateXlaTensors( // Creates a vector of at::Tensor objects extracted from a list of XLA tensors. std::vector XlaCreateTensorList(const at::ITensorListRef& tensors); -// Creates a vector of c10::optional objects extracted from a list +// Creates a vector of std::optional objects extracted from a list // of optional XLA tensors. -std::vector> XlaCreateOptTensorList( - const std::vector>& tensors); +std::vector> XlaCreateOptTensorList( + const std::vector>& tensors); void XlaUpdateTensors(absl::Span dest_xla_tensors, absl::Span source_cpu_tensors, @@ -61,26 +61,26 @@ void XlaUpdateTensors(absl::Span dest_xla_tensors, // Tries to extract the device out of the XLA tensor. Returns nullopt if the // input is not an XLA tensor. -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const at::Tensor& tensor); -c10::optional GetXlaDevice( - const c10::optional& tensor); +std::optional GetXlaDevice( + const std::optional& tensor); -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const at::ITensorListRef& tensors); -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const std::vector& tensors); -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const at::TensorOptions& tensor_options); -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const c10::Device& device); -c10::optional GetXlaDevice( - const c10::optional& device = c10::nullopt); +std::optional GetXlaDevice( + const std::optional& device = std::nullopt); std::vector GetBackendDevices(); @@ -120,16 +120,16 @@ std::vector AtenFromXlaTensors( // Creates an XLA tensor holding the data in tensor, on the given device. at::Tensor CreateXlaTensor( - at::Tensor tensor, const c10::optional& device); + at::Tensor tensor, const std::optional& device); // Given a vector of at::Tensor creates a vector of XLA tensors on the given // device. std::vector CreateXlaTensors( const std::vector& tensors, - const c10::optional& device); + const std::optional& device); template -c10::optional GetXlaDevice( +std::optional GetXlaDevice( const T& tensor, const Args&... forward_tensors) { auto optional_device = GetXlaDevice(tensor); if (optional_device) { diff --git a/torch_xla/csrc/aten_xla_type.cpp b/torch_xla/csrc/aten_xla_type.cpp index 7931b7ae7bc..dc30734756d 100644 --- a/torch_xla/csrc/aten_xla_type.cpp +++ b/torch_xla/csrc/aten_xla_type.cpp @@ -236,10 +236,10 @@ at::Tensor to_meta(const at::Tensor& tensor) { if (!tensor.defined()) return tensor; auto out = at::native::empty_strided_meta_symint( tensor.sym_sizes(), tensor.sym_strides(), - /*dtype=*/c10::make_optional(tensor.scalar_type()), - /*layout=*/c10::make_optional(tensor.layout()), - /*device=*/c10::make_optional(c10::Device(c10::kMeta)), - /*pin_memory=*/c10::nullopt); + /*dtype=*/std::make_optional(tensor.scalar_type()), + /*layout=*/std::make_optional(tensor.layout()), + /*device=*/std::make_optional(c10::Device(c10::kMeta)), + /*pin_memory=*/std::nullopt); // needs to handle wrapped numbers, so dtype promotion works properly. if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) { out.unsafeGetTensorImpl()->set_wrapped_number(true); @@ -248,12 +248,12 @@ at::Tensor to_meta(const at::Tensor& tensor) { } torch::lazy::BackendDevice GetXlaDeviceOrCurrent( - const c10::optional& device) { + const std::optional& device) { auto xla_device_opt = bridge::GetXlaDevice(device); return xla_device_opt ? 
*xla_device_opt : bridge::GetCurrentDevice(); } -bool IsOperationOnType(const c10::optional& opt_dtype, +bool IsOperationOnType(const std::optional& opt_dtype, at::ScalarType tensor_type, at::ScalarType type) { if (opt_dtype && *opt_dtype == type) { return true; @@ -263,7 +263,7 @@ bool IsOperationOnType(const c10::optional& opt_dtype, bool TensorsAreOfType(std::vector tensors, at::ScalarType type) { for (const XLATensorPtr& tensor : tensors) { - if (IsOperationOnType(c10::optional(c10::nullopt), + if (IsOperationOnType(std::optional(std::nullopt), tensor->dtype(), type)) { return true; } @@ -281,8 +281,8 @@ void CheckSubOperandTypes(at::ScalarType type1, at::ScalarType type2) { "`logical_not()` operator instead."; } -c10::optional PromoteIntegralType( - at::ScalarType src_dtype, const c10::optional& opt_dtype) { +std::optional PromoteIntegralType( + at::ScalarType src_dtype, const std::optional& opt_dtype) { return opt_dtype.has_value() ? opt_dtype.value() : at::isIntegralType(src_dtype, /*includeBool=*/true) ? at::kLong : opt_dtype; @@ -344,8 +344,8 @@ std::pair GetBinaryOperands( // The input is in format of {N, C, H, W} and the output will be {H, W}. std::vector GetOutputSizeWithScale( - absl::Span input_size, const c10::optional scales_h, - const c10::optional scales_w, + absl::Span input_size, const std::optional scales_h, + const std::optional scales_w, const std::vector& output_size) { XLA_CHECK(scales_h); XLA_CHECK(scales_w); @@ -690,10 +690,10 @@ std::vector XLANativeFunctions::_to_cpu(at::TensorList tensors) { // TODO(alanwaketan): Improve the error messages. // Let's rewrite it without reusing other native functions. at::Tensor XLANativeFunctions::_to_copy( - const at::Tensor& self, c10::optional dtype, - c10::optional layout, c10::optional device, - c10::optional pin_memory, bool non_blocking, - c10::optional memory_format) { + const at::Tensor& self, std::optional dtype, + std::optional layout, std::optional device, + std::optional pin_memory, bool non_blocking, + std::optional memory_format) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); auto options = self.options(); @@ -736,7 +736,7 @@ at::Tensor XLANativeFunctions::_to_copy( } at::Tensor& XLANativeFunctions::_index_put_impl_( - at::Tensor& self, const c10::List>& indices, + at::Tensor& self, const c10::List>& indices, const at::Tensor& values, bool accumulate, bool /* unsafe */) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return torch_xla::XLANativeFunctions::index_put_(self, indices, values, @@ -763,7 +763,7 @@ at::Tensor XLANativeFunctions::_log_softmax(const at::Tensor& self, int64_t dim, std::vector shapes{ torch::lazy::Shape(out_meta.scalar_type(), out_meta.sizes().vec())}; return bridge::AtenFromXlaTensor(tensor_methods::log_softmax( - bridge::GetXlaTensor(self), dim, c10::nullopt, std::move(shapes))); + bridge::GetXlaTensor(self), dim, std::nullopt, std::move(shapes))); } at::Tensor XLANativeFunctions::_log_softmax_backward_data( @@ -786,7 +786,7 @@ at::Tensor XLANativeFunctions::_softmax(const at::Tensor& self, int64_t dim, bool /* half_to_float */) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor( - tensor_methods::softmax(bridge::GetXlaTensor(self), dim, c10::nullopt)); + tensor_methods::softmax(bridge::GetXlaTensor(self), dim, std::nullopt)); } at::Tensor XLANativeFunctions::_softmax_backward_data( @@ -868,7 +868,7 @@ at::Tensor& XLANativeFunctions::arange_out(const at::Scalar& start, at::Tensor XLANativeFunctions::as_strided_copy( const at::Tensor& self, at::IntArrayRef 
size, at::IntArrayRef stride, - c10::optional storage_offset) { + std::optional storage_offset) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); // Retrieve the base tensor, if there's one. // This function actually operates on the tensor's storage. Since XLA does not @@ -911,9 +911,9 @@ at::Tensor XLANativeFunctions::as_strided_copy( if (storage_size == 0) { // Return an empty tensor, if no storage is actually needed. return empty_symint(c10::fromIntArrayRefSlow(size), tensor.scalar_type(), - /* layout= */ c10::nullopt, tensor.device(), - /* pin_memory= */ c10::nullopt, - /* memory_format= */ c10::nullopt); + /* layout= */ std::nullopt, tensor.device(), + /* pin_memory= */ std::nullopt, + /* memory_format= */ std::nullopt); } // At this point, the following is true: @@ -978,7 +978,7 @@ at::Tensor XLANativeFunctions::as_strided_copy( at::Tensor XLANativeFunctions::as_strided_scatter( const at::Tensor& base, const at::Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride, - c10::optional storage_offset) { + std::optional storage_offset) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); auto base_ = bridge::GetXlaTensor(base); auto xsize = XlaHelpers::I64List(size); @@ -1015,7 +1015,7 @@ at::Tensor XLANativeFunctions::atan2(const at::Tensor& self, at::Tensor XLANativeFunctions::avg_pool2d( const at::Tensor& self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor(tensor_methods::avg_pool_nd( bridge::GetXlaTensor(self), /*spatial_dim_count=*/2, @@ -1028,7 +1028,7 @@ at::Tensor XLANativeFunctions::avg_pool2d_backward( const at::Tensor& grad_output, const at::Tensor& self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if ((ceil_mode && count_include_pad) || divisor_override) { return at::native:: @@ -1046,7 +1046,7 @@ at::Tensor XLANativeFunctions::avg_pool2d_backward( at::Tensor XLANativeFunctions::avg_pool3d( const at::Tensor& self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor(tensor_methods::avg_pool_nd( bridge::GetXlaTensor(self), /*spatial_dim_count=*/3, @@ -1059,7 +1059,7 @@ at::Tensor XLANativeFunctions::avg_pool3d_backward( const at::Tensor& grad_output, const at::Tensor& self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, - c10::optional divisor_override) { + std::optional divisor_override) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if ((ceil_mode && count_include_pad) || divisor_override) { return at::native:: @@ -1087,7 +1087,7 @@ at::Tensor XLANativeFunctions::baddbmm(const at::Tensor& self, } at::Tensor XLANativeFunctions::bernoulli( - const at::Tensor& self, c10::optional generator) { + const at::Tensor& self, std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn<&xla_cpu_fallback, @@ -1099,7 +1099,7 @@ at::Tensor XLANativeFunctions::bernoulli( } at::Tensor 
XLANativeFunctions::bernoulli( - const at::Tensor& self, double p, c10::optional generator) { + const at::Tensor& self, double p, std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn< @@ -1111,7 +1111,7 @@ at::Tensor XLANativeFunctions::bernoulli( at::Tensor& XLANativeFunctions::bernoulli_( at::Tensor& self, const at::Tensor& p, - c10::optional generator) { + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn< @@ -1125,8 +1125,8 @@ at::Tensor& XLANativeFunctions::bernoulli_( at::Tensor XLANativeFunctions::binary_cross_entropy_with_logits( const at::Tensor& self, const at::Tensor& target, - const c10::optional& weight, - const c10::optional& pos_weight, int64_t reduction) { + const std::optional& weight, + const std::optional& pos_weight, int64_t reduction) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return at::native::binary_cross_entropy_with_logits( self, target, IsDefined(weight) ? *weight : at::Tensor(), @@ -1190,8 +1190,8 @@ at::Tensor& XLANativeFunctions::celu_(at::Tensor& self, } at::Tensor XLANativeFunctions::clamp(const at::Tensor& self, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor( tensor_methods::clamp(bridge::GetXlaTensor(self), min, max)); @@ -1201,19 +1201,19 @@ at::Tensor XLANativeFunctions::clamp_max(const at::Tensor& self, const at::Scalar& max) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor( - tensor_methods::clamp(bridge::GetXlaTensor(self), c10::nullopt, max)); + tensor_methods::clamp(bridge::GetXlaTensor(self), std::nullopt, max)); } at::Tensor XLANativeFunctions::clamp_min(const at::Tensor& self, const at::Scalar& min) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor( - tensor_methods::clamp(bridge::GetXlaTensor(self), min, c10::nullopt)); + tensor_methods::clamp(bridge::GetXlaTensor(self), min, std::nullopt)); } at::Tensor XLANativeFunctions::clone( const at::Tensor& self, - c10::optional /* memory_format */) { + std::optional /* memory_format */) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor( tensor_methods::clone(bridge::GetXlaTensor(self))); @@ -1230,7 +1230,7 @@ at::Tensor XLANativeFunctions::constant_pad_nd(const at::Tensor& self, // This functions covers the whole convolution lowering. 
at::Tensor XLANativeFunctions::convolution_overrideable( const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias, at::IntArrayRef stride, + const std::optional& bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); @@ -1286,7 +1286,7 @@ at::Tensor& XLANativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, at::Tensor XLANativeFunctions::cross(const at::Tensor& self, const at::Tensor& other, - c10::optional dim) { + std::optional dim) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor(tensor_methods::cross( bridge::GetXlaTensor(self), bridge::GetXlaTensor(other), @@ -1294,10 +1294,10 @@ at::Tensor XLANativeFunctions::cross(const at::Tensor& self, } at::Tensor XLANativeFunctions::cumprod(const at::Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); - c10::optional promoted_dtype = + std::optional promoted_dtype = PromoteIntegralType(self_tensor->dtype(), dtype); if (IsOperationOnType(promoted_dtype, self_tensor->dtype(), at::ScalarType::Long)) { @@ -1311,7 +1311,7 @@ at::Tensor XLANativeFunctions::cumprod(const at::Tensor& self, int64_t dim, } at::Tensor XLANativeFunctions::cumsum(const at::Tensor& self, int64_t dim, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); return bridge::AtenFromXlaTensor( @@ -1358,12 +1358,12 @@ at::Tensor XLANativeFunctions::diagonal_scatter(const at::Tensor& base, at::Tensor XLANativeFunctions::div(const at::Tensor& self, const at::Tensor& other) { return torch_xla::XLANativeFunctions::div(self, other, - /*rounding_mode=*/c10::nullopt); + /*rounding_mode=*/std::nullopt); } at::Tensor XLANativeFunctions::div( const at::Tensor& self, const at::Tensor& other, - c10::optional rounding_mode) { + std::optional rounding_mode) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); at::ScalarType dtype = at::result_type(self, other); auto operands = GetBinaryOperands(self, UnwrapNumber(other, dtype)); @@ -1461,7 +1461,7 @@ std::tuple XLANativeFunctions::_embedding_bag_forward_only( const at::Tensor& weight, const at::Tensor& indices, const at::Tensor& offsets, bool scale_grad_by_freq, int64_t mode, - bool sparse, const c10::optional& per_sample_weights, + bool sparse, const std::optional& per_sample_weights, bool include_last_offset, int64_t padding_idx) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (mode == 1 || scale_grad_by_freq || sparse || padding_idx != -1) { @@ -1490,12 +1490,12 @@ XLANativeFunctions::_embedding_bag_forward_only( } at::Tensor XLANativeFunctions::empty_symint( - at::SymIntArrayRef sym_size, c10::optional dtype, - c10::optional layout, c10::optional device, - c10::optional pin_memory, - c10::optional /* memory_format */) { + at::SymIntArrayRef sym_size, std::optional dtype, + std::optional layout, std::optional device, + std::optional pin_memory, + std::optional /* memory_format */) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); - c10::optional int_sizes = + std::optional int_sizes = c10::asIntArrayRefSlowOpt(sym_size); bool all_dims_static = int_sizes.has_value(); // PT empty*() are optimizations to avoid initializing the data when it is @@ -1514,26 +1514,26 @@ at::Tensor XLANativeFunctions::empty_symint( at::Tensor 
XLANativeFunctions::empty_strided_symint( at::SymIntArrayRef sym_size, at::SymIntArrayRef sym_stride, - c10::optional dtype, c10::optional layout, - c10::optional device, c10::optional pin_memory) { + std::optional dtype, std::optional layout, + std::optional device, std::optional pin_memory) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); - c10::optional size = c10::asIntArrayRefSlowOpt(sym_size); + std::optional size = c10::asIntArrayRefSlowOpt(sym_size); bool is_size_dynamic = !size.has_value(); - c10::optional stride = c10::asIntArrayRefSlowOpt(sym_stride); + std::optional stride = c10::asIntArrayRefSlowOpt(sym_stride); bool is_stride_dynamic = !stride.has_value(); // As XLATensor doesn't have a storage, it should not care about the memory // format or how to jump to the next element (strides). So the term stride // does not mean much to us. The size of the tensor has been set by the // above `empty_symint` so we feel it is ok to return here. return empty_symint(sym_size, dtype, layout, device, pin_memory, - c10::nullopt); + std::nullopt); } at::Tensor XLANativeFunctions::expand_copy_symint(const at::Tensor& self, at::SymIntArrayRef sym_size, bool implicit) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); - c10::optional size = c10::asIntArrayRefSlowOpt(sym_size); + std::optional size = c10::asIntArrayRefSlowOpt(sym_size); if (size.has_value()) { return bridge::AtenFromXlaTensor(tensor_methods::expand( bridge::GetXlaTensor(self), torch::lazy::ToVector(*size))); @@ -1546,7 +1546,7 @@ at::Tensor XLANativeFunctions::expand_copy_symint(const at::Tensor& self, } at::Tensor& XLANativeFunctions::exponential_( - at::Tensor& self, double lambd, c10::optional generator) { + at::Tensor& self, double lambd, std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn<&xla_cpu_fallback, @@ -1626,10 +1626,10 @@ at::Tensor XLANativeFunctions::fmod(const at::Tensor& self, at::Tensor XLANativeFunctions::full(at::IntArrayRef size, const at::Scalar& fill_value, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_LAZY_FN_COUNTER("xla::"); // Fall back to CPU if layout or pin_memory are not default if (layout.value_or(at::Layout::Strided) != at::Layout::Strided || @@ -1696,11 +1696,11 @@ at::Tensor XLANativeFunctions::hardtanh_backward(const at::Tensor& grad_output, at::Tensor XLANativeFunctions::index( const at::Tensor& self, - const c10::List>& indices) { + const c10::List>& indices) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); bool indices_on_cpu_or_xla = std::all_of(indices.begin(), indices.end(), - [=](const c10::optional& opt) { + [=](const std::optional& opt) { return opt.has_value() && opt->defined() ? 
(opt->is_cpu() || bridge::IsXlaTensor(*opt)) : true; @@ -1711,7 +1711,7 @@ at::Tensor XLANativeFunctions::index( << " When using XLA, the indexed tensor must be an XLA tensor."; CanonicalIndexInfo canonical_index_info = GetCanonicalIndexInfo(self, indices); - c10::optional device = + std::optional device = bridge::GetXlaDevice(canonical_index_info.base); if (!device.has_value()) { device = bridge::GetXlaDevice(canonical_index_info.indices); @@ -1764,12 +1764,12 @@ at::Tensor& XLANativeFunctions::index_fill_(at::Tensor& self, int64_t dim, } at::Tensor& XLANativeFunctions::index_put_( - at::Tensor& self, const c10::List>& indices, + at::Tensor& self, const c10::List>& indices, const at::Tensor& values, bool accumulate) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); bool indices_on_cpu_or_xla = std::all_of(indices.begin(), indices.end(), - [=](const c10::optional& opt) { + [=](const std::optional& opt) { return opt.has_value() && opt->defined() ? (opt->is_cpu() || bridge::IsXlaTensor(*opt)) : true; @@ -1781,7 +1781,7 @@ at::Tensor& XLANativeFunctions::index_put_( XLA_CHECK(self.scalar_type() == values.scalar_type()); CanonicalIndexInfo canonical_index_info = GetCanonicalIndexInfo(self, indices); - c10::optional device = + std::optional device = bridge::GetXlaDevice(canonical_index_info.base); if (!device.has_value()) { device = bridge::GetXlaDevice(canonical_index_info.indices); @@ -1902,10 +1902,10 @@ std::tuple XLANativeFunctions::linalg_inv_ex( at::Tensor XLANativeFunctions::linspace(const at::Scalar& start, const at::Scalar& end, int64_t steps, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); // Fall back to CPU if layout or pin_memory are not default if (layout.value_or(at::Layout::Strided) != at::Layout::Strided || @@ -1929,7 +1929,7 @@ at::Tensor XLANativeFunctions::log(const at::Tensor& self) { } at::Tensor XLANativeFunctions::logit(const at::Tensor& self, - c10::optional eps) { + std::optional eps) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor( tensor_methods::logit(bridge::GetXlaTensor(self), eps)); @@ -2140,7 +2140,7 @@ at::Tensor XLANativeFunctions::max_unpool3d(const at::Tensor& self, } at::Tensor XLANativeFunctions::mean(const at::Tensor& self, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); return bridge::AtenFromXlaTensor(tensor_methods::mean( @@ -2151,7 +2151,7 @@ at::Tensor XLANativeFunctions::mean(const at::Tensor& self, at::Tensor XLANativeFunctions::mean(const at::Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); return bridge::AtenFromXlaTensor(tensor_methods::mean( @@ -2222,7 +2222,7 @@ at::Tensor XLANativeFunctions::mul(const at::Tensor& self, const at::Tensor& other) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); using FnType = XLATensorPtr(const XLATensorPtr&, const XLATensorPtr&, - c10::optional); + std::optional); return OpConfig::From(static_cast(tensor_methods::mul)) .add_input(self) .add_input(other) @@ -2243,7 +2243,7 @@ at::Tensor XLANativeFunctions::mul(const at::Tensor& self, at::Tensor XLANativeFunctions::multinomial( const at::Tensor& self, int64_t num_samples, bool 
replacement, - c10::optional generator) { + std::optional generator) { XLA_CHECK(num_samples > 0) << "Multinomial number of samples must be greater than 0"; XLA_CHECK(at::isFloatingType(self.scalar_type())) @@ -2281,9 +2281,9 @@ at::Tensor& XLANativeFunctions::mv_out(const at::Tensor& self, } at::Tensor XLANativeFunctions::nan_to_num(const at::Tensor& self, - c10::optional nan, - c10::optional posinf, - c10::optional neginf) { + std::optional nan, + std::optional posinf, + std::optional neginf) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); // nan_to_num doesn't apply to integer types. if (!at::native::is_floating_point(self)) { @@ -2310,10 +2310,10 @@ at::Tensor XLANativeFunctions::nan_to_num(const at::Tensor& self, std::tuple XLANativeFunctions::native_batch_norm( - const at::Tensor& input, const c10::optional& weight, - const c10::optional& bias, - const c10::optional& running_mean, - const c10::optional& running_var, bool training, + const at::Tensor& input, const std::optional& weight, + const std::optional& bias, + const std::optional& running_mean, + const std::optional& running_var, bool training, double momentum, double eps) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr input_tensor = bridge::GetXlaTensor(input); @@ -2333,8 +2333,8 @@ XLANativeFunctions::native_batch_norm( std::tuple XLANativeFunctions::_native_batch_norm_legit( - const at::Tensor& input, const c10::optional& weight, - const c10::optional& bias, at::Tensor& running_mean, + const at::Tensor& input, const std::optional& weight, + const std::optional& bias, at::Tensor& running_mean, at::Tensor& running_var, bool training, double momentum, double eps) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr input_tensor = bridge::GetXlaTensor(input); @@ -2352,8 +2352,8 @@ XLANativeFunctions::_native_batch_norm_legit( std::tuple XLANativeFunctions::_native_batch_norm_legit( - const at::Tensor& input, const c10::optional& weight, - const c10::optional& bias, bool training, double momentum, + const at::Tensor& input, const std::optional& weight, + const std::optional& bias, bool training, double momentum, double eps) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr input_tensor = bridge::GetXlaTensor(input); @@ -2372,11 +2372,11 @@ XLANativeFunctions::_native_batch_norm_legit( std::tuple XLANativeFunctions::native_batch_norm_backward( const at::Tensor& grad_out, const at::Tensor& input, - const c10::optional& weight, - const c10::optional& running_mean, - const c10::optional& running_var, - const c10::optional& save_mean, - const c10::optional& save_invstd, bool train, double eps, + const std::optional& weight, + const std::optional& running_mean, + const std::optional& running_var, + const std::optional& save_mean, + const std::optional& save_invstd, bool train, double eps, std::array output_mask) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr grad_out_tensor = bridge::GetXlaTensor(grad_out); @@ -2397,7 +2397,7 @@ XLANativeFunctions::native_batch_norm_backward( } std::tuple XLANativeFunctions::native_dropout( - const at::Tensor& self, double p, c10::optional train) { + const at::Tensor& self, double p, std::optional train) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); auto results = tensor_methods::native_dropout(self_tensor, p, train); @@ -2417,7 +2417,7 @@ at::Tensor XLANativeFunctions::neg(const at::Tensor& self) { at::Tensor XLANativeFunctions::nll_loss2d_backward( const at::Tensor& grad_output, const at::Tensor& 
self, - const at::Tensor& target, const c10::optional& weight, + const at::Tensor& target, const std::optional& weight, int64_t reduction, int64_t ignore_index, const at::Tensor& total_weight) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); @@ -2436,7 +2436,7 @@ at::Tensor XLANativeFunctions::nll_loss2d_backward( std::tuple XLANativeFunctions::nll_loss2d_forward( const at::Tensor& self, const at::Tensor& target, - const c10::optional& weight, int64_t reduction, + const std::optional& weight, int64_t reduction, int64_t ignore_index) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); @@ -2452,7 +2452,7 @@ std::tuple XLANativeFunctions::nll_loss2d_forward( at::Tensor XLANativeFunctions::nll_loss_backward( const at::Tensor& grad_output, const at::Tensor& self, - const at::Tensor& target, const c10::optional& weight, + const at::Tensor& target, const std::optional& weight, int64_t reduction, int64_t ignore_index, const at::Tensor& total_weight) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); @@ -2471,7 +2471,7 @@ at::Tensor XLANativeFunctions::nll_loss_backward( std::tuple XLANativeFunctions::nll_loss_forward( const at::Tensor& self, const at::Tensor& target, - const c10::optional& weight, int64_t reduction, + const std::optional& weight, int64_t reduction, int64_t ignore_index) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); @@ -2497,7 +2497,7 @@ at::Tensor XLANativeFunctions::nonzero(const at::Tensor& self) { } at::Tensor XLANativeFunctions::norm(const at::Tensor& self, - const c10::optional& p, + const std::optional& p, at::ScalarType dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); // If p==0 it is a torch.nonzero(), which is not lowered to XLA due to dynamic @@ -2521,11 +2521,11 @@ at::Tensor XLANativeFunctions::norm(const at::Tensor& self, ATEN_OP2(norm, Scalar)>::call(self, p); } return bridge::AtenFromXlaTensor(tensor_methods::norm( - bridge::GetXlaTensor(self), p, c10::nullopt, {}, /*keepdim=*/false)); + bridge::GetXlaTensor(self), p, std::nullopt, {}, /*keepdim=*/false)); } at::Tensor XLANativeFunctions::norm(const at::Tensor& self, - const c10::optional& p, + const std::optional& p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); @@ -2543,7 +2543,7 @@ at::Tensor XLANativeFunctions::norm(const at::Tensor& self, } at::Tensor XLANativeFunctions::norm(const at::Tensor& self, - const c10::optional& p, + const std::optional& p, at::IntArrayRef dim, bool keepdim) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); // If p==0 it is a torch.nonzero(), which is not lowered to XLA due to dynamic @@ -2554,11 +2554,11 @@ at::Tensor XLANativeFunctions::norm(const at::Tensor& self, keepdim); } return bridge::AtenFromXlaTensor(tensor_methods::norm( - bridge::GetXlaTensor(self), p, c10::nullopt, dim, keepdim)); + bridge::GetXlaTensor(self), p, std::nullopt, dim, keepdim)); } at::Tensor XLANativeFunctions::normal(const at::Tensor& mean, double std, - c10::optional generator) { + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn< @@ -2570,7 +2570,7 @@ at::Tensor XLANativeFunctions::normal(const at::Tensor& mean, double std, } at::Tensor XLANativeFunctions::normal(double mean, const at::Tensor& std, - c10::optional 
generator) { + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn< @@ -2583,7 +2583,7 @@ at::Tensor XLANativeFunctions::normal(double mean, const at::Tensor& std, at::Tensor XLANativeFunctions::normal(const at::Tensor& mean, const at::Tensor& std, - c10::optional generator) { + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn< @@ -2596,7 +2596,7 @@ at::Tensor XLANativeFunctions::normal(const at::Tensor& mean, at::Tensor& XLANativeFunctions::normal_( at::Tensor& self, double mean, double std, - c10::optional generator) { + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn<&xla_cpu_fallback, @@ -2619,7 +2619,7 @@ at::Tensor XLANativeFunctions::pow(const at::Tensor& self, const at::Scalar& exponent) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr (*method_pow)(const XLATensorPtr&, const at::Scalar&, - c10::optional) = + std::optional) = tensor_methods::pow; return DoBinaryOp(self, exponent, method_pow); } @@ -2628,7 +2628,7 @@ at::Tensor XLANativeFunctions::pow(const at::Tensor& self, const at::Tensor& exponent) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr (*method_pow)(const XLATensorPtr&, const XLATensorPtr&, - c10::optional) = + std::optional) = tensor_methods::pow; return DoBinaryOp(self, exponent, method_pow); } @@ -2637,7 +2637,7 @@ at::Tensor XLANativeFunctions::pow(const at::Scalar& self, const at::Tensor& exponent) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr (*method_pow)(const at::Scalar&, const XLATensorPtr&, - c10::optional) = + std::optional) = tensor_methods::pow; return DoBinaryOp(self, exponent, method_pow); } @@ -2681,7 +2681,7 @@ std::tuple XLANativeFunctions::_prelu_kernel_backward( } at::Tensor XLANativeFunctions::prod(const at::Tensor& self, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); return bridge::AtenFromXlaTensor(tensor_methods::prod( @@ -2693,7 +2693,7 @@ at::Tensor XLANativeFunctions::prod(const at::Tensor& self, at::Tensor XLANativeFunctions::prod(const at::Tensor& self, int64_t dim, bool keepdim, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); return bridge::AtenFromXlaTensor( tensor_methods::prod(bridge::GetXlaTensor(self), {dim}, keepdim, @@ -2770,8 +2770,8 @@ std::tuple XLANativeFunctions::qr( // The value generated should be within (from, to]. at::Tensor& XLANativeFunctions::random_( - at::Tensor& self, int64_t from, c10::optional to, - c10::optional generator) { + at::Tensor& self, int64_t from, std::optional to, + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn< @@ -2791,7 +2791,7 @@ at::Tensor& XLANativeFunctions::random_( // The value generated should be in (0, to]. 
at::Tensor& XLANativeFunctions::random_( - at::Tensor& self, int64_t to, c10::optional generator) { + at::Tensor& self, int64_t to, std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn<&xla_cpu_fallback, @@ -2807,7 +2807,7 @@ at::Tensor& XLANativeFunctions::random_( // The value generated should be in (self_type_min, self_type_max). at::Tensor& XLANativeFunctions::random_( - at::Tensor& self, c10::optional generator) { + at::Tensor& self, std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn<&xla_cpu_fallback, @@ -2824,10 +2824,10 @@ at::Tensor& XLANativeFunctions::random_( } at::Tensor XLANativeFunctions::randperm(int64_t n, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); // Only support the basic version of randperm(int64_t) to start. If there are @@ -2961,7 +2961,7 @@ at::Tensor XLANativeFunctions::replication_pad3d_backward( const at::Tensor& XLANativeFunctions::resize_( const at::Tensor& self, at::IntArrayRef size, - c10::optional /* memory_format */) { + std::optional /* memory_format */) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); tensor_methods::resize_(self_tensor, XlaHelpers::I64List(size)); @@ -2980,7 +2980,7 @@ at::Tensor XLANativeFunctions::roll(const at::Tensor& self, at::Tensor XLANativeFunctions::rrelu_with_noise( const at::Tensor& self, const at::Tensor& noise, const at::Scalar& lower, const at::Scalar& upper, bool training, - c10::optional generator) { + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { // The fallback path for rrelu_with_noise when training=true is wrong @@ -3034,7 +3034,7 @@ at::Tensor XLANativeFunctions::rsub(const at::Tensor& self, at::Tensor scatter_reduce_helper(const at::Tensor& self, int64_t dim, const at::Tensor& index, const at::Tensor& src, - c10::optional reduce) { + std::optional reduce) { XLATensorPtr self_tensor = bridge::GetXlaTensor(self); if (!reduce.has_value()) { return bridge::AtenFromXlaTensor( @@ -3055,7 +3055,7 @@ at::Tensor scatter_reduce_helper(const at::Tensor& self, int64_t dim, at::Tensor scatter_reduce_helper(const at::Tensor& self, int64_t dim, const at::Tensor& index, const at::Scalar& value, - c10::optional reduce) { + std::optional reduce) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); if (!reduce.has_value()) { @@ -3077,14 +3077,14 @@ at::Tensor XLANativeFunctions::scatter(const at::Tensor& self, int64_t dim, const at::Tensor& index, const at::Tensor& src) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); - return scatter_reduce_helper(self, dim, index, src, c10::nullopt); + return scatter_reduce_helper(self, dim, index, src, std::nullopt); } at::Tensor XLANativeFunctions::scatter(const at::Tensor& self, int64_t dim, const at::Tensor& index, const at::Scalar& value) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); - return scatter_reduce_helper(self, dim, index, value, c10::nullopt); + return scatter_reduce_helper(self, dim, index, value, std::nullopt); } at::Tensor XLANativeFunctions::scatter(const at::Tensor& self, int64_t 
dim, @@ -3191,8 +3191,8 @@ at::Tensor XLANativeFunctions::sigmoid_backward(const at::Tensor& grad_output, } at::Tensor XLANativeFunctions::slice_copy(const at::Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, + std::optional start, + std::optional end, int64_t step) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); int64_t start_val = start.has_value() ? start.value() : 0; @@ -3205,7 +3205,7 @@ at::Tensor XLANativeFunctions::slice_copy(const at::Tensor& self, int64_t dim, at::Tensor XLANativeFunctions::slice_scatter( const at::Tensor& base, const at::Tensor& mutated_view, int64_t dim, - c10::optional start, c10::optional end, int64_t step) { + std::optional start, std::optional end, int64_t step) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); auto base_ = bridge::GetXlaTensor(base); auto mutated_view_ = bridge::GetXlaTensor(mutated_view); @@ -3279,7 +3279,7 @@ std::tuple XLANativeFunctions::sort( } std::tuple XLANativeFunctions::sort( - const at::Tensor& self, c10::optional stable, int64_t dim, + const at::Tensor& self, std::optional stable, int64_t dim, bool descending) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); auto results = tensor_methods::topk( @@ -3360,7 +3360,7 @@ at::Tensor XLANativeFunctions::std(const at::Tensor& self, at::Tensor XLANativeFunctions::std(const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); @@ -3373,7 +3373,7 @@ at::Tensor XLANativeFunctions::std(const at::Tensor& self, std::tuple XLANativeFunctions::std_mean( const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); auto results = tensor_methods::std_mean( @@ -3418,7 +3418,7 @@ at::Tensor XLANativeFunctions::sub(const at::Tensor& self, } at::Tensor XLANativeFunctions::sum(const at::Tensor& self, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); return bridge::AtenFromXlaTensor(tensor_methods::sum( @@ -3429,7 +3429,7 @@ at::Tensor XLANativeFunctions::sum(const at::Tensor& self, at::Tensor XLANativeFunctions::sum(const at::Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, - c10::optional dtype) { + std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); return bridge::AtenFromXlaTensor(tensor_methods::sum( @@ -3523,7 +3523,7 @@ std::vector XLANativeFunctions::unbind_copy(const at::Tensor& self, at::Tensor& XLANativeFunctions::uniform_( at::Tensor& self, double from, double to, - c10::optional generator) { + std::optional generator) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); if (generator.has_value() && generator->defined()) { return at::native::call_fallback_fn<&xla_cpu_fallback, @@ -3544,7 +3544,7 @@ at::Tensor XLANativeFunctions::unsqueeze_copy(const at::Tensor& self, at::Tensor XLANativeFunctions::upsample_bilinear2d( const at::Tensor& self, at::IntArrayRef output_size, bool align_corners, - c10::optional scales_h, c10::optional scales_w) { + std::optional scales_h, std::optional scales_w) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); absl::Span input_dims = @@ -3567,7 +3567,7 
@@ at::Tensor XLANativeFunctions::upsample_bilinear2d( at::Tensor XLANativeFunctions::upsample_bilinear2d_backward( const at::Tensor& grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, - c10::optional scales_h, c10::optional scales_w) { + std::optional scales_h, std::optional scales_w) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr grad_output_tensor = bridge::GetXlaTensor(grad_output); // Only the XLA TPU backend for now implements the CustomCall required by @@ -3599,7 +3599,7 @@ at::Tensor XLANativeFunctions::upsample_bilinear2d_backward( at::Tensor XLANativeFunctions::upsample_nearest2d( const at::Tensor& self, at::IntArrayRef output_size, - c10::optional scales_h, c10::optional scales_w) { + std::optional scales_h, std::optional scales_w) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); absl::Span input_dims = @@ -3621,8 +3621,8 @@ at::Tensor XLANativeFunctions::upsample_nearest2d( at::Tensor XLANativeFunctions::upsample_nearest2d_backward( const at::Tensor& grad_output, at::IntArrayRef output_size, - at::IntArrayRef input_size, c10::optional scales_h, - c10::optional scales_w) { + at::IntArrayRef input_size, std::optional scales_h, + std::optional scales_w) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr grad_output_tensor = bridge::GetXlaTensor(grad_output); // Only the XLA TPU backend for now implements the CustomCall required by @@ -3654,7 +3654,7 @@ at::Tensor XLANativeFunctions::upsample_nearest2d_backward( at::Tensor XLANativeFunctions::var(const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, + const std::optional& correction, bool keepdim) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); @@ -3668,7 +3668,7 @@ at::Tensor XLANativeFunctions::var(const at::Tensor& self, std::tuple XLANativeFunctions::var_mean( const at::Tensor& self, at::OptionalIntArrayRef dim, - const c10::optional& correction, bool keepdim) { + const std::optional& correction, bool keepdim) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); auto results = tensor_methods::var_mean( @@ -3710,7 +3710,7 @@ at::Tensor XLANativeFunctions::view_as_real_copy(const at::Tensor& self) { at::Tensor XLANativeFunctions::view_copy_symint(const at::Tensor& self, at::SymIntArrayRef shape) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); - c10::optional int_shape = c10::asIntArrayRefSlowOpt(shape); + std::optional int_shape = c10::asIntArrayRefSlowOpt(shape); bool input_shape_static = int_shape.has_value(); XLATensorPtr xla_input = bridge::GetXlaTensor(self); bool input_has_dyn_shape = xla_input->shape().get().is_dynamic(); @@ -3744,7 +3744,7 @@ at::Tensor& XLANativeFunctions::zero_(at::Tensor& self) { std::tuple XLANativeFunctions::_linalg_svd( const at::Tensor& self, bool full_matrices, bool compute_uv, - c10::optional /* driver */) { + std::optional /* driver */) { // The optional driver string is only for CUDA with a cuSOLVER backend. 
TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); // As per https://pytorch.org/docs/stable/generated/torch.svd.html, @@ -3786,8 +3786,8 @@ at::Scalar XLANativeFunctions::_local_scalar_dense(const at::Tensor& self) { std::tuple XLANativeFunctions::native_layer_norm(const at::Tensor& input, at::IntArrayRef normalized_shape, - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, double eps) { return at::native::math_native_layer_norm(input, normalized_shape, weight, bias, eps); @@ -3797,8 +3797,8 @@ XLANativeFunctions::native_layer_norm(const at::Tensor& input, // backwards formula for native_group_norm std::tuple XLANativeFunctions::native_group_norm(const at::Tensor& input, - const c10::optional& weight, - const c10::optional& bias, + const std::optional& weight, + const std::optional& bias, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps) { return at::native::math_group_norm(input, weight, bias, N, C, HxW, group, @@ -3807,7 +3807,7 @@ XLANativeFunctions::native_group_norm(const at::Tensor& input, at::Tensor XLANativeFunctions::_cdist_forward( const at::Tensor& x1, const at::Tensor& x2, double p, - c10::optional compute_mode) { + std::optional compute_mode) { // compute_mode is ignored because the use_mm_for_euclid_dist lowering // (compute_mode is 0 or 1) is achieved through composite ops from // native pytorch. @@ -3849,7 +3849,7 @@ at::Tensor XLANativeFunctions::block_diag(at::TensorList tensors) { at::Tensor XLANativeFunctions::_convolution( const at::Tensor& input, const at::Tensor& weight, - const c10::optional& bias, at::IntArrayRef stride, + const std::optional& bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { @@ -3906,7 +3906,7 @@ XLANativeFunctions::convolution_backward( } at::Tensor XLANativeFunctions::count_nonzero(const at::Tensor& self, - c10::optional dim) { + std::optional dim) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr xla_tensor = bridge::GetXlaTensor(self); std::vector dims; @@ -3973,8 +3973,8 @@ at::Tensor XLANativeFunctions::_euclidean_dist(const at::Tensor& x1, at::Tensor XLANativeFunctions::new_empty_strided_symint( const at::Tensor& self, at::SymIntArrayRef size, at::SymIntArrayRef stride, - c10::optional dtype, c10::optional layout, - c10::optional device, c10::optional pin_memory) { + std::optional dtype, std::optional layout, + std::optional device, std::optional pin_memory) { // See Note: [Disabling functionalization] if (runtime::sys_util::GetEnvBool("XLA_DISABLE_FUNCTIONALIZATION", false)) { return at::native::new_empty_strided_symint(self, size, stride, dtype, @@ -4032,8 +4032,8 @@ at::Tensor XLANativeFunctions::select_symint(const at::Tensor& self, } at::Tensor XLANativeFunctions::slice(const at::Tensor& self, int64_t dim, - c10::optional start, - c10::optional end, int64_t step) { + std::optional start, + std::optional end, int64_t step) { // See Note: [Disabling functionalization] if (runtime::sys_util::GetEnvBool("XLA_DISABLE_FUNCTIONALIZATION", false)) { return slice_copy(self, dim, start, end, step); @@ -4062,8 +4062,8 @@ at::Tensor XLANativeFunctions::_trilinear( } at::Tensor XLANativeFunctions::linalg_pinv( - const at::Tensor& self, const c10::optional& atol, - const c10::optional& rtol, bool hermitian) { + const at::Tensor& self, const std::optional& atol, + const std::optional& rtol, bool 
hermitian) { XLA_CHECK( !runtime::sys_util::GetEnvBool("XLA_DISABLE_FUNCTIONALIZATION", false)); return at::functionalization::functionalize_aten_op dtype) { + bool keepdim, std::optional dtype) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLA_CHECK(at::isFloatingType(self.scalar_type())) << "Input must be a floating type"; @@ -4126,7 +4126,7 @@ at::Tensor XLANativeFunctions::permute(const at::Tensor& self, // For ops below, see note [Disabling Functionalization] at::Tensor XLANativeFunctions::as_strided( const at::Tensor& self, at::IntArrayRef size, at::IntArrayRef stride, - c10::optional storage_offset) { + std::optional storage_offset) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); const auto& base = bridge::GetXlaTensor(self)->Base(); const auto& tensor = base.defined() ? base : self; @@ -4146,7 +4146,7 @@ at::Tensor XLANativeFunctions::as_strided( const at::Tensor& XLANativeFunctions::as_strided_( const at::Tensor& self, at::IntArrayRef size, at::IntArrayRef stride, - c10::optional storage_offset) { + std::optional storage_offset) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); XLATensorPtr self_tensor = bridge::GetXlaTensor(self); auto xsize = XlaHelpers::I64List(size); @@ -4173,7 +4173,7 @@ at::Tensor XLANativeFunctions::expand_symint(const at::Tensor& self, at::SymIntArrayRef sym_size, bool implicit) { TORCH_LAZY_FN_COUNTER_TIMED_TRACING("xla::"); - c10::optional size = c10::asIntArrayRefSlowOpt(sym_size); + std::optional size = c10::asIntArrayRefSlowOpt(sym_size); if (size.has_value()) { return bridge::AtenFromXlaTensor(tensor_methods::expand( bridge::GetXlaTensor(self), torch::lazy::ToVector(*size))); diff --git a/torch_xla/csrc/convert_ops.cpp b/torch_xla/csrc/convert_ops.cpp index cd86e0f3169..4088ad44249 100644 --- a/torch_xla/csrc/convert_ops.cpp +++ b/torch_xla/csrc/convert_ops.cpp @@ -80,7 +80,7 @@ xla::XlaOp ConvertToNumeric(xla::XlaOp op) { } xla::XlaOp CastToScalarType(xla::XlaOp input, - c10::optional dtype) { + std::optional dtype) { if (dtype) { torch::lazy::BackendDevice xla_device = bridge::GetCurrentDevice(); return ConvertTo(input, XlaHelpers::TypeOfXlaOp(input), diff --git a/torch_xla/csrc/convert_ops.h b/torch_xla/csrc/convert_ops.h index 029599667bd..71c72d1062f 100644 --- a/torch_xla/csrc/convert_ops.h +++ b/torch_xla/csrc/convert_ops.h @@ -24,7 +24,7 @@ xla::XlaOp ConvertToNumeric(xla::XlaOp op); // Cast the input to the given dtype. If dtype is null, no-op with the exception // of predicates, which are converted to 8-bit unsigned integers. 
xla::XlaOp CastToScalarType(xla::XlaOp input, - c10::optional dtype); + std::optional dtype); xla::XlaOp MaybeConvertTo(xla::XlaOp input, xla::PrimitiveType type); diff --git a/torch_xla/csrc/data_ops.cpp b/torch_xla/csrc/data_ops.cpp index 79b339500e0..8e60c235a4b 100644 --- a/torch_xla/csrc/data_ops.cpp +++ b/torch_xla/csrc/data_ops.cpp @@ -55,7 +55,7 @@ bool IsSparseGather(xla::XlaOp input, xla::XlaOp index, int64_t dim) { std::vector GetCompleteShape(absl::Span output_sizes, absl::Span input_sizes) { - c10::optional incomplete_dim; + std::optional incomplete_dim; int64_t incomplete_element_count = 1; for (size_t dim = 0; dim < output_sizes.size(); ++dim) { int64_t dim_size = output_sizes[dim]; diff --git a/torch_xla/csrc/elementwise.cpp b/torch_xla/csrc/elementwise.cpp index 4facf43f6c8..fb89c397e8e 100644 --- a/torch_xla/csrc/elementwise.cpp +++ b/torch_xla/csrc/elementwise.cpp @@ -413,7 +413,7 @@ xla::XlaOp BuildLogSigmoidBackward(xla::XlaOp grad_output, xla::XlaOp input, return grad_output * (xla::Neg(max_deriv) - sign * (buffer - one) / buffer); } -xla::XlaOp BuildLogit(xla::XlaOp input, c10::optional eps) { +xla::XlaOp BuildLogit(xla::XlaOp input, std::optional eps) { const xla::Shape& shape = ShapeHelper::ShapeOfXlaOp(input); xla::XlaOp one = XlaHelpers::ScalarValue(1.0, shape.element_type(), input.builder()); diff --git a/torch_xla/csrc/elementwise.h b/torch_xla/csrc/elementwise.h index 947a48dbe60..b7d61eb4b23 100644 --- a/torch_xla/csrc/elementwise.h +++ b/torch_xla/csrc/elementwise.h @@ -98,7 +98,7 @@ std::vector BuildLogSigmoid(xla::XlaOp input); // Computes the logit function of the input. // If eps is given, the input is clamped between eps and 1-eps. -xla::XlaOp BuildLogit(xla::XlaOp input, c10::optional eps); +xla::XlaOp BuildLogit(xla::XlaOp input, std::optional eps); // Computes the division of input and the divisor. xla::XlaOp BuildDiv(xla::XlaOp input, xla::XlaOp divisor); diff --git a/torch_xla/csrc/helpers.cpp b/torch_xla/csrc/helpers.cpp index 349e5d4e253..0be6346004b 100644 --- a/torch_xla/csrc/helpers.cpp +++ b/torch_xla/csrc/helpers.cpp @@ -1009,11 +1009,11 @@ xla::StatusOr XlaHelpers::WrapXlaComputation( torch::lazy::Shape XlaHelpers::ConvertXlaShapeToLazy(const xla::Shape& shape) { at::ScalarType scalar_type = MaybeUpcastToHostTorchType(shape.element_type()); - c10::optional> is_symbolic = c10::nullopt; + std::optional> is_symbolic = std::nullopt; if (shape.is_dynamic()) { std::vector xla_dynamic_dimensions = runtime::util::ToVector(shape.dynamic_dimensions()); - is_symbolic = c10::make_optional(xla_dynamic_dimensions); + is_symbolic = std::make_optional(xla_dynamic_dimensions); } return torch::lazy::Shape( diff --git a/torch_xla/csrc/helpers.h b/torch_xla/csrc/helpers.h index efe5e4d0b00..c1609cca88b 100644 --- a/torch_xla/csrc/helpers.h +++ b/torch_xla/csrc/helpers.h @@ -198,8 +198,8 @@ class XlaHelpers { return torch::lazy::ToVector(input); } - static c10::optional I64Optional(c10::optional opt) { - return opt ? c10::optional(*opt) : c10::nullopt; + static std::optional I64Optional(std::optional opt) { + return opt ? std::optional(*opt) : std::nullopt; } // Creates an XLA padding configuration from a n-dimensional padding list. 
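Note on the idiom in the helpers.h hunk above: in recent PyTorch releases c10::optional has become an alias for std::optional, which is what makes this mechanical rename safe, and helpers such as I64Optional reduce to plain std::optional conversion with std::nullopt propagation. A minimal standalone sketch of that pattern follows; ToI64Optional is an illustrative name, not a function in this tree.

#include <cstdint>
#include <optional>

// Convert an optional of any integer-like type to std::optional<int64_t>,
// propagating emptiness, mirroring the I64Optional change above.
template <typename T>
std::optional<int64_t> ToI64Optional(std::optional<T> opt) {
  return opt ? std::optional<int64_t>(static_cast<int64_t>(*opt))
             : std::nullopt;
}

int main() {
  std::optional<int> present = 7;
  std::optional<int> absent;
  auto a = ToI64Optional(present);  // holds 7
  auto b = ToI64Optional(absent);   // std::nullopt
  return (a.has_value() && !b.has_value()) ? 0 : 1;
}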
diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp index fea968dc1a1..a6e3196c5d2 100644 --- a/torch_xla/csrc/init_python_bindings.cpp +++ b/torch_xla/csrc/init_python_bindings.cpp @@ -105,10 +105,10 @@ class PyPjRtPlugin : public runtime::PjRtPlugin { } }; -c10::optional GetOptionalDevice( +std::optional GetOptionalDevice( const std::string& device_str) { if (device_str.empty()) { - return c10::nullopt; + return std::nullopt; } return bridge::AtenDeviceToXlaDevice(c10::Device(device_str)); } diff --git a/torch_xla/csrc/ir_builder.h b/torch_xla/csrc/ir_builder.h index be0f0412065..1b7915eaa56 100644 --- a/torch_xla/csrc/ir_builder.h +++ b/torch_xla/csrc/ir_builder.h @@ -38,8 +38,8 @@ struct XLAIrBuilder : torch::lazy::IrBuilder { } torch::lazy::NodePtr MakeCast(const torch::lazy::Value& input0, const at::ScalarType& dtype, - const c10::optional& stype = - c10::nullopt) const override { + const std::optional& stype = + std::nullopt) const override { return torch::lazy::MakeNode(input0, dtype, stype); } torch::lazy::NodePtr MakeTensorList( diff --git a/torch_xla/csrc/ops/cast.cpp b/torch_xla/csrc/ops/cast.cpp index f1a0a1a9072..1a021bb4e63 100644 --- a/torch_xla/csrc/ops/cast.cpp +++ b/torch_xla/csrc/ops/cast.cpp @@ -32,7 +32,7 @@ Cast::Cast(const torch::lazy::Value& input, xla::PrimitiveType type) type_(type) {} Cast::Cast(const torch::lazy::Value& input, at::ScalarType dtype, - c10::optional stype) + std::optional stype) : XlaNode(xla_cast, {input}, NodeOutputShape(input, MakeXlaPrimitiveType(dtype, /*device=*/nullptr)), diff --git a/torch_xla/csrc/ops/cast.h b/torch_xla/csrc/ops/cast.h index 04d02c1e5ba..b2322c80fa7 100644 --- a/torch_xla/csrc/ops/cast.h +++ b/torch_xla/csrc/ops/cast.h @@ -12,7 +12,7 @@ class Cast : public XlaNode { public: Cast(const torch::lazy::Value& input, xla::PrimitiveType type); Cast(const torch::lazy::Value& input, at::ScalarType dtype, - c10::optional stype = c10::nullopt); + std::optional stype = std::nullopt); std::string ToString() const override; @@ -22,14 +22,14 @@ class Cast : public XlaNode { xla::PrimitiveType type() const { return type_; } - const c10::optional& dtype() const { return dtype_; }; + const std::optional& dtype() const { return dtype_; }; - const c10::optional& stype() const { return stype_; }; + const std::optional& stype() const { return stype_; }; private: xla::PrimitiveType type_; - c10::optional dtype_; - c10::optional stype_; + std::optional dtype_; + std::optional stype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/count_nonzero.h b/torch_xla/csrc/ops/count_nonzero.h index 12e0d063be9..edfec464af3 100644 --- a/torch_xla/csrc/ops/count_nonzero.h +++ b/torch_xla/csrc/ops/count_nonzero.h @@ -15,7 +15,7 @@ class CountNonzero : public XlaNode { XlaOpVector Lower(LoweringContext* loctx) const override; - c10::optional> dims() const { return dims_; } + std::optional> dims() const { return dims_; } private: std::vector dims_; diff --git a/torch_xla/csrc/ops/cumprod.cpp b/torch_xla/csrc/ops/cumprod.cpp index 5da237c6881..b9e333259f4 100644 --- a/torch_xla/csrc/ops/cumprod.cpp +++ b/torch_xla/csrc/ops/cumprod.cpp @@ -16,7 +16,7 @@ namespace torch_xla { namespace { xla::XlaOp LowerCumProd(xla::XlaOp input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { xla::XlaOp casted_input = CastToScalarType(input, dtype); const xla::Shape& input_shape = ShapeHelper::ShapeOfXlaOp(casted_input); xla::XlaOp init = @@ -27,7 +27,7 @@ xla::XlaOp LowerCumProd(xla::XlaOp input, int64_t dim, 
} xla::Shape NodeOutputShape(const torch::lazy::Value& input, - c10::optional dtype) { + std::optional dtype) { if (dtype) { return xla::ShapeUtil::ChangeElementType( GetXlaShape(input), MakeXlaPrimitiveType(*dtype, /*device=*/nullptr)); @@ -38,7 +38,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input, } // namespace CumProd::CumProd(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype) + std::optional dtype) : XlaNode( torch::lazy::OpKind(at::aten::cumprod), {input}, [&]() { return NodeOutputShape(input, dtype); }, diff --git a/torch_xla/csrc/ops/cumprod.h b/torch_xla/csrc/ops/cumprod.h index 85789490fe0..2ffb2d35811 100644 --- a/torch_xla/csrc/ops/cumprod.h +++ b/torch_xla/csrc/ops/cumprod.h @@ -11,7 +11,7 @@ namespace torch_xla { class CumProd : public XlaNode { public: CumProd(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype); + std::optional dtype); std::string ToString() const override; @@ -21,11 +21,11 @@ class CumProd : public XlaNode { int64_t dim() const { return dim_; } - const c10::optional& dtype() const { return dtype_; } + const std::optional& dtype() const { return dtype_; } private: int64_t dim_; - c10::optional dtype_; + std::optional dtype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/cumsum.cpp b/torch_xla/csrc/ops/cumsum.cpp index 0181cbbadd5..e0b64fc94aa 100644 --- a/torch_xla/csrc/ops/cumsum.cpp +++ b/torch_xla/csrc/ops/cumsum.cpp @@ -15,7 +15,7 @@ namespace torch_xla { namespace { xla::XlaOp LowerCumSum(xla::XlaOp input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { xla::XlaOp casted_input = CastToScalarType(input, dtype); const xla::Shape& input_shape = ShapeHelper::ShapeOfXlaOp(casted_input); xla::XlaOp init = XlaHelpers::ScalarValue( @@ -26,7 +26,7 @@ xla::XlaOp LowerCumSum(xla::XlaOp input, int64_t dim, } xla::Shape NodeOutputShape(const torch::lazy::Value& input, - c10::optional dtype) { + std::optional dtype) { if (dtype) { return xla::ShapeUtil::ChangeElementType( GetXlaShape(input), MakeXlaPrimitiveType(*dtype, /*device=*/nullptr)); @@ -37,7 +37,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input, } // namespace CumSum::CumSum(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype) + std::optional dtype) : XlaNode( torch::lazy::OpKind(at::aten::cumsum), {input}, [&]() { return NodeOutputShape(input, dtype); }, diff --git a/torch_xla/csrc/ops/cumsum.h b/torch_xla/csrc/ops/cumsum.h index 213c6d691fc..2282a52193c 100644 --- a/torch_xla/csrc/ops/cumsum.h +++ b/torch_xla/csrc/ops/cumsum.h @@ -11,7 +11,7 @@ namespace torch_xla { class CumSum : public XlaNode { public: CumSum(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype); + std::optional dtype); std::string ToString() const override; @@ -21,11 +21,11 @@ class CumSum : public XlaNode { int64_t dim() const { return dim_; } - const c10::optional& dtype() const { return dtype_; } + const std::optional& dtype() const { return dtype_; } private: int64_t dim_; - c10::optional dtype_; + std::optional dtype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/index_ops.cpp b/torch_xla/csrc/ops/index_ops.cpp index ccc3d090a56..b3a3a49480d 100644 --- a/torch_xla/csrc/ops/index_ops.cpp +++ b/torch_xla/csrc/ops/index_ops.cpp @@ -28,8 +28,8 @@ namespace torch_xla { namespace { void CheckIndexTensorTypes( - const c10::List>& indices) { - for (const c10::optional& tensor : indices) { + const c10::List>& indices) { + for (const std::optional& tensor : indices) { if (tensor.has_value() && 
tensor->defined()) { at::ScalarType scalar_type = tensor->scalar_type(); if (scalar_type != at::kLong && scalar_type != at::kInt && @@ -47,9 +47,9 @@ void CheckIndexTensorTypes( // This is a version of at::native::expandByteTensors with style adjustments. std::vector ExpandByteTensors( const at::Tensor& self, - const c10::List>& indices) { + const c10::List>& indices) { std::vector result; - for (const c10::optional& index : indices) { + for (const std::optional& index : indices) { if (index.has_value() && (index->scalar_type() == at::kByte || index->scalar_type() == at::kBool)) { // The sizes of the ByteTensor mask must match the sizes of the @@ -161,7 +161,7 @@ std::vector WrapIndicesOnce( wrapped_dim_index = XLATensor::Create( dim_index->GetIrValue() + XLAGraphExecutor::Get()->GetIrValueForScalar( - dim_size, dim_index->shape(), sym_int_elements, c10::nullopt, + dim_size, dim_index->shape(), sym_int_elements, std::nullopt, base->GetDevice()), base->GetDevice()); } @@ -256,7 +256,7 @@ torch::lazy::NodePtr IndexCopyOp(const torch::lazy::Value& buffer, int64_t dim, CanonicalIndexInfo GetCanonicalIndexInfo( const at::Tensor& base, - const c10::List>& orig_indices) { + const c10::List>& orig_indices) { CheckIndexTensorTypes(orig_indices); // First expand ByteTensor (boolean masks) into 1 or more LongTensors, then // broadcast all index tensors together. diff --git a/torch_xla/csrc/ops/index_ops.h b/torch_xla/csrc/ops/index_ops.h index e62b2e7e8fb..0be63ed0455 100644 --- a/torch_xla/csrc/ops/index_ops.h +++ b/torch_xla/csrc/ops/index_ops.h @@ -49,7 +49,7 @@ struct CanonicalIndexInfo { // are reordered to be consistent with this reordering. CanonicalIndexInfo GetCanonicalIndexInfo( const at::Tensor& base, - const c10::List>& orig_indices); + const c10::List>& orig_indices); // Expands a rank <= 1 tensor to rank 1, if necessary. 
torch::lazy::Value EnsureRank1(const torch::lazy::Value& index); diff --git a/torch_xla/csrc/ops/log_softmax.cpp b/torch_xla/csrc/ops/log_softmax.cpp index c0f9d43df6c..0dc8672ad2b 100644 --- a/torch_xla/csrc/ops/log_softmax.cpp +++ b/torch_xla/csrc/ops/log_softmax.cpp @@ -12,13 +12,13 @@ namespace torch_xla { namespace { xla::XlaOp LowerLogSoftmax(xla::XlaOp input, int64_t dim, - const c10::optional& dtype) { + const std::optional& dtype) { xla::XlaOp result = BuildLogSoftmax(input, dim); return CastToScalarType(result, dtype); } xla::Shape NodeOutputShape(const torch::lazy::Value& input, - const c10::optional& dtype) { + const std::optional& dtype) { if (dtype) { return xla::ShapeUtil::ChangeElementType( GetXlaShape(input), MakeXlaPrimitiveType(*dtype, /*device=*/nullptr)); @@ -29,7 +29,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input, } // namespace LogSoftmax::LogSoftmax(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype, + std::optional dtype, std::vector&& shapes) : XlaNode( torch::lazy::OpKind(at::aten::log_softmax), {input}, diff --git a/torch_xla/csrc/ops/log_softmax.h b/torch_xla/csrc/ops/log_softmax.h index 89d8b5db5d4..33a0cb522a6 100644 --- a/torch_xla/csrc/ops/log_softmax.h +++ b/torch_xla/csrc/ops/log_softmax.h @@ -12,7 +12,7 @@ namespace torch_xla { class LogSoftmax : public XlaNode { public: LogSoftmax(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype, + std::optional dtype, std::vector&& shapes); XlaOpVector Lower(LoweringContext* loctx) const override; @@ -21,12 +21,12 @@ class LogSoftmax : public XlaNode { int64_t dim() const { return dim_; } - const c10::optional& dtype() const { return dtype_; } + const std::optional& dtype() const { return dtype_; } private: // The dimension along which the result is computed. int64_t dim_; - c10::optional dtype_; + std::optional dtype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/mean.cpp b/torch_xla/csrc/ops/mean.cpp index 2689651848f..32b8f674028 100644 --- a/torch_xla/csrc/ops/mean.cpp +++ b/torch_xla/csrc/ops/mean.cpp @@ -15,7 +15,7 @@ namespace { xla::XlaOp LowerMean(xla::XlaOp input, const std::vector& dimensions, bool keep_reduced_dimensions, - const c10::optional& dtype) { + const std::optional& dtype) { xla::XlaOp result = BuildMean(input, dimensions, keep_reduced_dimensions); return dtype ? 
xla::ConvertElementType( result, MakeXlaPrimitiveType(*dtype, /*device=*/nullptr)) @@ -25,7 +25,7 @@ xla::XlaOp LowerMean(xla::XlaOp input, const std::vector& dimensions, xla::Shape NodeOutputShape(const torch::lazy::Value& input, const std::vector& dimensions, bool keep_reduced_dimensions, - const c10::optional& dtype) { + const std::optional& dtype) { auto lower_for_shape_fn = [&](absl::Span operands) -> xla::XlaOp { return LowerMean(operands[0], dimensions, keep_reduced_dimensions, dtype); @@ -36,7 +36,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input, } // namespace Mean::Mean(const torch::lazy::Value& input, std::vector dimensions, - bool keep_reduced_dimensions, c10::optional dtype) + bool keep_reduced_dimensions, std::optional dtype) : XlaNode( torch::lazy::OpKind(at::aten::mean), {input}, [&]() { diff --git a/torch_xla/csrc/ops/mean.h b/torch_xla/csrc/ops/mean.h index 2d73b7f1b54..cdf69ad86e2 100644 --- a/torch_xla/csrc/ops/mean.h +++ b/torch_xla/csrc/ops/mean.h @@ -14,7 +14,7 @@ namespace torch_xla { class Mean : public XlaNode { public: Mean(const torch::lazy::Value& input, std::vector dimensions, - bool keep_reduced_dimensions, c10::optional dtype); + bool keep_reduced_dimensions, std::optional dtype); std::string ToString() const override; @@ -26,12 +26,12 @@ class Mean : public XlaNode { bool keep_reduced_dimensions() const { return keep_reduced_dimensions_; } - const c10::optional& dtype() const { return dtype_; } + const std::optional& dtype() const { return dtype_; } private: std::vector dimensions_; bool keep_reduced_dimensions_; - c10::optional dtype_; + std::optional dtype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/native_dropout.cpp b/torch_xla/csrc/ops/native_dropout.cpp index e56a1beb709..6b4b757fc6f 100644 --- a/torch_xla/csrc/ops/native_dropout.cpp +++ b/torch_xla/csrc/ops/native_dropout.cpp @@ -16,7 +16,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input) { NativeDropout::NativeDropout(const torch::lazy::Value& input, const torch::lazy::Value& seed, float p, - c10::optional train) + std::optional train) : XlaNode( torch::lazy::OpKind(at::aten::native_dropout), {input, seed}, [&]() { return NodeOutputShape(input); }, 2, diff --git a/torch_xla/csrc/ops/native_dropout.h b/torch_xla/csrc/ops/native_dropout.h index edfa0c2fc98..6bac3c859fa 100644 --- a/torch_xla/csrc/ops/native_dropout.h +++ b/torch_xla/csrc/ops/native_dropout.h @@ -11,7 +11,7 @@ namespace torch_xla { class NativeDropout : public XlaNode { public: NativeDropout(const torch::lazy::Value& input, const torch::lazy::Value& seed, - float p, c10::optional train); + float p, std::optional train); torch::lazy::NodePtr Clone(torch::lazy::OpList operands) const override; @@ -19,7 +19,7 @@ class NativeDropout : public XlaNode { private: float p_; - c10::optional train_; + std::optional train_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/ops.cpp b/torch_xla/csrc/ops/ops.cpp index af4daf28648..a7d655c7b67 100644 --- a/torch_xla/csrc/ops/ops.cpp +++ b/torch_xla/csrc/ops/ops.cpp @@ -105,7 +105,7 @@ torch::lazy::NodePtr LogBase(const torch::lazy::Value& input, } torch::lazy::NodePtr Logit(const torch::lazy::Value& input, - c10::optional eps) { + std::optional eps) { auto lower_fn = [eps](const XlaNode& node, LoweringContext* loctx) -> XlaOpVector { xla::XlaOp xla_input = loctx->GetOutputOp(node.operand(0)); @@ -526,8 +526,8 @@ torch::lazy::NodePtr BroadcastTensors( } torch::lazy::NodePtr Norm(const torch::lazy::Value& input, - const c10::optional& p, - 
c10::optional dtype, + const std::optional& p, + std::optional dtype, absl::Span dims, bool keepdim) { torch::lazy::ScopePusher ir_scope(at::aten::norm.toQualString()); auto dimensions = torch::lazy::ToVector(dims); @@ -568,8 +568,8 @@ torch::lazy::NodePtr Norm(const torch::lazy::Value& input, } torch::lazy::NodePtr Pdist_forward(const torch::lazy::Value& input, - const c10::optional& p, - c10::optional dtype) { + const std::optional& p, + std::optional dtype) { // pdist(x, p) is equal to norm(x[:, None]-x, dim=2, p) and we only take the // upper triangle without diagonal line. auto lower_fn = [=](const XlaNode& node, @@ -625,7 +625,7 @@ torch::lazy::NodePtr LinalgVectorNorm(const torch::lazy::Value& input, const at::Scalar& ord, std::vector dimensions, bool keepdim, - c10::optional dtype) { + std::optional dtype) { torch::lazy::ScopePusher ir_scope(at::aten::norm.toQualString()); double ord_value = ord.to(); auto input_shape = GetXlaShape(input); diff --git a/torch_xla/csrc/ops/ops.h b/torch_xla/csrc/ops/ops.h index 5d423b3b1ee..58e153ab12f 100644 --- a/torch_xla/csrc/ops/ops.h +++ b/torch_xla/csrc/ops/ops.h @@ -92,7 +92,7 @@ torch::lazy::NodePtr Exp(const torch::lazy::Value& input); torch::lazy::NodePtr Log(const torch::lazy::Value& input); torch::lazy::NodePtr Logit(const torch::lazy::Value& input, - c10::optional eps); + std::optional eps); torch::lazy::NodePtr LogBase(const torch::lazy::Value& input, torch::lazy::OpKind op, double base); @@ -169,13 +169,13 @@ torch::lazy::NodePtr BroadcastTensors( c10::ArrayRef tensors); torch::lazy::NodePtr Norm(const torch::lazy::Value& input, - const c10::optional& p, - c10::optional dtype, + const std::optional& p, + std::optional dtype, absl::Span dims, bool keepdim); torch::lazy::NodePtr Pdist_forward(const torch::lazy::Value& input, - const c10::optional& p, - c10::optional dtype); + const std::optional& p, + std::optional dtype); torch::lazy::NodePtr PixelShuffle(const torch::lazy::Value& input, int64_t upscale_factor); @@ -184,7 +184,7 @@ torch::lazy::NodePtr LinalgVectorNorm(const torch::lazy::Value& input, const at::Scalar& ord, std::vector dimensions, bool keepdim, - c10::optional dtype); + std::optional dtype); torch::lazy::NodePtr Identity(int64_t lines, int64_t cols, xla::PrimitiveType element_type); diff --git a/torch_xla/csrc/ops/ops_xla_shape_fn.cpp b/torch_xla/csrc/ops/ops_xla_shape_fn.cpp index 9de0afe6b01..aa067efb93e 100755 --- a/torch_xla/csrc/ops/ops_xla_shape_fn.cpp +++ b/torch_xla/csrc/ops/ops_xla_shape_fn.cpp @@ -16,7 +16,7 @@ namespace { template std::vector GetValuesVectorWithOptional( absl::Span values, - absl::Span* const> opt_values) { + absl::Span* const> opt_values) { std::vector result(values.begin(), values.end()); for (auto opt : opt_values) { if (*opt) { @@ -192,7 +192,7 @@ xla::Shape AnyDimOutputShape(const torch::lazy::Value& input, int64_t dim, } xla::Shape ArgmaxOutputShape(const torch::lazy::Value& input, - c10::optional dim, bool keepdim) { + std::optional dim, bool keepdim) { auto lower_for_shape_fn = [&](absl::Span operands) -> xla::XlaOp { if (dim.has_value()) { @@ -208,7 +208,7 @@ xla::Shape ArgmaxOutputShape(const torch::lazy::Value& input, } xla::Shape ArgminOutputShape(const torch::lazy::Value& input, - c10::optional dim, bool keepdim) { + std::optional dim, bool keepdim) { auto lower_for_shape_fn = [&](absl::Span operands) -> xla::XlaOp { if (dim.has_value()) { @@ -295,7 +295,7 @@ xla::Shape BaddbmmOutputShape(const torch::lazy::Value& self, xla::Shape BinaryCrossEntropyOutputShape( const 
torch::lazy::Value& input, const torch::lazy::Value& target, - const c10::optional& weight, int64_t reduction) { + const std::optional& weight, int64_t reduction) { auto lower_for_shape_fn = [&](absl::Span operands) -> xla::XlaOp { absl::optional weight; @@ -316,7 +316,7 @@ xla::Shape BinaryCrossEntropyOutputShape( xla::Shape BinaryCrossEntropyBackwardOutputShape( const torch::lazy::Value& grad_output, const torch::lazy::Value& input, const torch::lazy::Value& target, - const c10::optional& weight, int64_t reduction) { + const std::optional& weight, int64_t reduction) { auto lower_for_shape_fn = [&](absl::Span operands) -> xla::XlaOp { absl::optional weight; @@ -371,8 +371,8 @@ xla::Shape CholeskyOutputShape(const torch::lazy::Value& input, xla::Shape ClampTensorOutputShape( const torch::lazy::Value& input, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { // This shape function works in a bit of an odd/hacky way. // If operands.size() > 1, operands[1] can be either min or // max since they are both optional values. But in this code, diff --git a/torch_xla/csrc/ops/ops_xla_shape_fn.h b/torch_xla/csrc/ops/ops_xla_shape_fn.h index 639edc1679b..a040529833a 100644 --- a/torch_xla/csrc/ops/ops_xla_shape_fn.h +++ b/torch_xla/csrc/ops/ops_xla_shape_fn.h @@ -46,10 +46,10 @@ xla::Shape AminOutputShape(const torch::lazy::Value& input, absl::Span dim, bool keepdim); xla::Shape ArgmaxOutputShape(const torch::lazy::Value& input, - c10::optional dim, bool keepdim); + std::optional dim, bool keepdim); xla::Shape ArgminOutputShape(const torch::lazy::Value& input, - c10::optional dim, bool keepdim); + std::optional dim, bool keepdim); xla::Shape AnyOutputShape(const torch::lazy::Value& input); @@ -75,12 +75,12 @@ xla::Shape BaddbmmOutputShape(const torch::lazy::Value& self, xla::Shape BinaryCrossEntropyOutputShape( const torch::lazy::Value& input, const torch::lazy::Value& target, - const c10::optional& weight, int64_t reduction); + const std::optional& weight, int64_t reduction); xla::Shape BinaryCrossEntropyBackwardOutputShape( const torch::lazy::Value& grad_output, const torch::lazy::Value& input, const torch::lazy::Value& target, - const c10::optional& weight, int64_t reduction); + const std::optional& weight, int64_t reduction); xla::Shape BitwiseAndTensorOutputShape(const torch::lazy::Value& input, const torch::lazy::Value& other); @@ -99,8 +99,8 @@ xla::Shape CholeskyOutputShape(const torch::lazy::Value& input, const bool upper); xla::Shape ClampTensorOutputShape(const torch::lazy::Value& input, - const c10::optional& min, - const c10::optional& max); + const std::optional& min, + const std::optional& max); xla::Shape ClampMaxTensorOutputShape(const torch::lazy::Value& input, const torch::lazy::Value& target); diff --git a/torch_xla/csrc/ops/prod.cpp b/torch_xla/csrc/ops/prod.cpp index a0176b93873..338359e2988 100644 --- a/torch_xla/csrc/ops/prod.cpp +++ b/torch_xla/csrc/ops/prod.cpp @@ -16,7 +16,7 @@ namespace { xla::XlaOp LowerProd(xla::XlaOp input, const std::vector& dimensions, bool keep_reduced_dimensions, - c10::optional dtype) { + std::optional dtype) { xla::XlaOp casted_input; if (dtype) { casted_input = ConvertTo(input, XlaHelpers::TypeOfXlaOp(input), @@ -30,7 +30,7 @@ xla::XlaOp LowerProd(xla::XlaOp input, const std::vector& dimensions, xla::Shape NodeOutputShape(const torch::lazy::Value& input, std::vector& dimensions, bool keep_reduced_dimensions, - c10::optional dtype) { + std::optional dtype) { auto lower_for_shape_fn = 
[&](absl::Span operands) -> xla::XlaOp { return LowerProd(operands[0], dimensions, keep_reduced_dimensions, dtype); @@ -41,7 +41,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input, } // namespace Prod::Prod(const torch::lazy::Value& input, std::vector dimensions, - bool keep_reduced_dimensions, c10::optional dtype) + bool keep_reduced_dimensions, std::optional dtype) : XlaNode( torch::lazy::OpKind(at::aten::prod), {input}, [&]() { diff --git a/torch_xla/csrc/ops/prod.h b/torch_xla/csrc/ops/prod.h index 85e90c06e48..c22e2ec66a7 100644 --- a/torch_xla/csrc/ops/prod.h +++ b/torch_xla/csrc/ops/prod.h @@ -12,7 +12,7 @@ namespace torch_xla { class Prod : public XlaNode { public: Prod(const torch::lazy::Value& input, std::vector dimensions, - bool keep_reduced_dimensions, c10::optional dtype); + bool keep_reduced_dimensions, std::optional dtype); std::string ToString() const override; @@ -24,12 +24,12 @@ class Prod : public XlaNode { bool keep_reduced_dimensions() const { return keep_reduced_dimensions_; } - const c10::optional& dtype() const { return dtype_; } + const std::optional& dtype() const { return dtype_; } private: std::vector dimensions_; bool keep_reduced_dimensions_; - c10::optional dtype_; + std::optional dtype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/softmax.cpp b/torch_xla/csrc/ops/softmax.cpp index e8df927ebc1..241bab41ace 100644 --- a/torch_xla/csrc/ops/softmax.cpp +++ b/torch_xla/csrc/ops/softmax.cpp @@ -12,13 +12,13 @@ namespace torch_xla { namespace { xla::XlaOp LowerSoftmax(xla::XlaOp input, int64_t dim, - const c10::optional& dtype) { + const std::optional& dtype) { xla::XlaOp result = BuildSoftmax(input, dim); return CastToScalarType(result, dtype); } xla::Shape NodeOutputShape(const torch::lazy::Value& input, - const c10::optional& dtype) { + const std::optional& dtype) { if (dtype) { return xla::ShapeUtil::ChangeElementType( GetXlaShape(input), MakeXlaPrimitiveType(*dtype, /*device=*/nullptr)); @@ -29,7 +29,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input, } // namespace Softmax::Softmax(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype) + std::optional dtype) : XlaNode( torch::lazy::OpKind(at::aten::softmax), {input}, [&]() { return NodeOutputShape(input, dtype); }, diff --git a/torch_xla/csrc/ops/softmax.h b/torch_xla/csrc/ops/softmax.h index 5b0ebd072d6..a9fa951fd9d 100644 --- a/torch_xla/csrc/ops/softmax.h +++ b/torch_xla/csrc/ops/softmax.h @@ -11,7 +11,7 @@ namespace torch_xla { class Softmax : public XlaNode { public: Softmax(const torch::lazy::Value& input, int64_t dim, - c10::optional dtype); + std::optional dtype); torch::lazy::NodePtr Clone(torch::lazy::OpList operands) const override; @@ -21,11 +21,11 @@ class Softmax : public XlaNode { int64_t dim() const { return dim_; } - const c10::optional& dtype() const { return dtype_; } + const std::optional& dtype() const { return dtype_; } private: int64_t dim_; - c10::optional dtype_; + std::optional dtype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/ops/sum.cpp b/torch_xla/csrc/ops/sum.cpp index b7c1d2213c8..f24f62da7e3 100644 --- a/torch_xla/csrc/ops/sum.cpp +++ b/torch_xla/csrc/ops/sum.cpp @@ -16,7 +16,7 @@ namespace { xla::XlaOp LowerSum(xla::XlaOp input, absl::Span dimensions, bool keep_reduced_dimensions, - c10::optional dtype) { + std::optional dtype) { return BuildSum(CastToScalarType(input, dtype), dimensions, keep_reduced_dimensions); } @@ -24,7 +24,7 @@ xla::XlaOp LowerSum(xla::XlaOp input, absl::Span dimensions, xla::Shape 
NodeOutputShape(const torch::lazy::Value& input, absl::Span dimensions, bool keep_reduced_dimensions, - c10::optional dtype) { + std::optional dtype) { auto lower_for_shape_fn = [&](absl::Span operands) -> xla::XlaOp { return LowerSum(operands[0], dimensions, keep_reduced_dimensions, dtype); @@ -35,7 +35,7 @@ xla::Shape NodeOutputShape(const torch::lazy::Value& input, } // namespace Sum::Sum(const torch::lazy::Value& input, std::vector dimensions, - bool keep_reduced_dimensions, c10::optional dtype) + bool keep_reduced_dimensions, std::optional dtype) : XlaNode( torch::lazy::OpKind(at::aten::sum), {input}, [&]() { diff --git a/torch_xla/csrc/ops/sum.h b/torch_xla/csrc/ops/sum.h index f92ebc91e44..d66960342b4 100644 --- a/torch_xla/csrc/ops/sum.h +++ b/torch_xla/csrc/ops/sum.h @@ -12,7 +12,7 @@ namespace torch_xla { class Sum : public XlaNode { public: Sum(const torch::lazy::Value& input, std::vector dimensions, - bool keep_reduced_dimensions, c10::optional dtype); + bool keep_reduced_dimensions, std::optional dtype); std::string ToString() const override; @@ -24,12 +24,12 @@ class Sum : public XlaNode { bool keep_reduced_dimensions() const { return keep_reduced_dimensions_; } - const c10::optional& dtype() const { return dtype_; } + const std::optional& dtype() const { return dtype_; } private: std::vector dimensions_; bool keep_reduced_dimensions_; - c10::optional dtype_; + std::optional dtype_; }; } // namespace torch_xla diff --git a/torch_xla/csrc/tensor.cpp b/torch_xla/csrc/tensor.cpp index 3444e9ff6d9..498f3b93536 100644 --- a/torch_xla/csrc/tensor.cpp +++ b/torch_xla/csrc/tensor.cpp @@ -71,7 +71,7 @@ XLATensorPtr XLATensor::Create(const at::Tensor& tensor, XLATensorPtr XLATensor::Create( torch::lazy::BackendDataPtr handle, - c10::optional logical_element_type) { + std::optional logical_element_type) { XLATensorPtr xtensor = c10::make_intrusive( XLATensor(std::move(handle), logical_element_type)); XLAGraphExecutor::Get()->RegisterTensor(xtensor->data()); @@ -80,7 +80,7 @@ XLATensorPtr XLATensor::Create( XLATensorPtr XLATensor::Create( torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type) { + std::optional logical_element_type) { XLATensorPtr xtensor = c10::make_intrusive( XLATensor(std::move(ir_value), device, logical_element_type)); XLAGraphExecutor::Get()->RegisterTensor(xtensor->data()); @@ -93,7 +93,7 @@ XLATensorPtr XLATensor::Create( XLATensorPtr XLATensor::Create( std::shared_ptr view, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type) { + std::optional logical_element_type) { XLATensorPtr xtensor = c10::make_intrusive( XLATensor(std::move(view), device, logical_element_type)); XLAGraphExecutor::Get()->RegisterTensor(xtensor->data()); @@ -109,7 +109,7 @@ XLATensor::XLATensor(const at::Tensor& tensor, : XLATensor(std::make_shared(tensor, device)) {} XLATensor::XLATensor(torch::lazy::BackendDataPtr handle, - c10::optional logical_element_type) + std::optional logical_element_type) : XLATensor(std::make_shared(handle, handle->device(), logical_element_type)) { // if data is sharded we need to carry the sharding spec over. 
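The XLATensor::Create overloads and constructors touched above default their logical_element_type argument to std::nullopt, so callers may omit it, pass an explicit scalar type, or pass std::nullopt outright. A short sketch of that calling pattern follows; the ScalarType enum and Create function here are stand-ins for illustration only, not the real at::ScalarType or XLATensor::Create.

#include <cstdio>
#include <optional>

enum class ScalarType { Float, Long };  // stand-in for at::ScalarType

// Defaulted std::optional parameter, as in the signatures above.
void Create(int payload,
            std::optional<ScalarType> logical_element_type = std::nullopt) {
  if (logical_element_type) {
    std::printf("payload=%d with explicit type %d\n", payload,
                static_cast<int>(*logical_element_type));
  } else {
    std::printf("payload=%d, type derived elsewhere\n", payload);
  }
}

int main() {
  Create(1);                     // argument omitted: empty optional
  Create(2, ScalarType::Float);  // implicit conversion into the optional
  Create(3, std::nullopt);       // explicit "no value"
  return 0;
}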
@@ -123,7 +123,7 @@ XLATensor::XLATensor(torch::lazy::BackendDataPtr handle, XLATensor::XLATensor(torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type) + std::optional logical_element_type) : XLATensor(std::make_shared(std::move(ir_value), device, logical_element_type)) { // Preserve sharding if a new tensor is created from a sharded IR node. @@ -141,7 +141,7 @@ XLATensor::XLATensor(torch::lazy::Value ir_value, XLATensor::XLATensor(std::shared_ptr view, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type) + std::optional logical_element_type) : XLATensor(std::make_shared(std::move(view), device, logical_element_type)) {} @@ -171,7 +171,7 @@ at::ScalarType XLATensor::dtype() const { : MaybeUpcastToHostTorchType(shape().get().element_type()); } -c10::optional XLATensor::dtype_optional() const { +std::optional XLATensor::dtype_optional() const { return data()->logical_element_type; } @@ -322,14 +322,14 @@ void XLATensor::SetXlaData(torch::lazy::BackendDataPtr handle, bool sync) { AssignIrValue(torch::lazy::Value()); if (sync) { data()->view = nullptr; - data()->tensor_data = c10::nullopt; + data()->tensor_data = std::nullopt; } data()->is_cloned = false; } void XLATensor::SetIrValue(torch::lazy::Value ir_value, bool inplace) { data()->handle = nullptr; - data()->tensor_data = c10::nullopt; + data()->tensor_data = std::nullopt; if (data()->view != nullptr && inplace) { // If we have an active view, SetIrValue() happens, and we are // within an in-place execution context, we need to update the view's @@ -383,10 +383,10 @@ torch::lazy::Value XLATensor::GetIrValue() const { AssignIrValue(CreateTensorNode(handle, /*read_only=*/false)); return data()->ir_value; } - c10::optional tensor_data = CurrentTensorData(); + std::optional tensor_data = CurrentTensorData(); XLA_CHECK(tensor_data); AssignIrValue(GetIrValueForTensor(*tensor_data, GetDevice())); - data()->tensor_data = c10::nullopt; + data()->tensor_data = std::nullopt; return data()->ir_value; } @@ -397,9 +397,9 @@ torch::lazy::Value XLATensor::CurrentIrValue() const { return data()->ir_value; } -c10::optional XLATensor::CurrentTensorData() const { +std::optional XLATensor::CurrentTensorData() const { if (data()->view != nullptr && !data()->view->IsUpToDate()) { - return c10::nullopt; + return std::nullopt; } return data()->tensor_data; } @@ -427,7 +427,7 @@ View::IrNode XLATensor::GetViewUpdate(const std::shared_ptr& view) const { View::IrNode ir_value_updated = view->GetViewIrNode(); if (ir_value_updated.updated) { data()->handle = nullptr; - data()->tensor_data = c10::nullopt; + data()->tensor_data = std::nullopt; } return ir_value_updated; } @@ -492,7 +492,7 @@ XLATensorPtr XLATensor::CreateViewTensor(ViewInfo view_info) const { at::Tensor XLATensor::ToTensor(bool detached) { at::Tensor tensor; - c10::optional tensor_data = CurrentTensorData(); + std::optional tensor_data = CurrentTensorData(); if (!tensor_data) { XLAGraphExecutor::Get()->DeviceBarrier(GetDevice()); // The GetXlaData() call will trigger an ApplyPendingGraph() if an IR @@ -510,7 +510,7 @@ at::Tensor XLATensor::ToTensor(bool detached) { data()->view != nullptr) { // If we have other authoritive sources, just drop our reference and // transfer it to the caller. - data()->tensor_data = c10::nullopt; + data()->tensor_data = std::nullopt; } else { // Otherwise we need to make a copy to prevent the caller changing our // version. 
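The tensor.cpp hunks above repeatedly clear data()->tensor_data with std::nullopt and later re-check it through CurrentTensorData(), treating std::optional as a nullable cache slot. The same invalidate-and-recompute idiom in isolation is sketched below; CachedValue is an illustrative type, not part of this codebase.

#include <iostream>
#include <optional>
#include <string>

struct CachedValue {
  std::optional<std::string> cache;

  // Materialize the value on demand if the cache slot is empty.
  const std::string& Get() {
    if (!cache.has_value()) {
      cache = "recomputed";
    }
    return *cache;
  }

  // Same idiom as data()->tensor_data = std::nullopt above.
  void Invalidate() { cache = std::nullopt; }
};

int main() {
  CachedValue v;
  std::cout << v.Get() << "\n";              // prints "recomputed"
  v.Invalidate();
  std::cout << v.cache.has_value() << "\n";  // prints 0
  return 0;
}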
@@ -530,7 +530,7 @@ void XLATensor::ShallowCopyTo(XLATensorPtr dest) const { } void XLATensor::SetScalarType( - c10::optional logical_element_type) { + std::optional logical_element_type) { data()->logical_element_type = logical_element_type; } @@ -586,7 +586,7 @@ std::vector XLATensor::MakeOutputTensors( tensors.push_back(CreateFrom(torch::lazy::Value(node, i))); } else { tensors.push_back(CreateFrom(torch::lazy::Value(node, i), - /*logical_element_type=*/c10::nullopt)); + /*logical_element_type=*/std::nullopt)); } } return tensors; @@ -600,7 +600,7 @@ XLATensorPtr XLATensor::CopyTensorToDevice( torch::lazy::Value XLATensor::MaybeCastIrValue( torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type) const { + std::optional logical_element_type) const { if (!logical_element_type) { logical_element_type = dtype_optional(); } @@ -613,13 +613,13 @@ torch::lazy::Value XLATensor::MaybeCastIrValue( XLATensorPtr XLATensor::CreateFrom(torch::lazy::Value ir_value) const { ir_value = MaybeCastIrValue(std::move(ir_value), GetDevice(), - /*logical_element_type=*/c10::nullopt); + /*logical_element_type=*/std::nullopt); return Create(std::move(ir_value), GetDevice(), dtype_optional()); } XLATensorPtr XLATensor::CreateFrom( torch::lazy::Value ir_value, - c10::optional logical_element_type_opt) const { + std::optional logical_element_type_opt) const { ir_value = MaybeCastIrValue(std::move(ir_value), GetDevice(), logical_element_type_opt); return Create(std::move(ir_value), GetDevice(), logical_element_type_opt); diff --git a/torch_xla/csrc/tensor.h b/torch_xla/csrc/tensor.h index 7a3e0b57dd8..101c6c54b75 100644 --- a/torch_xla/csrc/tensor.h +++ b/torch_xla/csrc/tensor.h @@ -100,7 +100,7 @@ class XLATensor : public torch::lazy::LazyTensor { struct Data : public torch::lazy::LazyTensor::Data { Data(torch::lazy::BackendDataPtr handle, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type, + std::optional logical_element_type, ShardingSpecPtr sharding = nullptr) : torch::lazy::LazyTensor::Data(handle, device), logical_element_type(logical_element_type), @@ -108,7 +108,7 @@ class XLATensor : public torch::lazy::LazyTensor { alias_id = unique_id; } Data(torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type, + std::optional logical_element_type, ShardingSpecPtr sharding = nullptr) : torch::lazy::LazyTensor::Data(ir_value, device), logical_element_type(logical_element_type), @@ -123,7 +123,7 @@ class XLATensor : public torch::lazy::LazyTensor { alias_id = unique_id; } Data(std::shared_ptr view, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type, + std::optional logical_element_type, ShardingSpecPtr sharding = nullptr) : torch::lazy::LazyTensor::Data(device), view(std::move(view)), @@ -136,7 +136,7 @@ class XLATensor : public torch::lazy::LazyTensor { std::shared_ptr view; // TODO: remove this in favor of torch::lazy::Shape within ir_value. - c10::optional logical_element_type; + std::optional logical_element_type; // The user provided sharding spec is attached to `XLATensor::Data` // and all sharding look-up should refer to it as source of truth. 
// A copy of the sharding spec is attached to the IR node via @@ -153,10 +153,10 @@ class XLATensor : public torch::lazy::LazyTensor { const torch::lazy::BackendDevice& device); static XLATensorPtr Create( torch::lazy::BackendDataPtr handle, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); static XLATensorPtr Create( torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); static XLATensorPtr Create(std::shared_ptr data); // Create a new XLA tensor with the same metadata of the input tensor (with @@ -164,7 +164,7 @@ class XLATensor : public torch::lazy::LazyTensor { XLATensorPtr CreateFrom(torch::lazy::Value ir_value) const; XLATensorPtr CreateFrom( torch::lazy::Value ir_value, - c10::optional logical_element_type_opt) const; + std::optional logical_element_type_opt) const; // TODO: We should remove this one once MaybeCastIrValue is no longer needed. XLATensorPtr CreateFrom(torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, @@ -198,10 +198,10 @@ class XLATensor : public torch::lazy::LazyTensor { // Override to use logical_element_type. at::ScalarType dtype() const final; - c10::optional dtype_optional() const; + std::optional dtype_optional() const; // Set logical_element_type which is visible to upstream PyTorch. - void SetScalarType(c10::optional logical_element_type); + void SetScalarType(std::optional logical_element_type); void MarkDynamicDimension(uint32_t dim); // We don't use the upstream shape to provide xla::shape. @@ -231,7 +231,7 @@ class XLATensor : public torch::lazy::LazyTensor { void SetInPlaceIrValue(torch::lazy::Value ir_value); // TODO(alanwaketan): Reuse the upstream one once Functionalization is done. - c10::optional CurrentTensorData() const; + std::optional CurrentTensorData() const; // We don't use the upstream MakeOutputTensors to return XLATensorPtr instead. std::vector MakeOutputTensors( @@ -297,18 +297,18 @@ class XLATensor : public torch::lazy::LazyTensor { private: XLATensor(const at::Tensor& tensor, const torch::lazy::BackendDevice& device); XLATensor(torch::lazy::BackendDataPtr handle, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensor(torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensor(std::shared_ptr view, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensor(std::shared_ptr data); static XLATensorPtr Create( std::shared_ptr view, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); // TODO(alanwaketan): Reuse the upstream one once Functionalization is done. void SetXlaData(torch::lazy::BackendDataPtr handle, bool sync); @@ -322,7 +322,7 @@ class XLATensor : public torch::lazy::LazyTensor { torch::lazy::Value MaybeCastIrValue( torch::lazy::Value ir_value, const torch::lazy::BackendDevice& device, - c10::optional logical_element_type) const; + std::optional logical_element_type) const; // Override to instantiate our own xla data. 
torch::lazy::Value GetIrValueForTensor( diff --git a/torch_xla/csrc/tensor_methods.cpp b/torch_xla/csrc/tensor_methods.cpp index a557f55690d..118e4ab788b 100644 --- a/torch_xla/csrc/tensor_methods.cpp +++ b/torch_xla/csrc/tensor_methods.cpp @@ -172,8 +172,8 @@ torch::lazy::Value MaybeExpand(const torch::lazy::Value& input, } MinMaxValues GetMinMaxValues(const XLATensorPtr& tensor, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { XLA_CHECK(min || max) << "At least one of \'min\' or \'max\' must not be None"; xla::PrimitiveType raw_element_type = XlaTypeFromTorchType(tensor->dtype()); @@ -311,7 +311,7 @@ absl::optional GetOptionalIrValue( ViewInfo CreateAsStridedViewInfo(const xla::Shape& input_shape, std::vector size, std::vector stride, - c10::optional storage_offset) { + std::optional storage_offset) { xla::Shape result_shape = XlaHelpers::GetDynamicReshape(input_shape, size); AsStridedInfo as_strided_info; as_strided_info.stride = std::move(stride); @@ -757,25 +757,25 @@ void __irshift__(XLATensorPtr& input, const XLATensorPtr& other) { } XLATensorPtr __lshift__(const XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { return input->CreateFrom(Lshift(input->GetIrValue(), other), logical_element_type); } XLATensorPtr __lshift__(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { return input->CreateFrom(Lshift(input->GetIrValue(), other->GetIrValue()), logical_element_type); } XLATensorPtr __rshift__(const XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { return input->CreateFrom(Rshift(input->GetIrValue(), other), logical_element_type); } XLATensorPtr __rshift__(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { return input->CreateFrom(Rshift(input->GetIrValue(), other->GetIrValue()), logical_element_type); } @@ -844,7 +844,7 @@ XLATensorPtr abs(const XLATensorPtr& input) { XLATensorPtr add(const XLATensorPtr& input, const XLATensorPtr& other, const at::Scalar& alpha, - c10::optional logical_element_type) { + std::optional logical_element_type) { xla::Shape input_shape = input->shape().get(); xla::Shape other_shape = other->shape().get(); torch::lazy::Value constant; @@ -871,7 +871,7 @@ XLATensorPtr add(const XLATensorPtr& input, const XLATensorPtr& other, XLATensorPtr add(const XLATensorPtr& input, const at::Scalar& other, const at::Scalar& alpha, - c10::optional logical_element_type) { + std::optional logical_element_type) { const torch::lazy::BackendDevice& device = input->GetDevice(); torch::lazy::Value other_constant = XLAGraphExecutor::Get()->GetIrValueForScalar( @@ -906,7 +906,7 @@ void arange_out(XLATensorPtr& out, const at::Scalar& start, XLATensorPtr as_strided(const XLATensorPtr& input, std::vector size, std::vector stride, - c10::optional storage_offset) { + std::optional storage_offset) { // See Note: [Disabling functionalization] if (runtime::sys_util::GetEnvBool("XLA_DISABLE_FUNCTIONALIZATION", false)) { auto input_shape = input->shape(); @@ -920,7 +920,7 @@ XLATensorPtr as_strided(const XLATensorPtr& input, std::vector size, void as_strided_(XLATensorPtr& input, std::vector size, std::vector stride, - c10::optional storage_offset) { + std::optional storage_offset) { if (input->data()->view == 
nullptr) { input->SetIrValue(torch::lazy::MakeNode( input->GetIrValue(), std::move(size), std::move(stride), @@ -1078,12 +1078,12 @@ XLATensorPtr cdist_forward(const XLATensorPtr& x1, const XLATensorPtr& x2, } XLATensorPtr pdist_forward(const XLATensorPtr& input, double p) { - c10::optional dtype = input->dtype_optional(); + std::optional dtype = input->dtype_optional(); return input->CreateFrom(Pdist_forward(input->GetIrValue(), p, dtype)); } XLATensorPtr pixel_shuffle(const XLATensorPtr& input, int64_t upscale_factor) { - c10::optional dtype = input->dtype_optional(); + std::optional dtype = input->dtype_optional(); torch::lazy::NodePtr node = PixelShuffle(input->GetIrValue(), upscale_factor); return input->CreateFrom(node, dtype); } @@ -1097,8 +1097,8 @@ void celu_(XLATensorPtr& input, const at::Scalar& alpha) { } XLATensorPtr clamp(const XLATensorPtr& input, - const c10::optional& min, - const c10::optional& max) { + const std::optional& min, + const std::optional& max) { MinMaxValues min_max = GetMinMaxValues(input, min, max); return input->CreateFrom( Clamp(input->GetIrValue(), min_max.min, min_max.max)); @@ -1177,12 +1177,12 @@ XLATensorPtr count_nonzero(const XLATensorPtr& input, } XLATensorPtr cross(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional dim) { + std::optional dim) { return tensor_ops::Cross(input, other, dim); } XLATensorPtr cumprod(const XLATensorPtr& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { int64_t canonical_dim = torch::lazy::GetCanonicalDimensionIndex(dim, input->shape().get().rank()); if (!dtype) { @@ -1194,7 +1194,7 @@ XLATensorPtr cumprod(const XLATensorPtr& input, int64_t dim, } XLATensorPtr cumsum(const XLATensorPtr& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { int64_t canonical_dim = torch::lazy::GetCanonicalDimensionIndex(dim, input->shape().get().rank()); if (!dtype) { @@ -1238,8 +1238,8 @@ XLATensorPtr diagonal(const XLATensorPtr& input, int64_t offset, int64_t dim1, } XLATensorPtr div(const XLATensorPtr& input, const XLATensorPtr& other, - const c10::optional& rounding_mode, - c10::optional logical_element_type) { + const std::optional& rounding_mode, + std::optional logical_element_type) { at::ScalarType scalar_type = at::typeMetaToScalarType(c10::get_default_dtype()); xla::PrimitiveType input_type = input->shape().get().element_type(); @@ -1435,7 +1435,7 @@ void fill_(XLATensorPtr& input, const at::Scalar& value) { // dynamic dimension hence we need to create a sym_int_elements here. 
SymIntElements sym_int_elements(input->GetIrValue()); torch::lazy::Value constant = XLAGraphExecutor::Get()->GetIrValueForScalar( - value, input->shape(), sym_int_elements, c10::nullopt, + value, input->shape(), sym_int_elements, std::nullopt, input->GetDevice()); input->SetInPlaceIrValue(std::move(constant)); } @@ -1451,13 +1451,13 @@ XLATensorPtr flip(const XLATensorPtr& input, absl::Span dims) { } XLATensorPtr fmod(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { return input->CreateFrom(Fmod(input->GetIrValue(), other->GetIrValue()), logical_element_type); } XLATensorPtr fmod(const XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { torch::lazy::Value constant = XLAGraphExecutor::Get()->GetIrValueForScalar( other, input->shape(), logical_element_type, input->GetDevice()); return input->CreateFrom(Fmod(input->GetIrValue(), constant), @@ -1479,7 +1479,7 @@ XLATensorPtr full(absl::Span size, const at::Scalar& fill_value, XLATensorPtr full_like(const XLATensorPtr& input, const at::Scalar& fill_value, const torch::lazy::BackendDevice& device, - c10::optional scalar_type) { + std::optional scalar_type) { xla::Shape tensor_shape = input->shape(); if (scalar_type) { tensor_shape.set_element_type(MakeXlaPrimitiveType(*scalar_type, &device)); @@ -1696,7 +1696,7 @@ XLATensorPtr lerp(const XLATensorPtr& input, const XLATensorPtr& end, XLATensorPtr linalg_vector_norm(const XLATensorPtr& input, const at::Scalar& ord, std::vector dimensions, bool keep_dim, - c10::optional dtype) { + std::optional dtype) { // If the input is a scalar, we have to manually create the dimensions vector. auto input_rank = input->shape().get().rank(); std::vector canonical_dims; @@ -1732,33 +1732,33 @@ XLATensorPtr linspace(const at::Scalar& start, const at::Scalar& end, } XLATensorPtr log(const XLATensorPtr& input) { - // Here we explictly pass c10::nullopt as logical_element_type because + // Here we explictly pass std::nullopt as logical_element_type because // otherwise result will inherit the input's logical_element_type. In the // case of log(int) -> float, we want to derive the dtype from IR value // instead of input's logical_element_type. return input->CreateFrom( - Log(GetFloatingIrValue(input, at::ScalarType::Float)), c10::nullopt); + Log(GetFloatingIrValue(input, at::ScalarType::Float)), std::nullopt); } -XLATensorPtr logit(const XLATensorPtr& input, c10::optional eps) { - // Here we explictly pass c10::nullopt as logical_element_type because +XLATensorPtr logit(const XLATensorPtr& input, std::optional eps) { + // Here we explictly pass std::nullopt as logical_element_type because // otherwise result will inherit the input's logical_element_type. In the // case of logit(int) -> float, we want to derive the dtype from IR value // instead of input's logical_element_type. return input->CreateFrom( Logit(GetFloatingIrValue(input, at::ScalarType::Float), eps), - c10::nullopt); + std::nullopt); } XLATensorPtr log_base(const XLATensorPtr& input, torch::lazy::OpKind op, double base) { - // Here we explictly pass c10::nullopt as logical_element_type because + // Here we explictly pass std::nullopt as logical_element_type because // otherwise result will inherit the input's logical_element_type. In the // case of logbase(int) -> float, we want to derive the dtype from IR value // instead of input's logical_element_type. 
return input->CreateFrom( LogBase(GetFloatingIrValue(input, at::ScalarType::Float), op, base), - c10::nullopt); + std::nullopt); } XLATensorPtr log_sigmoid(const XLATensorPtr& input) { @@ -1767,7 +1767,7 @@ XLATensorPtr log_sigmoid(const XLATensorPtr& input) { } XLATensorPtr log_softmax(const XLATensorPtr& input, int64_t dim, - c10::optional dtype, + std::optional dtype, std::vector&& shapes) { if (!dtype) { dtype = input->dtype_optional(); @@ -1787,12 +1787,12 @@ XLATensorPtr log_softmax_backward(const XLATensorPtr& grad_output, } XLATensorPtr log1p(const XLATensorPtr& input) { - // Here we explictly pass c10::nullopt as logical_element_type because + // Here we explictly pass std::nullopt as logical_element_type because // otherwise result will inherit the input's logical_element_type. In the // case of log1p(int) -> float, we want to derive the dtype from IR value // instead of input's logical_element_type. return input->CreateFrom( - Log1p(GetFloatingIrValue(input, at::ScalarType::Float)), c10::nullopt); + Log1p(GetFloatingIrValue(input, at::ScalarType::Float)), std::nullopt); } void log1p_(XLATensorPtr& input) { @@ -1811,14 +1811,14 @@ XLATensorPtr logsumexp(const XLATensorPtr& input, } XLATensorPtr xlogy(const XLATensorPtr& input, const XLATensorPtr& other) { - // Here we explictly pass c10::nullopt as logical_element_type because + // Here we explictly pass std::nullopt as logical_element_type because // otherwise result will inherit the input's logical_element_type. In the // case of xlogy(int,int) -> float, we want to derive the dtype from IR value // instead of input's logical_element_type. return input->CreateFrom( XLogY(input->GetIrValue(), GetFloatingIrValue(other, at::ScalarType::Float)), - c10::nullopt); + std::nullopt); } XLATensorPtr lt(const XLATensorPtr& input, const at::Scalar& other) { @@ -1924,7 +1924,7 @@ XLATensorPtr max_unpool(const XLATensorPtr& input, const XLATensorPtr& indices, XLATensorPtr mean(const XLATensorPtr& input, std::vector dimensions, bool keep_reduced_dimensions, - c10::optional dtype) { + std::optional dtype) { if (!dtype) { dtype = input->dtype_optional(); } @@ -1939,7 +1939,7 @@ XLATensorPtr mean(const XLATensorPtr& input, std::vector dimensions, } XLATensorPtr min(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { return input->CreateFrom(Min(input->GetIrValue(), other->GetIrValue()), logical_element_type); } @@ -1982,14 +1982,14 @@ XLATensorPtr mm(const XLATensorPtr& input, const XLATensorPtr& weight) { XLATensorPtr mse_loss(const XLATensorPtr& input, const XLATensorPtr& target, int64_t reduction) { - // Here we explictly pass c10::nullopt as logical_element_type because + // Here we explictly pass std::nullopt as logical_element_type because // otherwise result will inherit the input's logical_element_type. In the // case of mse_loss(long, float16) -> float16, we want to derive the dtype // from IR value instead of input's logical_element_type. 
return input->CreateFrom( torch::lazy::MakeNode(input->GetIrValue(), target->GetIrValue(), GetXlaReductionMode(reduction)), - c10::nullopt); + std::nullopt); } XLATensorPtr mse_loss_backward(const XLATensorPtr& grad_output, @@ -2001,13 +2001,13 @@ XLATensorPtr mse_loss_backward(const XLATensorPtr& grad_output, } XLATensorPtr mul(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { return input->CreateFrom(Mul(input->GetIrValue(), other->GetIrValue()), logical_element_type); } XLATensorPtr mul(const XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type) { + std::optional logical_element_type) { const torch::lazy::BackendDevice& device = input->GetDevice(); torch::lazy::Value constant = XLAGraphExecutor::Get()->GetIrValueForScalar( other, @@ -2138,7 +2138,7 @@ std::tuple native_batch_norm_backward( } std::tuple native_dropout( - const XLATensorPtr& input, double p, c10::optional train) { + const XLATensorPtr& input, double p, std::optional train) { torch::lazy::NodePtr node = torch::lazy::MakeNode( input->GetIrValue(), XLAGraphExecutor::Get()->GetRngSeed(input->GetDevice()), p, train); @@ -2217,8 +2217,8 @@ XLATensorPtr nonzero(const XLATensorPtr& input) { return XLATensor::Create(torch::lazy::Value(node, 0), input->GetDevice()); } -XLATensorPtr norm(const XLATensorPtr& input, const c10::optional& p, - c10::optional dtype, at::IntArrayRef dim, +XLATensorPtr norm(const XLATensorPtr& input, const std::optional& p, + std::optional dtype, at::IntArrayRef dim, bool keepdim) { auto canonical_dims = torch::lazy::GetCanonicalDimensionIndices( XlaHelpers::I64List(dim), input->shape().get().rank()); @@ -2297,7 +2297,7 @@ XLATensorPtr permute(const XLATensorPtr& input, } XLATensorPtr pow(const XLATensorPtr& input, const at::Scalar& exponent, - c10::optional logical_element_type) { + std::optional logical_element_type) { // We want to pass exponent_node as a constant to give XLA more room to // optimize. at::ScalarType type = @@ -2311,7 +2311,7 @@ XLATensorPtr pow(const XLATensorPtr& input, const at::Scalar& exponent, } XLATensorPtr pow(const XLATensorPtr& input, const XLATensorPtr& exponent, - c10::optional logical_element_type) { + std::optional logical_element_type) { at::ScalarType type = logical_element_type ? *logical_element_type @@ -2322,7 +2322,7 @@ XLATensorPtr pow(const XLATensorPtr& input, const XLATensorPtr& exponent, } XLATensorPtr pow(const at::Scalar& input, const XLATensorPtr& exponent, - c10::optional logical_element_type) { + std::optional logical_element_type) { at::ScalarType type = logical_element_type ? 
*logical_element_type @@ -2348,7 +2348,7 @@ std::tuple prelu_backward( XLATensorPtr prod(const XLATensorPtr& input, std::vector dimensions, bool keep_reduced_dimensions, - c10::optional dtype) { + std::optional dtype) { if (!dtype) { dtype = input->dtype_optional(); } @@ -2602,7 +2602,7 @@ XLATensorPtr rrelu_with_noise_backward(const XLATensorPtr& grad_output, XLATensorPtr rsub(const XLATensorPtr& input, const XLATensorPtr& other, const at::Scalar& alpha, - c10::optional logical_element_type) { + std::optional logical_element_type) { const torch::lazy::BackendDevice& device = input->GetDevice(); torch::lazy::Value alpha_xla = XLAGraphExecutor::Get()->GetIrValueForScalar( alpha, @@ -2617,7 +2617,7 @@ XLATensorPtr rsub(const XLATensorPtr& input, const XLATensorPtr& other, XLATensorPtr rsub(const XLATensorPtr& input, const at::Scalar& other, const at::Scalar& alpha, - c10::optional logical_element_type) { + std::optional logical_element_type) { const torch::lazy::BackendDevice& device = input->GetDevice(); torch::lazy::Value other_xla = XLAGraphExecutor::Get()->GetIrValueForScalar( other, @@ -2777,7 +2777,7 @@ XLATensorPtr smooth_l1_loss_backward(const XLATensorPtr& grad_output, } XLATensorPtr softmax(const XLATensorPtr& input, int64_t dim, - c10::optional dtype) { + std::optional dtype) { if (!dtype) { dtype = input->dtype_optional(); } @@ -2919,7 +2919,7 @@ std::tuple std_mean(const XLATensorPtr& input, XLATensorPtr sub(const XLATensorPtr& input, const XLATensorPtr& other, const at::Scalar& alpha, - c10::optional logical_element_type) { + std::optional logical_element_type) { xla::Shape input_shape = input->shape().get(); xla::Shape other_shape = other->shape().get(); torch::lazy::Value alpha_xla; @@ -2946,7 +2946,7 @@ XLATensorPtr sub(const XLATensorPtr& input, const XLATensorPtr& other, XLATensorPtr sub(const XLATensorPtr& input, const at::Scalar& other, const at::Scalar& alpha, - c10::optional logical_element_type) { + std::optional logical_element_type) { torch::lazy::Value other_xla = XLAGraphExecutor::Get()->GetIrValueForScalar( other, input->shape(), logical_element_type, input->GetDevice()); torch::lazy::Value alpha_xla = XLAGraphExecutor::Get()->GetIrValueForScalar( @@ -2958,7 +2958,7 @@ XLATensorPtr sub(const XLATensorPtr& input, const at::Scalar& other, XLATensorPtr sum(const XLATensorPtr& input, std::vector dimensions, bool keep_reduced_dimensions, - c10::optional dtype) { + std::optional dtype) { if (at::isIntegralType(input->dtype(), /*includeBool=*/true) && !dtype) { dtype = at::ScalarType::Long; } else if (!dtype) { @@ -3001,8 +3001,8 @@ XLATensorPtr threshold_backward(const XLATensorPtr& grad_output, } XLATensorPtr to(XLATensorPtr& input, - c10::optional device, - c10::optional scalar_type) { + std::optional device, + std::optional scalar_type) { if (!device) { device = input->GetDevice(); } diff --git a/torch_xla/csrc/tensor_methods.h b/torch_xla/csrc/tensor_methods.h index 11df2c6eb74..2341bac4a68 100644 --- a/torch_xla/csrc/tensor_methods.h +++ b/torch_xla/csrc/tensor_methods.h @@ -169,17 +169,17 @@ void __irshift__(XLATensorPtr& input, const XLATensorPtr& other); XLATensorPtr __lshift__( const XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr __lshift__( const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr __rshift__( const 
XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr __rshift__( const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); std::tuple adaptive_max_pool2d( const XLATensorPtr& input, std::vector output_size); @@ -208,10 +208,10 @@ XLATensorPtr abs(const XLATensorPtr& input); XLATensorPtr add( const XLATensorPtr& input, const XLATensorPtr& other, const at::Scalar& alpha, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr add( const XLATensorPtr& input, const at::Scalar& other, const at::Scalar& alpha, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr addcdiv(const XLATensorPtr& input, const at::Scalar& value, const XLATensorPtr& tensor1, const XLATensorPtr& tensor2); @@ -240,12 +240,12 @@ void arange_out(XLATensorPtr& out, const at::Scalar& start, // into the provided size. XLATensorPtr as_strided(const XLATensorPtr& input, std::vector size, std::vector stride, - c10::optional storage_offset); + std::optional storage_offset); // In-place version of the method above. void as_strided_(XLATensorPtr& input, std::vector size, std::vector stride, - c10::optional storage_offset); + std::optional storage_offset); XLATensorPtr avg_pool_nd(const XLATensorPtr& input, int64_t spatial_dim_count, std::vector kernel_size, @@ -307,11 +307,11 @@ XLATensorPtr celu(const XLATensorPtr& input, const at::Scalar& alpha); void celu_(XLATensorPtr& input, const at::Scalar& alpha); XLATensorPtr clamp(const XLATensorPtr& input, - const c10::optional& min, - const c10::optional& max); + const std::optional& min, + const std::optional& max); XLATensorPtr clamp(const XLATensorPtr& input, - const c10::optional& min, - const c10::optional& max); + const std::optional& min, + const std::optional& max); XLATensorPtr clone(const XLATensorPtr& input); @@ -347,15 +347,15 @@ XLATensorPtr count_nonzero(const XLATensorPtr& input, // If the dimension is not given, it defaults to the first dimension found // with the size 3. XLATensorPtr cross(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional dim); + std::optional dim); // Returns the cumulative product of elements of input in the given dimension. XLATensorPtr cumprod(const XLATensorPtr& input, int64_t dim, - c10::optional dtype); + std::optional dtype); // Returns the cumulative sum of elements of input in the given dimension. XLATensorPtr cumsum(const XLATensorPtr& input, int64_t dim, - c10::optional dtype); + std::optional dtype); // If the input is a matrix (2-D tensor), returns a 1-D tensor with the // diagonal elements of the input. 
If the input is a vector (1-D tensor), @@ -369,8 +369,8 @@ XLATensorPtr diagonal(const XLATensorPtr& input, int64_t offset, int64_t dim1, XLATensorPtr div( const XLATensorPtr& input, const XLATensorPtr& other, - const c10::optional& rounding_mode = c10::nullopt, - c10::optional logical_element_type = c10::nullopt); + const std::optional& rounding_mode = std::nullopt, + std::optional logical_element_type = std::nullopt); XLATensorPtr div(const XLATensorPtr& input, const at::Scalar& other); // A generalized contraction between tensors of arbitrary dimension defined by @@ -427,17 +427,17 @@ XLATensorPtr flip(const XLATensorPtr& input, absl::Span dims); XLATensorPtr fmod( const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr fmod( const XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr full(absl::Span size, const at::Scalar& fill_value, const torch::lazy::BackendDevice& device, at::ScalarType scalar_type); XLATensorPtr full_like(const XLATensorPtr& input, const at::Scalar& fill_value, const torch::lazy::BackendDevice& device, - c10::optional scalar_type); + std::optional scalar_type); XLATensorPtr full_symint(at::SymIntArrayRef sym_size, const at::Scalar& fill_value, const torch::lazy::BackendDevice& device, @@ -535,7 +535,7 @@ XLATensorPtr lerp(const XLATensorPtr& input, const XLATensorPtr& end, XLATensorPtr linalg_vector_norm(const XLATensorPtr& input, const at::Scalar& ord, std::vector dimensions, bool keep_dim, - c10::optional dtype); + std::optional dtype); XLATensorPtr linspace(const at::Scalar& start, const at::Scalar& end, const int64_t steps, at::ScalarType element_type, @@ -543,7 +543,7 @@ XLATensorPtr linspace(const at::Scalar& start, const at::Scalar& end, XLATensorPtr log(const XLATensorPtr& input); -XLATensorPtr logit(const XLATensorPtr& input, c10::optional eps); +XLATensorPtr logit(const XLATensorPtr& input, std::optional eps); XLATensorPtr log_base(const XLATensorPtr& input, torch::lazy::OpKind op, double base); @@ -551,7 +551,7 @@ XLATensorPtr log_base(const XLATensorPtr& input, torch::lazy::OpKind op, XLATensorPtr log_sigmoid(const XLATensorPtr& input); XLATensorPtr log_softmax(const XLATensorPtr& input, int64_t dim, - c10::optional dtype, + std::optional dtype, std::vector&& shapes); XLATensorPtr log_softmax_backward(const XLATensorPtr& grad_output, @@ -609,11 +609,11 @@ XLATensorPtr max_unpool_backward(const XLATensorPtr& grad_output, XLATensorPtr mean(const XLATensorPtr& input, std::vector dimensions, bool keep_reduced_dimensions, - c10::optional dtype); + std::optional dtype); XLATensorPtr min( const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr min(const XLATensorPtr& input); @@ -636,10 +636,10 @@ XLATensorPtr mse_loss_backward(const XLATensorPtr& grad_output, XLATensorPtr mul( const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr mul( const XLATensorPtr& input, const at::Scalar& other, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr multinomial(const XLATensorPtr& input, int64_t num_samples, bool replacement); @@ -670,7 
+670,7 @@ std::tuple native_batch_norm_backward( const XLATensorPtr& save_invstd, bool training, double eps); std::tuple native_dropout( - const XLATensorPtr& input, double p, c10::optional train); + const XLATensorPtr& input, double p, std::optional train); XLATensorPtr ne(const XLATensorPtr& input, const at::Scalar& other); @@ -705,8 +705,8 @@ XLATensorPtr nms(const XLATensorPtr& boxes, const XLATensorPtr& scores, XLATensorPtr nonzero(const XLATensorPtr& input); -XLATensorPtr norm(const XLATensorPtr& input, const c10::optional& p, - c10::optional dtype, at::IntArrayRef dim, +XLATensorPtr norm(const XLATensorPtr& input, const std::optional& p, + std::optional dtype, at::IntArrayRef dim, bool keepdim); XLATensorPtr normal(double mean, const XLATensorPtr& std); @@ -727,13 +727,13 @@ XLATensorPtr permute(const XLATensorPtr& input, absl::Span dims); XLATensorPtr pow( const XLATensorPtr& input, const at::Scalar& exponent, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr pow( const XLATensorPtr& input, const XLATensorPtr& exponent, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr pow( const at::Scalar& input, const XLATensorPtr& exponent, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr prelu(const XLATensorPtr& input, const XLATensorPtr& weight); @@ -743,7 +743,7 @@ std::tuple prelu_backward( XLATensorPtr prod(const XLATensorPtr& input, std::vector dimensions, bool keep_reduced_dimensions, - c10::optional dtype); + std::optional dtype); void put_(XLATensorPtr& input, const XLATensorPtr& index, const XLATensorPtr& source, bool accumulate); @@ -815,10 +815,10 @@ XLATensorPtr rrelu_with_noise_backward(const XLATensorPtr& grad_output, XLATensorPtr rsub( const XLATensorPtr& input, const XLATensorPtr& other, const at::Scalar& alpha, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr rsub( const XLATensorPtr& input, const at::Scalar& other, const at::Scalar& alpha, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); void copy_(XLATensorPtr& input, XLATensorPtr& src); @@ -864,7 +864,7 @@ XLATensorPtr smooth_l1_loss_backward(const XLATensorPtr& grad_output, int64_t reduction, double beta); XLATensorPtr softmax(const XLATensorPtr& input, int64_t dim, - c10::optional dtype); + std::optional dtype); XLATensorPtr softmax_backward(const XLATensorPtr& grad_output, const XLATensorPtr& output, int64_t dim); @@ -915,14 +915,14 @@ std::tuple std_mean(const XLATensorPtr& input, XLATensorPtr sub( const XLATensorPtr& input, const XLATensorPtr& other, const at::Scalar& alpha, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr sub( const XLATensorPtr& input, const at::Scalar& other, const at::Scalar& alpha, - c10::optional logical_element_type = c10::nullopt); + std::optional logical_element_type = std::nullopt); XLATensorPtr sum(const XLATensorPtr& input, std::vector dimensions, bool keep_reduced_dimensions, - c10::optional dtype); + std::optional dtype); std::tuple svd( const XLATensorPtr& input, bool some, bool compute_uv); @@ -938,8 +938,8 @@ XLATensorPtr threshold_backward(const XLATensorPtr& grad_output, const XLATensorPtr& input, float threshold); XLATensorPtr to(XLATensorPtr& input, - c10::optional 
device, - c10::optional scalar_type); + std::optional device, + std::optional scalar_type); std::tuple topk(const XLATensorPtr& input, int64_t k, int64_t dim, diff --git a/torch_xla/csrc/tensor_ops.cpp b/torch_xla/csrc/tensor_ops.cpp index a66ee923475..676ec730bbc 100644 --- a/torch_xla/csrc/tensor_ops.cpp +++ b/torch_xla/csrc/tensor_ops.cpp @@ -26,7 +26,7 @@ XLATensorPtr IndexAcrossDims(const XLATensorPtr& input, int64_t dim, } // namespace XLATensorPtr Cross(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional dim) { + std::optional dim) { int64_t canonical_dim; if (dim) { canonical_dim = torch::lazy::GetCanonicalDimensionIndex( diff --git a/torch_xla/csrc/tensor_ops.h b/torch_xla/csrc/tensor_ops.h index 20a345f2936..ea35570cdc2 100644 --- a/torch_xla/csrc/tensor_ops.h +++ b/torch_xla/csrc/tensor_ops.h @@ -12,7 +12,7 @@ namespace torch_xla { namespace tensor_ops { XLATensorPtr Cross(const XLATensorPtr& input, const XLATensorPtr& other, - c10::optional dim); + std::optional dim); XLATensorPtr MakeMatrixWithDiagonal(const XLATensorPtr& input, int64_t diagonal); diff --git a/torch_xla/csrc/torch_util.h b/torch_xla/csrc/torch_util.h index c0768adad15..567fa0550b0 100644 --- a/torch_xla/csrc/torch_util.h +++ b/torch_xla/csrc/torch_util.h @@ -67,8 +67,8 @@ at::Tensor UnwrapNumber(const at::Tensor& tensor, at::ScalarType dtype); // only unwrap tensors that are functional. So, nothing needs to be done there. at::Tensor MaybeWrapTensorToFunctional(const at::Tensor& tensor); -// Checks whether a c10::optional is defined. -inline bool IsDefined(const c10::optional& tensor) { +// Checks whether a std::optional is defined. +inline bool IsDefined(const std::optional& tensor) { return tensor.has_value() && tensor.value().defined(); } diff --git a/torch_xla/csrc/xla_backend_impl.cpp b/torch_xla/csrc/xla_backend_impl.cpp index 8b003772f2a..20d1b4be6a6 100644 --- a/torch_xla/csrc/xla_backend_impl.cpp +++ b/torch_xla/csrc/xla_backend_impl.cpp @@ -90,7 +90,7 @@ class XlaBackendImpl : public torch::lazy::BackendImplInterface { at::Tensor MakeTensorFromComputationData( const torch::lazy::BackendDataPtr data, - c10::optional logical_scalar_type) const override { + std::optional logical_scalar_type) const override { // TODO(JackCaoG): handle the logical_scalar_type == nullptr case return XlaDataToTensors({data}, {*logical_scalar_type})[0]; } diff --git a/torch_xla/csrc/xla_graph_executor.cpp b/torch_xla/csrc/xla_graph_executor.cpp index 61141189980..55507050a7e 100644 --- a/torch_xla/csrc/xla_graph_executor.cpp +++ b/torch_xla/csrc/xla_graph_executor.cpp @@ -311,7 +311,7 @@ torch::lazy::Value XLAGraphExecutor::GetIrValueForScalar( torch::lazy::Value XLAGraphExecutor::GetIrValueForScalar( const at::Scalar& value, const xla::Shape& shape, - c10::optional logical_element_type, + std::optional logical_element_type, const torch::lazy::BackendDevice& device) { xla::PrimitiveType type = logical_element_type @@ -323,7 +323,7 @@ torch::lazy::Value XLAGraphExecutor::GetIrValueForScalar( torch::lazy::Value XLAGraphExecutor::GetIrValueForScalar( const at::Scalar& value, const xla::Shape& shape, SymIntElements size_elements, - c10::optional logical_element_type, + std::optional logical_element_type, const torch::lazy::BackendDevice& device) { xla::PrimitiveType primitive_type = logical_element_type @@ -597,7 +597,7 @@ void XLAGraphExecutor::ClearPendingIrs( } tensors[i]->AssignIrValue(torch::lazy::Value()); tensors[i]->data()->view = nullptr; - tensors[i]->data()->tensor_data = c10::nullopt; + 
tensors[i]->data()->tensor_data = std::nullopt; } } } @@ -673,7 +673,7 @@ XLAGraphExecutor::SyncTensorCollection XLAGraphExecutor::CollectSyncTensors( } else if (config.force_ltc_data) { // The tensor only has at::Tensor data. We need to queue it for a // device upload. - c10::optional tensor_data = tensors[i]->CurrentTensorData(); + std::optional tensor_data = tensors[i]->CurrentTensorData(); XLA_CHECK(tensor_data); at_tensors.push_back(*tensor_data); shardings.push_back(tensors[i]->sharding_spec()); @@ -997,7 +997,7 @@ std::vector XLAGraphExecutor::SetTensorData( // of ExtractIRAndPrepareXlaData_ to overlap with previous execution. tensor->data()->handle = handle; tensor->data()->view = nullptr; - tensor->data()->tensor_data = c10::nullopt; + tensor->data()->tensor_data = std::nullopt; tensor->data()->is_cloned = false; } tensors_data.emplace_back(std::move(handle)); @@ -1056,7 +1056,7 @@ std::vector XLAGraphExecutor::FetchTensors( ++literals_index; ++sync_index; } else { - c10::optional tensor_data = + std::optional tensor_data = (*tensors)[i]->CurrentTensorData(); if (tensor_data) { results.push_back(*tensor_data); diff --git a/torch_xla/csrc/xla_graph_executor.h b/torch_xla/csrc/xla_graph_executor.h index b2b76b8ae33..3baf7d83063 100644 --- a/torch_xla/csrc/xla_graph_executor.h +++ b/torch_xla/csrc/xla_graph_executor.h @@ -80,12 +80,12 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor { const torch::lazy::BackendDevice& device); torch::lazy::Value GetIrValueForScalar( const at::Scalar& value, const xla::Shape& shape, - c10::optional logical_element_type, + std::optional logical_element_type, const torch::lazy::BackendDevice& device); torch::lazy::Value GetIrValueForScalar( const at::Scalar& value, const xla::Shape& shape, SymIntElements size_elements, - c10::optional logical_element_type, + std::optional logical_element_type, const torch::lazy::BackendDevice& device); // Override to use our own DeviceContextArena. diff --git a/torch_xla/csrc/xla_lower_util.cpp b/torch_xla/csrc/xla_lower_util.cpp index 33c5492b46b..03cfc95777b 100644 --- a/torch_xla/csrc/xla_lower_util.cpp +++ b/torch_xla/csrc/xla_lower_util.cpp @@ -515,7 +515,7 @@ xla::XlaOp BuildDropout(xla::XlaOp input, float probability, xla::XlaOp seed) { std::vector BuildNativeDropout(xla::XlaOp input, xla::XlaOp seed, float probability, - c10::optional train) { + std::optional train) { const xla::Shape& shape = ShapeHelper::ShapeOfXlaOp(input); if (!train.has_value() || *train) { xla::XlaOp prob = XlaHelpers::ScalarBroadcast(1 - probability, shape, diff --git a/torch_xla/csrc/xla_lower_util.h b/torch_xla/csrc/xla_lower_util.h index 400c8a51731..8fef6096803 100644 --- a/torch_xla/csrc/xla_lower_util.h +++ b/torch_xla/csrc/xla_lower_util.h @@ -46,7 +46,7 @@ xla::XlaOp BuildDropout(xla::XlaOp input, float probability, xla::XlaOp seed); std::vector BuildNativeDropout(xla::XlaOp input, xla::XlaOp seed, float probability, - c10::optional train); + std::optional train); xla::XlaOp BuildSigmoidBackward(xla::XlaOp grad_output, xla::XlaOp output, xla::XlaOp scalar_1);
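
The reduction entry points changed above (cumsum, cumprod, mean, prod, sum, softmax) all follow the same fallback: when the caller omits dtype, the std::optional parameter defaults to std::nullopt and is then filled from the tensor's own dtype_optional(). The following is a minimal, self-contained sketch of that fallback under stated assumptions; Dtype, dtype_optional, and reduce are illustrative stand-ins, not the real torch_xla symbols.

#include <iostream>
#include <optional>

enum class Dtype { Float, Long };  // stand-in for at::ScalarType

// Stand-in for XLATensor::dtype_optional(): may or may not carry a logical type.
std::optional<Dtype> dtype_optional() { return Dtype::Float; }

// Mirrors the tensor_methods.cpp pattern: an omitted dtype defaults to
// std::nullopt and is then filled from the tensor itself.
Dtype reduce(std::optional<Dtype> dtype = std::nullopt) {
  if (!dtype) {  // std::optional converts to bool in conditions, like c10::optional
    dtype = dtype_optional();
  }
  return dtype.value_or(Dtype::Float);
}

int main() {
  std::cout << static_cast<int>(reduce()) << "\n";             // 0: fell back to the tensor's dtype
  std::cout << static_cast<int>(reduce(Dtype::Long)) << "\n";  // 1: explicit dtype wins
  return 0;
}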
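
Several lowerings above (log, logit, log1p, xlogy, mse_loss) deliberately pass std::nullopt as logical_element_type so the result dtype is derived from the lowered IR value (for example log(int) -> float) rather than inherited from the input. The sketch below illustrates only that distinction; ir_dtype and create_from are hypothetical stand-ins, not the XLATensor::CreateFrom API.

#include <iostream>
#include <optional>
#include <string>

// Stand-in for the dtype carried by an IR value after lowering.
std::string ir_dtype() { return "float"; }

// Stand-in for CreateFrom(ir_value, logical_element_type): an engaged optional
// overrides the result dtype, while std::nullopt lets the IR value decide.
std::string create_from(std::optional<std::string> logical_element_type) {
  return logical_element_type.value_or(ir_dtype());
}

int main() {
  std::cout << create_from("int") << "\n";         // inherited from the input: "int"
  std::cout << create_from(std::nullopt) << "\n";  // derived from the IR value: "float"
  return 0;
}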
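
torch_util.h's IsDefined keeps the same two-level check after the rename: the std::optional must be engaged and the wrapped tensor must itself be defined, and assigning std::nullopt (as SetTensorData and ClearPendingIrs now do for tensor_data) disengages it again. The sketch below reproduces that logic with a stand-in Tensor type in place of at::Tensor.

#include <cassert>
#include <optional>

// Stand-in for at::Tensor: default-constructed tensors are "undefined".
struct Tensor {
  bool has_storage = false;
  bool defined() const { return has_storage; }
};

// Same shape as torch_util.h's IsDefined: the optional must hold a value AND
// the tensor inside it must be defined.
inline bool IsDefined(const std::optional<Tensor>& tensor) {
  return tensor.has_value() && tensor.value().defined();
}

int main() {
  std::optional<Tensor> t;   // disengaged optional
  assert(!IsDefined(t));
  t = Tensor{};              // engaged, but the tensor is undefined
  assert(!IsDefined(t));
  t = Tensor{true};          // engaged and defined
  assert(IsDefined(t));
  t = std::nullopt;          // clearing, as in ClearPendingIrs/SetTensorData
  assert(!IsDefined(t));
  return 0;
}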