Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion xla/backends/gpu/autotuner/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -1005,12 +1005,12 @@ cc_library(
"//xla/service/gpu:gpu_conv_runner",
"//xla/service/gpu:stream_executor_util",
"//xla/stream_executor:device_address",
"//xla/stream_executor:device_address_allocator",
"//xla/stream_executor:dnn",
"//xla/stream_executor:engine_options",
"//xla/stream_executor:scratch_allocator",
"//xla/stream_executor:stream",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/tsl/platform:errors",
"//xla/tsl/platform:status_macros",
"//xla/tsl/platform:statusor",
Expand Down Expand Up @@ -1180,6 +1180,7 @@ xla_test(
"//xla/stream_executor:device_description_proto_cc",
"//xla/stream_executor:platform",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/stream_executor/rocm:rocm_platform_id",
"//xla/tsl/lib/core:status_test_util",
"//xla/tsl/platform:statusor",
Expand Down Expand Up @@ -1212,6 +1213,7 @@ xla_test(
"//xla/stream_executor:platform",
"//xla/stream_executor:platform_manager",
"//xla/stream_executor:stream_executor_h",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/stream_executor/platform:platform_object_registry",
"//xla/tsl/platform:statusor",
"@com_google_absl//absl/strings",
Expand Down
12 changes: 6 additions & 6 deletions xla/backends/gpu/autotuner/autotuner_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,16 @@ absl::Status Autotune(HloModule& module) {
DebugOptions debug_options = GetDebugOptionsFromFlags();
Compiler::GpuTargetConfig target_config(stream_executor);

std::unique_ptr<se::DeviceAddressAllocator> allocator =
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
stream_executor);

mlir::MLIRContext mlir_context;
xla::RegisterSymbolicExprStorage(&mlir_context);
TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<CodegenBackend>> backends,
gpu_compiler->GetAutotunerBackends(
stream_executor, &target_config, alias_info.get(),
debug_options, &mlir_context));

std::unique_ptr<se::DeviceAddressAllocator> allocator =
std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
stream_executor);
stream_executor, allocator.get(), &target_config,
alias_info.get(), debug_options, &mlir_context));

tsl::thread::ThreadPool thread_pool(tsl::Env::Default(), "autotuner",
tsl::port::MaxParallelism());
Expand Down
3 changes: 2 additions & 1 deletion xla/backends/gpu/autotuner/factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ namespace gpu {
// returned.
struct GetCodegenBackends {
using Type = std::function<std::vector<std::unique_ptr<CodegenBackend>>(
stream_executor::StreamExecutor*, const DebugOptions*, Compiler*,
stream_executor::StreamExecutor*,
stream_executor::DeviceAddressAllocator*, const DebugOptions*, Compiler*,
const Compiler::GpuTargetConfig*, const AliasInfo* alias_info,
mlir::MLIRContext* mlir_context,
absl::Span<const autotuner::Backend> backend_allowlist)>;
Expand Down
1 change: 1 addition & 0 deletions xla/backends/gpu/autotuner/factory_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ std::unique_ptr<HloPassPipeline> GetCustomKernelRewriterPipeline(

std::vector<std::unique_ptr<CodegenBackend>> GetCodegenBackendsForCuda(
stream_executor::StreamExecutor* stream_executor,
stream_executor::DeviceAddressAllocator* device_allocator,
const DebugOptions* debug_options, Compiler* compiler,
const Compiler::GpuTargetConfig* target_config, const AliasInfo* alias_info,
MLIRContext* mlir_context,
Expand Down
6 changes: 4 additions & 2 deletions xla/backends/gpu/autotuner/factory_rocm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,17 @@ using ::mlir::MLIRContext;

std::vector<std::unique_ptr<CodegenBackend>> GetCodegenBackendsForROCm(
stream_executor::StreamExecutor* stream_executor,
stream_executor::DeviceAddressAllocator* device_allocator,
const DebugOptions* debug_options, Compiler* compiler,
const Compiler::GpuTargetConfig* target_config, const AliasInfo* alias_info,
MLIRContext* mlir_context,
absl::Span<const autotuner::Backend> backend_allowlist) {
std::vector<std::unique_ptr<CodegenBackend>> backends;
backends.push_back(std::make_unique<TritonBackend>(
debug_options, compiler, target_config, alias_info, mlir_context));
backends.push_back(std::make_unique<MIOpenBackend>(
stream_executor, debug_options, compiler, target_config));
backends.push_back(
std::make_unique<MIOpenBackend>(stream_executor, debug_options, compiler,
target_config, device_allocator));
backends.push_back(std::make_unique<RocblasBackend>(
stream_executor, debug_options, compiler, target_config));
backends.push_back(std::make_unique<HipblasLtBackend>(
Expand Down
11 changes: 7 additions & 4 deletions xla/backends/gpu/autotuner/factory_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ limitations under the License.
#include "xla/stream_executor/platform/platform_object_registry.h"
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_memory_allocator.h"
#include "xla/tsl/platform/statusor.h"

namespace xla {
Expand All @@ -55,6 +56,7 @@ class FactoryTest : public xla::HloHardwareIndependentTestBase,
se::StreamExecutor* stream_executor_;
Compiler::GpuTargetConfig target_config_;
DebugOptions debug_options_;
se::StreamExecutorMemoryAllocator allocator_;

FactoryTest()
: platform_(se::PlatformManager::PlatformWithName(
Expand All @@ -63,7 +65,8 @@ class FactoryTest : public xla::HloHardwareIndependentTestBase,
.value()),
compiler_(xla::Compiler::GetForPlatform(platform_->id()).value()),
stream_executor_(platform_->ExecutorForDevice(0).value()),
target_config_(stream_executor_) {}
target_config_(stream_executor_),
allocator_(stream_executor_) {}
};

TEST_P(FactoryTest, GetCodegenBackends) {
Expand All @@ -81,9 +84,9 @@ TEST_P(FactoryTest, GetCodegenBackends) {
AliasInfo alias_info;
xla::RegisterSymbolicExprStorage(&mlir_context);
std::vector<std::unique_ptr<CodegenBackend>> backends =
get_codegen_backends(stream_executor_, &debug_options_, compiler_.get(),
&target_config_, &alias_info, &mlir_context,
GetParam().names);
get_codegen_backends(stream_executor_, &allocator_, &debug_options_,
compiler_.get(), &target_config_, &alias_info,
&mlir_context, GetParam().names);
EXPECT_EQ(backends.size(), GetParam().expected_num_backends);
} else {
GTEST_SKIP() << "Skipping test for platform " << platform_->id();
Expand Down
36 changes: 17 additions & 19 deletions xla/backends/gpu/autotuner/miopen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ limitations under the License.
#include "xla/stream_executor/scratch_allocator.h"
#include "xla/stream_executor/stream.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_memory_allocator.h"
#include "xla/tsl/platform/errors.h"
#include "xla/tsl/platform/statusor.h"
#include "xla/tsl/protobuf/dnn.pb.h"
Expand Down Expand Up @@ -80,9 +79,8 @@ bool IsCustomCallToDnnFusedConvolution(const HloInstruction& hlo) {
absl::Status ApplyConfigAndUpdateWorkspaceInOutputTuple(
HloInstruction& instr, const MIOpenBackendConfig& config) {
HloComputation* computation = instr.parent();
std::vector<Shape> new_call_element_shapes;
absl::InlinedVector<Shape, 2> new_call_element_shapes;
// Add the shapes of the outputs of the convolution.
new_call_element_shapes.reserve(instr.shape().tuple_shapes().size() - 1);
for (int i = 0; i < instr.shape().tuple_shapes().size() - 1; ++i) {
new_call_element_shapes.emplace_back(instr.shape().tuple_shapes(i));
}
Expand All @@ -102,8 +100,7 @@ absl::Status ApplyConfigAndUpdateWorkspaceInOutputTuple(
*cudnn_conv_config->mutable_algorithm() = config;
TF_RETURN_IF_ERROR(new_call->set_backend_config(gpu_backend_config));

std::vector<HloInstruction*> new_tuple_elements;
new_tuple_elements.reserve(new_call->shape().tuple_shapes().size() - 1);
absl::InlinedVector<HloInstruction*, 2> new_tuple_elements;
for (int i = 0; i < new_call->shape().tuple_shapes().size() - 1; ++i) {
new_tuple_elements.emplace_back(
computation->AddInstruction(HloInstruction::CreateGetTupleElement(
Expand Down Expand Up @@ -268,6 +265,7 @@ absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
GetConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,
const HloModule* module,
se::StreamExecutor* stream_executor,
se::DeviceAddressAllocator* allocator,
se::Stream* stream) {
CHECK(instr->custom_call_target() != kCudnnConvForwardGraphCallTarget);
ASSIGN_OR_RETURN(GpuConvConfig gpu_conv_config, GetGpuConvConfig(instr));
Expand All @@ -279,10 +277,10 @@ GetConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,
se::dnn::DataType output_type,
GetDNNDataTypeFromPrimitiveType(gpu_conv_config.output_type));
se::dnn::DnnSupport* dnn = stream_executor->AsDnn();
se::StreamExecutorMemoryAllocator allocator(stream_executor);
std::unique_ptr<se::Stream> owned_stream;
if (stream == nullptr) {
TF_ASSIGN_OR_RETURN(stream,
allocator.GetStream(stream_executor->device_ordinal()));
TF_ASSIGN_OR_RETURN(owned_stream, stream_executor->CreateStream());
stream = owned_stream.get();
}
bool allow_tf32 = absl::c_all_of(
instr->precision_config().operand_precision(),
Expand All @@ -291,8 +289,8 @@ GetConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,
allow_tf32,
/*require_command_buffer=*/false};

se::OwningScratchAllocator<> scratch_allocator(
stream_executor->device_ordinal(), &allocator);
se::OwningScratchAllocator<4> scratch_allocator(
stream_executor->device_ordinal(), allocator);

const auto initialize_buffer = [stream](se::DeviceAddressBase buffer) {
// Although we don't have evidence this matters, zero out the buffers
Expand All @@ -302,18 +300,16 @@ GetConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,
return stream->MemZero(&buffer, buffer.size());
};

std::vector<se::DeviceAddressBase> operand_buffers;
operand_buffers.reserve(instr->operand_count());
absl::InlinedVector<se::DeviceAddressBase, 2> operand_buffers;
for (const auto* operand : instr->operands()) {
ASSIGN_OR_RETURN(auto buffer, scratch_allocator.AllocateBytes(
ShapeUtil::ByteSizeOf(operand->shape())));
RETURN_IF_ERROR(initialize_buffer(buffer));
operand_buffers.push_back(buffer);
}

std::vector<se::DeviceAddressBase> result_buffers;
size_t result_buffers_count = instr->shape().tuple_shapes().size();
result_buffers.reserve(result_buffers_count);
absl::InlinedVector<se::DeviceAddressBase, 1> result_buffers;
size_t result_buffers_count = instr->shape().tuple_shapes().size() - 1;
for (int i = 0; i < result_buffers_count; ++i) {
ASSIGN_OR_RETURN(auto buffer,
scratch_allocator.AllocateBytes(ShapeUtil::ByteSizeOf(
Expand Down Expand Up @@ -351,7 +347,8 @@ GetConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,
absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
GetFusedConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,
const HloModule* module,
se::StreamExecutor* stream_executor) {
se::StreamExecutor* stream_executor,
se::DeviceAddressAllocator* allocator) {
ASSIGN_OR_RETURN(GpuConvConfig gpu_conv_config, GetGpuConvConfig(instr));
ASSIGN_OR_RETURN(se::dnn::DataType input_type,
GetDNNDataTypeFromPrimitiveType(gpu_conv_config.input_type));
Expand Down Expand Up @@ -409,7 +406,7 @@ GetFusedConvolutionCustomCallConfigs(const HloCustomCallInstruction* instr,

return GetConvolutionCustomCallConfigs(
static_cast<HloCustomCallInstruction*>(new_conv.get()), module,
stream_executor, owned_stream.get());
stream_executor, allocator, owned_stream.get());
}

absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
Expand All @@ -418,7 +415,8 @@ MIOpenBackend::GetSupportedConfigs(const HloInstruction& instr) {
auto custom_call_instr = Cast<HloCustomCallInstruction>(&instr);
if (IsCustomCallToDnnFusedConvolution(*custom_call_instr)) {
return GetFusedConvolutionCustomCallConfigs(
custom_call_instr, custom_call_instr->GetModule(), stream_executor());
custom_call_instr, custom_call_instr->GetModule(), stream_executor(),
allocator_);
}

if (do_not_autotune_) {
Expand All @@ -430,7 +428,7 @@ MIOpenBackend::GetSupportedConfigs(const HloInstruction& instr) {

return GetConvolutionCustomCallConfigs(
custom_call_instr, custom_call_instr->GetModule(), stream_executor(),
/* stream */ nullptr);
allocator_, /* stream */ nullptr);
}
return std::vector<std::unique_ptr<BackendConfig>>();
}
Expand Down
8 changes: 6 additions & 2 deletions xla/backends/gpu/autotuner/miopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ limitations under the License.
#include "xla/backends/gpu/autotuner/gpu_codegen_backend.h"
#include "xla/hlo/ir/hlo_instruction.h"
#include "xla/service/compiler.h"
#include "xla/stream_executor/device_address_allocator.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/xla.pb.h"

Expand All @@ -37,10 +38,12 @@ class MIOpenBackend : public GpuCodegenBackend {
public:
explicit MIOpenBackend(stream_executor::StreamExecutor* stream_executor,
const DebugOptions* debug_options, Compiler* compiler,
const Compiler::GpuTargetConfig* target_config)
const Compiler::GpuTargetConfig* target_config,
stream_executor::DeviceAddressAllocator* allocator)
: GpuCodegenBackend(autotuner::Backend::MIOPEN, debug_options, compiler,
target_config, stream_executor),
do_not_autotune_(debug_options->xla_gpu_autotune_level() == 0) {}
do_not_autotune_(debug_options->xla_gpu_autotune_level() == 0),
allocator_(allocator) {}

absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
GetSupportedConfigs(const HloInstruction& instr) override;
Expand All @@ -54,6 +57,7 @@ class MIOpenBackend : public GpuCodegenBackend {
private:
bool IsSupported(const HloInstruction& instr) override;
bool do_not_autotune_;
stream_executor::DeviceAddressAllocator* allocator_;
};

} // namespace gpu
Expand Down
5 changes: 4 additions & 1 deletion xla/backends/gpu/autotuner/miopen_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ limitations under the License.
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/rocm/rocm_platform_id.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_memory_allocator.h"
#include "xla/tsl/lib/core/status_test_util.h"
#include "xla/tsl/platform/statusor.h"
#include "xla/tsl/protobuf/dnn.pb.h"
Expand Down Expand Up @@ -77,6 +78,7 @@ class MIOpenBackendTest : public HloHardwareIndependentTestBase {
AMDGPUCompiler compiler_;
se::StreamExecutor* stream_executor_;
Compiler::GpuTargetConfig target_config_;
se::StreamExecutorMemoryAllocator allocator_;
MIOpenBackend backend_;

MIOpenBackendTest()
Expand All @@ -85,13 +87,14 @@ class MIOpenBackendTest : public HloHardwareIndependentTestBase {
->ExecutorForDevice(0)
.value()),
target_config_(stream_executor_),
allocator_(stream_executor_),
backend_(
stream_executor_,
[](auto& opts) {
opts.set_xla_gpu_autotune_level(1);
return &opts;
}(debug_options_),
&compiler_, &target_config_) {}
&compiler_, &target_config_, &allocator_) {}

bool IsRocm() {
return stream_executor_->GetPlatform()->id() == se::rocm::kROCmPlatformId;
Expand Down
11 changes: 6 additions & 5 deletions xla/service/gpu/gpu_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3225,8 +3225,8 @@ absl::Status GpuCompiler::AddConvAndGemmAutotuningPass(
HloCostAnalysis::ShapeSizeFunction shape_size_fn) {
TF_ASSIGN_OR_RETURN(
std::vector<std::unique_ptr<CodegenBackend>> backends,
GetAutotunerBackends(stream_exec, target_config, alias_info,
debug_options, mlir_context));
GetAutotunerBackends(stream_exec, options.device_allocator, target_config,
alias_info, debug_options, mlir_context));

bool do_not_autotune_cublas =
debug_options.xla_gpu_experimental_disable_binary_libraries() ||
Expand Down Expand Up @@ -3282,6 +3282,7 @@ absl::Status GpuCompiler::AddConvAndGemmAutotuningPass(
absl::StatusOr<std::vector<std::unique_ptr<CodegenBackend>>>
GpuCompiler::GetAutotunerBackends(
se::StreamExecutor* stream_exec,
se::DeviceAddressAllocator* device_allocator,
const Compiler::GpuTargetConfig* target_config, const AliasInfo* alias_info,
const DebugOptions& debug_options, mlir::MLIRContext* mlir_context) {
std::vector<autotuner::Backend> autotune_backends;
Expand Down Expand Up @@ -3328,9 +3329,9 @@ GpuCompiler::GetAutotunerBackends(
auto& registry = stream_executor::PlatformObjectRegistry::GetGlobalRegistry();
TF_ASSIGN_OR_RETURN(const GetCodegenBackends::Type& get_codegen_backends,
registry.FindObject<GetCodegenBackends>(PlatformId()));
std::vector<std::unique_ptr<CodegenBackend>> backends =
get_codegen_backends(stream_exec, &debug_options, this, target_config,
alias_info, mlir_context, autotune_backends);
std::vector<std::unique_ptr<CodegenBackend>> backends = get_codegen_backends(
stream_exec, device_allocator, &debug_options, this, target_config,
alias_info, mlir_context, autotune_backends);
return backends;
}

Expand Down
2 changes: 2 additions & 0 deletions xla/service/gpu/gpu_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ limitations under the License.
#include "xla/service/hlo_cost_analysis.h"
#include "xla/service/hlo_module_config.h"
#include "xla/service/llvm_compiler.h"
#include "xla/stream_executor/device_address_allocator.h"
#include "xla/stream_executor/device_description.h"
#include "xla/stream_executor/device_description.pb.h"
#include "xla/stream_executor/dnn.h"
Expand Down Expand Up @@ -150,6 +151,7 @@ class GpuCompiler : public LLVMCompiler {

absl::StatusOr<std::vector<std::unique_ptr<CodegenBackend>>>
GetAutotunerBackends(se::StreamExecutor* stream_exec,
se::DeviceAddressAllocator* device_allocator,
const Compiler::GpuTargetConfig* target_config,
const AliasInfo* alias_info,
const DebugOptions& debug_options,
Expand Down
Loading