-
Notifications
You must be signed in to change notification settings - Fork 0
/
Function.m
210 lines (185 loc) · 8.49 KB
/
Function.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
//go:build darwin
// +build darwin
// The process in this file largely follows the structure detailed in
// https://developer.apple.com/documentation/metal/performing_calculations_on_a_gpu.
#include "Cache.h"
#include "Error.h"
#import <Metal/Metal.h>
extern id<MTLDevice> device;
// Structure of various metal resources needed to execute a computational
// process on the GPU. We have to bundle this in a header that cgo doesn't
// import because of a bug in LLVM that leads to a compilation error of "struct
// size calculation error off=8 bytesize=0".
typedef struct {
  // Reference to the compiled metal function, looked up by name in the
  // library built from the caller-supplied source.
  id<MTLFunction> function;
  // Executable pipeline state compiled from function; this is what the GPU
  // actually runs.
  id<MTLComputePipelineState> pipeline;
  // Queue used to submit command buffers for this function to the GPU.
  id<MTLCommandQueue> commandQueue;
} _function;
// Set up a new pipeline for executing the specified function in the provided
// MTL code on the default GPU. This returns an Id that must be used to run the
// function. This should be called only once for every function. If any error is
// encountered initializing the metal function, this returns 0 and sets an error
// message in error.
int function_new(const char *metalCode, const char *funcName,
                 const char **error) {
  if (strlen(metalCode) == 0) {
    logError(error, @"Missing metal code");
    return 0;
  }
  if (strlen(funcName) == 0) {
    logError(error, @"Missing function name");
    return 0;
  }

  // Set up a new function object to hold the various resources for the
  // pipeline.
  _function *function = malloc(sizeof(_function));
  if (function == nil) {
    logError(error, @"Failed to initialize function");
    return 0;
  }

  // Create a new library of metal code, which will be used to get a
  // reference to the function we want to run on the GPU. Normally, we
  // would use newDefaultLibrary here to automatically create a library from
  // all the .metal files in this package. However, because cgo doesn't have
  // that functionality, we need to use newLibraryWithSource:options:error
  // instead and supply the code to the new library directly.
  NSError *libraryError = nil;
  id<MTLLibrary> library =
      [device newLibraryWithSource:[NSString stringWithUTF8String:metalCode]
                           options:[MTLCompileOptions new]
                             error:&libraryError];
  if (library == nil) {
    logError(error, [NSString stringWithFormat:@"Failed to create library: %@",
                                               libraryError]);
    // Don't leak the allocation on the error path.
    free(function);
    return 0;
  }

  // Get a reference to the function in the code that's now in the new library.
  // (Note that this is not executable yet. We need a pipeline in order to run
  // this function.)
  function->function =
      [library newFunctionWithName:[NSString stringWithUTF8String:funcName]];
  if (function->function == nil) {
    logError(error, [NSString stringWithFormat:@"Failed to find function '%s'",
                                               funcName]);
    free(function);
    return 0;
  }

  // Convert the function object we just created into a pipeline so we can
  // run the function. A pipeline contains the actual instructions/steps
  // that the GPU uses to execute the code.
  NSError *pipelineError = nil;
  function->pipeline =
      [device newComputePipelineStateWithFunction:function->function
                                            error:&pipelineError];
  if (function->pipeline == nil) {
    logError(error, [NSString stringWithFormat:@"Failed to create pipeline: %@",
                                               pipelineError]);
    free(function);
    return 0;
  }

  // Set up a command queue. This is what sends the work to the GPU.
  function->commandQueue = [device newCommandQueue];
  if (function->commandQueue == nil) {
    logError(error, @"Failed to set up command queue");
    free(function);
    return 0;
  }

  // Save the function for later use and return an Id referencing it. On
  // success, ownership of the allocation transfers to the cache.
  return cache_cache(function, error);
}
// Execute the computational process on the GPU. Each buffer is supplied as an
// argument to the metal code in the same order as the buffer Ids here. This is
// not thread-safe. If any error is encountered running the metal function, this
// returns false and sets an error message in error.
_Bool function_run(int functionId, int width, int height, int depth,
                   float *inputs, int numInputs, int *bufferIds,
                   int numBufferIds, const char **error) {
  // Fetch the function from the cache.
  _function *function = cache_retrieve(functionId, error);
  if (function == nil) {
    logError(error, @"Failed to retrieve function");
    return false;
  }

  // Create a command buffer from the command queue in the pipeline. This will
  // hold the processing commands and move through the queue to the GPU.
  id<MTLCommandBuffer> commandBuffer = [function->commandQueue commandBuffer];
  if (commandBuffer == nil) {
    logError(error, @"Failed to set up command buffer");
    return false;
  }

  // Set up an encoder to write the (compute pass) commands and parameters to
  // the command buffer we just created.
  id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
  if (encoder == nil) {
    logError(error, @"Failed to set up compute encoder");
    return false;
  }

  // Set the pipeline that the encoder will use.
  [encoder setComputePipelineState:function->pipeline];

  // Set the arguments that will be passed to the function. The indexes for the
  // arguments here need to match their order in the function declaration. We'll
  // start with the arguments that are static values, and then we'll add the
  // buffers. We currently support using only the entire buffer without any
  // offsets, which could be used to, say, use one part of a buffer for one
  // function argument and the other part for a different argument.
  int index = 0;
  for (int i = 0; i < numInputs; i++) {
    // Add the static argument bytes to the encoder at the appropriate index.
    [encoder setBytes:&inputs[i] length:sizeof(float) atIndex:index++];
  }
  for (int i = 0; i < numBufferIds; i++) {
    // Retrieve the buffer for this Id.
    id<MTLBuffer> buffer = cache_retrieve(bufferIds[i], error);
    if (buffer == nil) {
      logError(error,
               [NSString stringWithFormat:@"Failed to retrieve buffer %d/%d",
                                          i + 1, numBufferIds]);
      // Metal requires that encoding be ended before the encoder goes away,
      // so close it out before bailing.
      [encoder endEncoding];
      return false;
    }
    // Add the buffer to the encoder at the appropriate index.
    [encoder setBuffer:buffer offset:0 atIndex:index++];
  }

  // Specify how many threads we need to perform all the calculations (one
  // thread per calculation).
  MTLSize gridSize = MTLSizeMake(width, height, depth);

  // Figure out how many threads will be grouped together into each threadgroup.
  // There are two variables that are important here:
  //
  //   pipeline.threadExecutionWidth:
  //       Maximum number of threads that the GPU can execute at one time in
  //       parallel (aka thread warp size, aka SIMD group size)
  //   pipeline.maxTotalThreadsPerThreadgroup:
  //       Maximum number of threads that can be bundled together into a
  //       threadgroup
  //
  // We're going to divide the threads conceptually into two dimensions and then
  // place them into a 3-dimensional grid with no height. The first dimension
  // will be the number of threads that can run at one time (the thread warp
  // size). The second dimension will be the maximum number of parallel thread
  // bundles.
  //
  // For more details on threads, grids, and threadgroup sizes, see
  // https://developer.apple.com/documentation/metal/compute_passes/calculating_threadgroup_and_grid_sizes.
  NSUInteger w = function->pipeline.threadExecutionWidth;
  NSUInteger h = function->pipeline.maxTotalThreadsPerThreadgroup / w;
  MTLSize threadgroupSize = MTLSizeMake(w, h, 1);

  // Set the grid into the encoder. (With this method, we don't need to
  // calculate the number of threadgroups for the grid.)
  [encoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];

  // Mark that we're done encoding the buffer and can proceed with executing the
  // function.
  [encoder endEncoding];

  // Commit the command buffer to the command queue so that it gets picked up
  // and run on the GPU, and then wait for the calculations to finish.
  [commandBuffer commit];
  [commandBuffer waitUntilCompleted];

  return true;
}
// Get the name of the metal function with the provided function Id, or nil on
// error.
const char *function_name(int functionId) {
  // Look the function up in the cache; if it was never registered (or the Id
  // is stale), report the failure and bail out.
  _function *cached = cache_retrieve(functionId, nil);
  if (!cached) {
    logError(nil, @"Failed to retrieve function");
    return nil;
  }
  // Hand back the UTF-8 view of the MTLFunction's name.
  return [[cached->function name] UTF8String];
}