// HILA: memory_pool2.cpp
///////////////////////////////////////////
/// gpu_malloc.cpp - simple list-based alloc program for cuda/hip

#include "plumbing/defs.h"
#include "plumbing/lattice.h"
#include "plumbing/field.h"
#include "plumbing/backend_gpu/defs.h"
#include <list>
#include <iomanip>
10
///////////////////////////////////////////////////////////////////////
/// GPU memory manager
/// Allocates a slab of memory, which it then splits out in blocks as requested.
/// On free, merges adjoining free blocks to a larger block.
/// Optimized for relatively large allocations and a roughly "fifo"-type
/// alloc/free cycle.
///
/// Allocates first the GPU_MEMORY_POOL_FRACTION part of the GPU memory.
/// If this becomes full, tries to allocate half of the remaining memory.
/// NOTE: do not allocate all of the memory on the GPU; for example FFTs need
/// their own memory.
///
/// Implements 3 doubly linked lists:
///   memory blocks allocated (ordered, all blocks are here)
///   free memory blocks
///   in-use memory blocks
///////////////////////////////////////////////////////////////////////

#if defined(GPU_MEMORY_POOL)

#ifndef GPU_MEMORY_POOL_FRACTION
#define GPU_MEMORY_POOL_FRACTION 0.2
#endif

#if !defined(CUDA) && !defined(HIP)
static_assert(0 && "HIP or CUDA must be defined");
#endif

// keep a relatively large minimum allocation
#define ALLOC_ALIGNMENT 256
41
// Use doubly 2-directional linked list to keep track:
// next/prev link a block within either the free list or the in-use list,
// up/down order all blocks by device address ("up" = higher address).
struct block {
    void *ptr;          // start address of the memory area this block describes
    block *next, *prev; // links within the free list or the in-use list
    block *up, *down;   // address-ordered links over all blocks
    size_t size;        // byte size of the area (multiple of ALLOC_ALIGNMENT)
    bool is_free;       // true when the block is on the free list
};
50
51static size_t max_used_size = 0;
52static size_t current_used_size = 0;
53static size_t total_pool_size = 0;
54static size_t n_allocs = 0;
55static size_t free_list_size;
56static size_t gpu_total_memory = 0;
57
58static double free_list_avg_size = 0;
59static double free_list_avg_search = 0;
60
61static block *memory_blocks; // keeps track of all of memory
62static block *free_blocks;
63static block *in_use_blocks;
64
65static block *unused_blocks;
66
67//////////////////////////////////////////////////////////////////////////
68// Memory block descriptor manager
69// make some memory alloc tracking blocks - last.up == nullptr
70block *alloc_more_block_descriptors() {
71 block *p;
72 int n_blocks = 1000;
73
74 p = (block *)memalloc(n_blocks * sizeof(block));
75
76 for (int i = 0; i < n_blocks - 1; i++) {
77 p[i].up = &p[i + 1];
78 }
79
80 p[n_blocks - 1].up = nullptr;
81
82 return p;
83}
84
85block *get_block_descriptor() {
86 if (unused_blocks == nullptr) {
87 unused_blocks = alloc_more_block_descriptors();
88 }
89 block *ret = unused_blocks;
90 unused_blocks = unused_blocks->up;
91 ret->ptr = nullptr;
92 ret->up = ret->down = ret->next = ret->prev = nullptr;
93 return ret;
94}
95
96void release_block_descriptor(block *p) {
97 p->up = unused_blocks;
98 unused_blocks = p;
99}
100
101/////////////////////////////////////////////////////////////////////////
102// Alloc memory slabs
103/////////////////////////////////////////////////////////////////////////
104void *gpu_memory_allocate(size_t m_alloc) {
105 const int kb = 1024;
106 const int mb = kb * kb;
107
108 // ensure size is multiple of alignment
109 if (m_alloc % ALLOC_ALIGNMENT != 0)
110 m_alloc = m_alloc - m_alloc % ALLOC_ALIGNMENT + ALLOC_ALIGNMENT;
111
112 double fraction = (double)m_alloc / gpu_total_memory;
113
114 hila::out0 << "GPU memory: allocating " << m_alloc / mb << " MB out of total "
115 << gpu_total_memory / mb << "(" << (int)(fraction * 100) << "%)\n";
116 total_pool_size += m_alloc;
117
118 void *b = nullptr;
119
120#ifndef HILAPP
121#if defined(CUDA)
122 GPU_CHECK(cudaMalloc(&b, m_alloc));
123#elif defined(HIP)
124 GPU_CHECK(hipMalloc(&b, m_alloc));
125#endif
126#endif
127
128 return b;
129}
130
/////////////////////////////////////////////////////////////////////////
// Init memory; allocate the slab.
// Queries the device for its total memory, allocates the first
// GPU_MEMORY_POOL_FRACTION part of it, and sets up the three lists:
// one huge free block, empty in-use list.
/////////////////////////////////////////////////////////////////////////
void gpu_memory_pool_init() {

#ifndef HILAPP
#if defined(CUDA)
    cudaDeviceProp props;
    int my_device;
    GPU_CHECK(cudaGetDevice(&my_device));
    GPU_CHECK(cudaGetDeviceProperties(&props, my_device));
#elif defined(HIP)
    hipDeviceProp_t props;
    int my_device;
    GPU_CHECK(hipGetDevice(&my_device));
    GPU_CHECK(hipGetDeviceProperties(&props, my_device));
#endif

    // total physical memory on the active device
    gpu_total_memory = props.totalGlobalMem;
#endif // HILAPP

    // first slab: fraction of the total device memory
    size_t m_alloc = gpu_total_memory * GPU_MEMORY_POOL_FRACTION;
    // ensure size is multiple of alignment
    m_alloc = m_alloc - m_alloc % ALLOC_ALIGNMENT + ALLOC_ALIGNMENT;

    block *b = get_block_descriptor();
    memory_blocks = b;

    b->ptr = gpu_memory_allocate(m_alloc);

    b->up = b->down = b->next = b->prev = nullptr;
    b->size = m_alloc;
    b->is_free = true;

    // one huge block free
    free_blocks = b;
    in_use_blocks = nullptr;
    free_list_size = 1;
    max_used_size = 0;
    current_used_size = 0;
}
172
173//////////////////////////////////////////////////////////////////////
174// Manage free and in-use lists: remove_from_list and
175// insert_to_list_head. New list member is at the head, anticipating
176// reuse soon
177
178void remove_from_list(block *p, block **head) {
179 if (p->next != nullptr)
180 p->next->prev = p->prev;
181 if (p->prev != nullptr)
182 p->prev->next = p->next;
183 else
184 *head = p->next;
185}
186
187void insert_to_list_head(block *p, block **head) {
188 if (*head != nullptr)
189 (*head)->prev = p;
190 p->next = *head;
191 p->prev = nullptr;
192 *head = p;
193}
194
195///////////////////////////////////////////////////////////////////////
196// Manage active block lists:
197
198// Merge block with free block below. If orig block was free, remove
199// from free list
200void merge_block_down_free(block *p) {
201 block *pdown = p->down;
202 pdown->size += p->size;
203 pdown->up = p->up;
204 if (p->up != nullptr)
205 p->up->down = pdown;
206
207 if (p->is_free) {
208 // remove from free list
209 remove_from_list(p, &free_blocks);
210 free_list_size--;
211 }
212
213 release_block_descriptor(p);
214}
215
216// Merge block with free block above.
217void merge_block_up_free(block *p) {
218 block *pup = p->up;
219 pup->size += p->size;
220 pup->ptr = p->ptr; // set the ptr to base
221 pup->down = p->down;
222 if (p->down != nullptr)
223 p->down->up = pup;
224 else
225 memory_blocks = pup;
226
227 if (p->is_free) {
228 remove_from_list(p, &free_blocks);
229 free_list_size--;
230 }
231
232 release_block_descriptor(p);
233}
234
235// split free block, insert new block above p and return it
236// return the upper slice of the free block
237block *split_free_block(block *p, size_t req_size) {
238 block *b = get_block_descriptor();
239 b->up = p->up;
240 b->down = p;
241 p->up = b;
242 if (b->up != nullptr)
243 b->up->down = b;
244
245 // set data pointer as appropriate
246 b->ptr = static_cast<char *>(p->ptr) + p->size - req_size;
247 b->size = req_size;
248 p->size -= req_size;
249
250 b->is_free = false;
251 // Free list pointers remain OK (p is there)
252
253 return b;
254}
255
256// Make memory block free, insert to free list
257void mark_block_free(block *p) {
258 p->is_free = true;
259 insert_to_list_head(p, &free_blocks);
260 free_list_size++;
261}
262
263// Add block to the tail of block list
264void add_block_to_top(block *p) {
265 p->up = nullptr;
266 if (memory_blocks == nullptr) {
267 memory_blocks = p;
268 p->down = nullptr;
269 } else {
270 block *b;
271 for (b = memory_blocks; b->up != nullptr; b = b->up)
272 ;
273 b->up = p;
274 p->down = b;
275 }
276}
277
278/////////////////////////////////////////////////////////////////////
279// Memory allocator; similar interface to cudaMalloc
280/////////////////////////////////////////////////////////////////////
281
282void gpu_memory_pool_alloc(void **ret, size_t req_size) {
283
284 static bool initialized = false;
285
286 if (!initialized) {
287 initialized = true;
288 gpu_memory_pool_init();
289 }
290
291 gpuStreamSynchronize(0);
292
293 // make req_size to be multiple of alignment
294 size_t align_mod = req_size % ALLOC_ALIGNMENT;
295 if (align_mod > 0)
296 req_size = req_size - align_mod + ALLOC_ALIGNMENT;
297
298 // hila::out0 << "REQ SIZE " << req_size << '\n';
299
300 n_allocs++;
301 free_list_avg_size += free_list_size;
302
303 // do we have free stuff? Simple linear search - list should not be too large
304 bool found_match = false;
305 int steps = 0;
306 block *ptr = free_blocks;
307 for (block *p = free_blocks; p != nullptr; p = p->next) {
308 steps++;
309 if (p->size == req_size) {
310 found_match = true; // perfect match, use it
311 ptr = p;
312 break;
313 }
314
315 if (p->size > req_size) {
316 // find smallest free block which is OK
317 if (!found_match || ptr->size > p->size) {
318 ptr = p;
319 }
320 found_match = true;
321 }
322 }
323
324 free_list_avg_search += steps;
325
326 // got it, split a piece out of it
327 if (found_match) {
328 if (ptr->size > req_size) {
329 ptr = split_free_block(ptr, req_size);
330 } else {
331 // now ptr->size == req_size
332 // rm from free list
333 remove_from_list(ptr, &free_blocks);
334 free_list_size--;
335 ptr->is_free = false;
336 }
337
338 // move to in-use list
339 insert_to_list_head(ptr, &in_use_blocks);
340
341 current_used_size += req_size;
342 if (current_used_size > max_used_size)
343 max_used_size = current_used_size;
344
345 *ret = ptr->ptr;
346 return;
347
348 } else {
349 // try to allocate more?
350 if (total_pool_size < (1.0 - 1.5 * GPU_MEMORY_POOL_FRACTION) * gpu_total_memory) {
351 size_t m_alloc = GPU_MEMORY_POOL_FRACTION * gpu_total_memory;
352 m_alloc = (m_alloc > req_size) ? m_alloc : req_size;
353 // leave 10% of total memory
354 if (m_alloc + total_pool_size < 0.9 * gpu_total_memory) {
355 // put an "empty" block as a separator (non-mergeable)
356 block *p = get_block_descriptor();
357 p->size = 0;
358 p->is_free = false;
359 add_block_to_top(p);
360
361 // and new memory block
362 p = get_block_descriptor();
363 p->ptr = gpu_memory_allocate(m_alloc);
364 total_pool_size += m_alloc;
365 p->size = m_alloc;
366 add_block_to_top(p);
367 mark_block_free(p);
368
369 gpu_memory_pool_alloc(ret, req_size);
370 return;
371 }
372 }
373 }
374
375 hila::out << "MPI rank " << hila::myrank() << ": out of memory in GPU pool, request size "
376 << req_size << ", current pool size " << total_pool_size << std::endl;
378}
379
380//////////////////////////////////////////////////////////////////////
381// And release memory. Pointer must be exactly the same!
382//////////////////////////////////////////////////////////////////////
383
384void gpu_memory_pool_free(void *ptr) {
385
386 // search the list for the memory block
387 for (block *f = in_use_blocks; f != nullptr; f = f->next) {
388 if (f->ptr == ptr) {
389 // found the allocation, remove from in_use
390 remove_from_list(f, &in_use_blocks);
391
392 current_used_size -= f->size;
393
394 // Are neighbour blocks also free?
395 block *down = f->down;
396 block *up = f->up;
397 if (down != nullptr && down->is_free) {
398 merge_block_down_free(f);
399 if (up != nullptr && up->is_free) {
400 merge_block_down_free(up);
401 }
402 } else if (up != nullptr && up->is_free) {
403 merge_block_up_free(f);
404 } else {
405 // no merging now
406 mark_block_free(f);
407 }
408
409 return;
410 }
411 }
412
413 // did not find! serious error, quit
414 hila::out << "Memory free error - unknown pointer " << ptr << '\n';
416}
417
/// Release free memory to the system - avoids extra allocations.
/// NOTE: intentionally a no-op in this pool implementation; allocated
/// slabs are kept for the lifetime of the program.
void gpu_memory_pool_purge() {}
420
421void gpu_memory_pool_report() {
422 if (hila::myrank() == 0) {
423 hila::out << "\nGPU Memory pool statistics from node 0:\n";
424 hila::out << " Total pool size " << ((double)total_pool_size) / (1024 * 1024) << " MB\n";
425 hila::out << " # of allocations " << n_allocs << '\n';
426 hila::out << " Average free list search " << free_list_avg_search / n_allocs
427 << " steps\n";
428 hila::out << " Average free list size " << free_list_avg_size / n_allocs << " items\n";
429 hila::out << " Maximum memory use " << max_used_size / (1024 * 1024) << " MB\n\n";
430 }
431}
432
433#endif // GPU_MEMORY_POOL
// Cross-reference notes (from the generated documentation):
//   defs.h  - defines all includes for HILA
//   field.h - contains definitions for the Field class and the classes
//             required to define it
//   int hila::myrank()          - rank of this node (com_mpi.cpp:235)
//   std::ostream hila::out      - the default output stream
//   std::ostream hila::out0     - writes output only from main process (node 0)
//   void hila::terminate(int status)