HILA
Loading...
Searching...
No Matches
timing.cpp
1
2#include <time.h>
3#include <chrono>
4
5#include "defs.h"
6
7//////////////////////////////////////////////////////////////////
8// Time related routines (runtime - timing - timelimit)
9// Check timing.h for details
10//////////////////////////////////////////////////////////////////
11
12#include "com_mpi.h"
13
14// these includes need to be outside namespace hila
15#include <csignal>
16#include <cstring>
17
18namespace hila {
19
20// This stores the start time of the program
21static double start_time = -1.0;
22
23/////////////////////////////////////////////////////////////////
24/// Timer routines - for high-resolution event timing
25/////////////////////////////////////////////////////////////////
26
27// store all timers in use
28std::vector<timer *> timer_list = {};
29
30// initialize timer to this timepoint
31void timer::init(const char *tag) {
32 if (tag != nullptr)
33 label = tag;
34 reset();
35 // Store it on 1st use!
36}
37
38// remove the timer also from the list
39void timer::remove() {
40 for (auto it = timer_list.begin(); it != timer_list.end(); ++it) {
41 if (*it == this) {
42 timer_list.erase(it);
43 return;
44 }
45 }
46}
47
48void timer::reset() {
49 t_start = t_total = 0.0;
50 count = 0;
51 is_on = is_error = false;
52}
53
54void timer::error() {
55 if (!is_error) {
56 hila::out0 << " **** Timer '" << label
57 << "' error, unbalanced start/stop. Removing from statistics\n";
58 }
59 is_error = true;
60}
61
62double timer::start() {
63 if (is_on)
64 error();
65 is_on = true;
66
67 // Move storing the timer ptr here, because if timer is initialized
68 // in the global scope the timer_list is possibly initialized later!
69 if (count == 0) {
70 timer_list.push_back(this);
71 }
72
73#ifdef GPU_SYNCHRONIZE_TIMERS
74 gpuStreamSynchronize(0);
75#endif
76
77 t_start = hila::gettime();
78 return t_start;
79}
80
81double timer::stop() {
82 if (!is_on)
83 error();
84 is_on = false;
85
86#ifdef GPU_SYNCHRONIZE_TIMERS
87 gpuStreamSynchronize(0);
88#endif
89
90 double e = hila::gettime();
91 t_total += (e - t_start);
92 count++;
93 return e;
94}
95
96timer_value timer::value() {
97 timer_value r;
98 r.time = t_total;
99 r.count = count;
100 return r;
101}
102
103void timer::report(bool print_not_timed) {
104 if (hila::myrank() == 0) {
105 char line[202];
106
107 // time used during the counter activity
108 double ttime = gettime();
109 if (count > 0 && !is_error) {
110 if (t_total / count > 0.1) {
111 std::snprintf(line, 200, "%-20s: %14.5f %14ld %12.5f s %9.6f\n", label.c_str(),
112 t_total, (long)count, t_total / count, t_total / ttime);
113 } else if (t_total / count > 1e-4) {
114 std::snprintf(line, 200, "%-20s: %14.5f %14ld %12.5f ms %9.6f\n", label.c_str(),
115 t_total, (long)count, 1e3 * t_total / count, t_total / ttime);
116 } else {
117 std::snprintf(line, 200, "%-20s: %14.5f %14ld %12.5f μs %9.6f\n", label.c_str(),
118 t_total, (long)count, 1e6 * t_total / count, t_total / ttime);
119 }
120 hila::out << line;
121 } else if (!is_error && print_not_timed) {
122 std::snprintf(line, 200, "%-20s: no timed calls made\n", label.c_str());
123 hila::out << line;
124 } else if (is_error) {
125 std::snprintf(line, 200, "%-20s: error:unbalanced start/stop\n", label.c_str());
126 hila::out << line;
127 }
128 }
129}
130
131void report_timers() {
132 if (hila::myrank() == 0) {
133 if (timer_list.size() > 0) {
134
135#if defined(CUDA) || defined(HIP)
136#if defined(GPU_SYNCHRONIZE_TIMERS)
137 hila::out << "TIMERS: synchronized to GPU kernel execution (GPU_SYNCHRONIZE_TIMERS "
138 "defined)\n";
139#else
140 hila::out << "TIMERS: GPU_SYNCHRONIZE_TIMERS not defined, fine-grained timing "
141 "likely to be incorrect\n";
142#endif
143#endif
144
145 hila::out << "TIMER REPORT: total(sec) calls "
146 "time/call fraction\n";
147 hila::out << "------------------------------------------------------------"
148 "-----------------\n";
149
150 for (auto tp : timer_list) {
151 tp->report();
152 }
153
154 hila::out << "------------------------------------------------------------"
155 "-----------------\n";
156 } else {
157 hila::out << "No timers defined\n";
158 }
159 }
160}
161
162/////////////////////////////////////////////////////////////////
163/// Use clock_gettime() to get the accurate time
164/// (alternative: use gettimeofday() or MPI_Wtime())
165/// gettime returns the time in secs since program start
166
167double gettime() {
168 struct timespec tp;
169
170 if (start_time == -1.0)
171 inittime();
172
173 clock_gettime(CLOCK_MONOTONIC, &tp);
174 return (((double)tp.tv_sec - start_time) + 1.0e-9 * (double)tp.tv_nsec);
175}
176
177void inittime() {
178 if (start_time == -1.0) {
179 start_time = 0.0;
180 start_time = gettime();
181 }
182}
183
184//////////////////////////////////////////////////////////////////
185/// Routines for checking remaining cpu-time
186/// void setup_timelimit(): set the time limit to watch
187/// bool time_to_finish(): is called periodically on a point where exit can be done.
188/// It uses the max of the time intervals for the estimate for one further round.
189/// If not enough time returns true, else false
190///
191
192static double timelimit = 0;
193
194///
195/// Setup time limit with seconds
196
197void setup_timelimit(const double secs) {
198 timelimit = secs;
199 hila::broadcast(timelimit);
200 hila::out0 << "Time limit is " << timelimit << " seconds\n";
201}
202
203///
204/// setup time limit from the time given in timestr
205/// Format is d-h:m:s, not required to be "normalized" to std ranges
206/// Fields can be unused from the largest fields onwards, i.e. simplest case only seconds
207/// string "slurm" indicates that we call slurm 'squeue' to obtain the time limit
208
209void setup_timelimit(const std::string &timestr) {
210 //
211 constexpr int timelimit_buf_size = 100;
212
213 int status = 0;
214 if (hila::myrank() == 0) {
215 const char *str = timestr.c_str();
216 char buf[timelimit_buf_size];
217
218 if (timestr == "slurm") {
219
220 const char cmd[] = "squeue -h --job ${SLURM_JOB_ID} -O TimeLeft";
221 std::FILE *fp = popen(cmd, "r");
222
223 if (fp && fgets(buf, timelimit_buf_size - 1, fp)) {
224 buf[timelimit_buf_size - 1] = 0;
225 // zero extra spaces and lf at the end of the buf
226 for (int i = std::strlen(buf) - 1; i >= 0 && std::isspace(buf[i]); i--)
227 buf[i] = 0;
228
229 str = buf;
230 hila::out0 << "Got time limit with command '" << cmd << '\n';
231 } else {
232 hila::out0 << "COULD NOT GET TIME FROM squeue COMMAND\n";
233 status = -1; // exit the program
234 }
235 pclose(fp);
236 }
237
238 if (status == 0) {
239
240 unsigned d{0}, h{0}, m{0}, s{0};
241 // use short circuiting of || here to stop parsing on 1st match
242 // zeroing the incorrectly read time variables in the same chain
243 int nargs = 5;
244 if (std::sscanf(str, "%u-%u:%u:%u", &d, &h, &m, &s) == --nargs ||
245 std::sscanf(str, "%u:%u:%u", &h, &m, &s) == --nargs ||
246 std::sscanf(str, "%u:%u", &m, &s) == --nargs ||
247 std::sscanf(str, "%u", &s) == --nargs) {
248
249 if (nargs < 4) d = 0;
250 if (nargs < 3) h = 0;
251 if (nargs < 2) m = 0;
252
253 timelimit = s + 60.0 * (m + 60.0 * (h + 24.0 * d));
254 hila::out0 << "Time limit is " << str << " = " << timelimit << " seconds\n";
255
256 } else {
257 hila::out0 << "INVALID TIMELIMIT -t ARGUMENT " << str << '\n';
258 status = -1; // exit the program
259 }
260 }
261 }
262 hila::broadcast(status);
263 if (status == -1)
265
266 hila::broadcast(timelimit);
267}
268
269
270//////////////////////////////////////////////////////////////////////////////////////////////////
271/// Set up signal handling - signal SIGUSR1 causes the time_to_finish function to return
272/// true. Function hila::setup_signal_handler(); is called at the beginning of the program.
273/// Function hila::signal_status() returns signal value (!=0) if signal has been received.
274/// Signal is _not_ broadcast across nodes
275///
276/// Could also trap SIGTERM, but that would mean programs which do not handle the signal
277/// would not be killable by ctrl-C
278///
279
280static volatile std::sig_atomic_t received_signal;
281
282void signal_handler(int signal) {
283 received_signal = signal;
284}
285
286void setup_signal_handler() {
287 // std::signal(SIGTERM, signal_handler);
288 std::signal(SIGUSR1, signal_handler);
289}
290
291int signal_status() {
292 return received_signal;
293}
294
295//////////////////////////////////////////////////////////////////////////////////////////////////
296/// Check the cpu time limit or if signal SIGUSR1 is up - this function is meant to be called
297/// periodically, and returns true if it is time to exit or signal has been raised.
298///
299/// Use case: on main loop check 'hila::time_to_finish()' periodically, and if it returns true
300/// checkpoint/exit.
301///
302/// This makes sense only if the program can checkpoint or otherwise clean up.
303///
304/// Time limit or signaling are alternative ways to ensure clean exit. Both can be used
305/// at the same time. Time limit has the advantage that if the program does periodic
306/// checkpointing, it automatically adjusts the grace time to allow for checkpointing
307/// at the end. With signal the grace time must be estimated in advance.
308///
309/// Time limit can be given with program command line argument
310/// -t <time> or -t slurm
311/// or calling function hila::setup_timelimit(time), see above
312///
313/// Signal can be set in slurm submit script with
314/// #SBATCH --signal=SIGUSR1@180
315/// where the last number is the time in seconds when the signal SIGUSR1 is sent before
316/// the run time expires. This time must allow for the periodic check interval and the time
317/// the cleanup takes.
318///
319/// Signal can also be sent to slurm jobs from terminal session with
320/// $ scancel --signal=SIGUSR1 <jobid>
321/// and, of course, for normal "terminal" runs with
322/// $ kill -s SIGUSR1 <pid0>
323/// Note: signal must be sent to MPI rank 0. It can be sent to all ranks too.
324///
325
326
328 static double max_interval = 0.0;
329 static double previous_time = 0.0;
330 bool finish;
331
332 // is signal up?
333 int signal = signal_status();
334
335 if (hila::myrank() == 0) {
336 if (signal != 0) {
337 finish = true;
338 hila::out0 << "FINISH UP ON SIGNAL SIGUSR1\n";
339
340 } else if (timelimit == 0.0) {
341 // no signal nor time limit set
342 finish = false;
343
344 } else {
345
346 double this_time = gettime();
347 if (this_time - previous_time > max_interval)
348 max_interval = this_time - previous_time;
349 previous_time = this_time;
350
351 // Give 2 min margin for the exit - perhaps needed for writing etc.
352 if (timelimit - this_time < max_interval + 2 * 60.0)
353 finish = true;
354 else
355 finish = false;
356
357 // hila::out << "TIMECHECK: " << this_time << "s used, " << timelimit - this_time
358 // << "s remaining\n";
359
360 if (finish)
361 hila::out << "CPU TIME LIMIT, EXITING THE PROGRAM\n";
362 }
363 }
364 hila::broadcast(finish);
365 return finish;
366}
367
368/*****************************************************
369 * Time stamp
370 */
371
372void timestamp(const char *msg) {
373 if (hila::myrank() == 0) {
374 int p = hila::out0.precision();
375 std::time_t ct = std::time(NULL);
376 if (msg != NULL)
377 hila::out << msg;
378 std::string d = ctime(&ct);
379 d.resize(d.size() - 1); // take away \n at the end
380 hila::out0 << " -- date " << d << " run time " << std::setprecision(4) << hila::gettime()
381 << "s" << std::endl;
382 hila::out0.precision(p);
383 hila::out0.flush();
384 }
385}
386
387void timestamp(const std::string &msg) {
388 hila::timestamp(msg.c_str());
389}
390
391} // namespace hila
This file defines all includes for HILA.
Implement hila::swap for gauge fields.
Definition array.h:981
void setup_timelimit(const double secs)
Setup time limit with seconds.
Definition timing.cpp:197
int myrank()
rank of this node
Definition com_mpi.cpp:234
std::ostream out
this is our default output file stream
std::ostream out0
This writes output only from main process (node 0)
T broadcast(T &var, int rank=0)
Broadcast the value of var to all MPI ranks from rank (default=0).
Definition com_mpi.h:168
double gettime()
Definition timing.cpp:167
std::vector< timer * > timer_list
Timer routines - for high-resolution event timing.
Definition timing.cpp:28
void terminate(int status)
bool time_to_finish()
Definition timing.cpp:327