Line data Source code
1 : // Copyright (c) 2016-2025 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/cluck
4 : // contact@m2osw.com
5 : //
6 : // This program is free software: you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation, either version 3 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License
17 : // along with this program. If not, see <https://www.gnu.org/licenses/>.
18 :
19 :
20 : // self
21 : //
22 : #include "ticket.h"
23 :
24 : #include "cluckd.h"
25 :
26 :
27 : // cluck
28 : //
29 : #include <cluck/exception.h>
30 : #include <cluck/names.h>
31 :
32 :
33 : // advgetopt
34 : //
35 : #include <advgetopt/validator_integer.h>
36 :
37 :
38 : // snapdev
39 : //
40 : #include <snapdev/hexadecimal_string.h>
41 : #include <snapdev/string_replace_many.h>
42 : #include <snapdev/tokenize_string.h>
43 :
44 :
45 : // snaplogger
46 : //
47 : #include <snaplogger/message.h>
48 :
49 :
50 : // last include
51 : //
52 : #include <snapdev/poison.h>
53 :
54 :
55 :
56 : namespace cluck_daemon
57 : {
58 :
59 :
60 :
61 : /** \class ticket
62 : * \brief Handle the ticket messages.
63 : *
64 : * \section introduction Introduction
65 : *
66 : * This class manages the Leslie Lamport's Bakery Algorithm (1974) lock
67 : * mechanism (a critical section that we can get between any number
68 : * of threads, processes, computers.) Details of this algorithm can
69 : * be found here:
70 : *
71 : * http://en.wikipedia.org/wiki/Lamport's_bakery_algorithm
72 : *
73 : * The algorithm requires:
74 : *
75 : * \li A unique name for each computer (server_name)
76 : * \li A unique number for the process attempting the lock
77 : * (see gettid(2) manual)
78 : * \li A user supplied object name (the name of the lock)
79 : * \li A ticket number (use the largest existing ticket number + 1)
80 : *
81 : * We also include a timeout on any one lock so we can forfeit the
82 : * lock from happening if it cannot be obtained in a minimal amount
83 : * of time. The timeout is specified as an absolute time in the
84 : * future (now + X seconds.) The timeout is given in seconds (a
85 : * standard time_t value).
86 : *
87 : * This class sends various messages to manage the locks.
88 : *
89 : *
90 : * \section bakery_algorithm The Bakery Algorithm Explained
91 : *
92 : * The bakery algorithm is based on the basic idea that a large number
93 : * of customers go to one bakery to buy bread. In order to make sure
94 : * they all are served in the order they come in, they are given a ticket
95 : * with a number. The ticket numbers increase by one for each new customer.
96 : * The person still in line with the smallest ticket number is served next.
97 : * Once served, the ticket is destroyed.
98 : *
99 : * \note
100 : * The ticket numbers can restart at one whenever the queue of customers
101 : * goes empty. Otherwise it only increases. From our usage in Snap, it is
102 : * really rare that the ticket numbers would not quickly be reset,
103 : * especially because we have such numbers on a per object_name basis
104 : * and thus many times the number will actually be one.
105 : *
106 : * On a computer without any synchronization mechanism available (our case)
107 : * two customers may enter the bakery simultaneously (especially since we
108 : * are working with processes that may run on different computers.) This
109 : * means two customers may end up with the exact same ticket number and
110 : * there are no real means to avoid that problem. However, each customer
111 : * is also assigned two unique numbers on creation: its "host number"
112 : * (its server name, we use a string to simplify things) and its process
113 : * number (we actually use gettid() so each thread gets a unique number
114 : * which is an equivalent to a pid_t number for every single thread.)
115 : * These two numbers are used to further order processes and make sure
116 : * we can tell who will get the lock first.
117 : *
118 : * So, the basic bakery algorithm looks like this in C++. This algorithm
119 : * expects memory to be guarded (shared or "volatile"; always visible by
120 : * all threads.) In our case, we send the data over the network to
121 : * all the cluck daemon (cluckd) processes. This is definitely guarded.
122 : *
123 : * \code
124 : * // declaration and initial values of global variables
125 : * namespace {
126 : * int num_threads = 100;
127 : * std::vector<bool> entering;
128 : * std::vector<uint32_t> ticket;
129 : * }
130 : *
131 : * // initialize the vectors (all flags false, all tickets 0)
132 : * void init()
133 : * {
134 : * entering.resize(num_threads);
135 : * ticket.resize(num_threads);
136 : * }
137 : *
138 : * // i is a thread "number" (0 to 99)
139 : * void lock(int i)
140 : * {
141 : * // get the next ticket
142 : * entering[i] = true;
143 : * int my_ticket(0);
144 : * for(int j(0); j < num_threads; ++j)
145 : * {
146 : * if(ticket[j] > my_ticket)
147 : * {
148 : * my_ticket = ticket[j];
149 : * }
150 : * }
151 : * ticket[i] = my_ticket + 1; // add 1, we want the next ticket
152 : * entering[i] = false;
153 : *
154 : * for(int j(0); j < num_threads; ++j)
155 : * {
156 : * // wait until thread j receives its ticket number
157 : * while(entering[j])
158 : * {
159 : * sleep();
160 : * }
161 : *
162 : * // there are several cases:
163 : * //
164 : * // (1) tickets that are 0 are not assigned so we can just go
165 : * // through
166 : * //
167 : * // (2) smaller tickets win over us (have a higher priority,)
168 : * // so if there is another thread with a smaller ticket
169 : * // sleep a little and try again; that ticket must go to
170 : * // zero to let us through that guard
171 : * //
172 : * // (3) if tickets are equal, compare the thread numbers and
173 : * // like the tickets, the smallest thread wins
174 : * //
175 : * while(ticket[j] != 0 && (ticket[j] < ticket[i] || (ticket[j] == ticket[i] && j < i)))
176 : * {
177 : * sleep();
178 : * }
179 : * }
180 : * }
181 : *
182 : * // i is the thread number
183 : * void unlock(int i)
184 : * {
185 : * // release our ticket
186 : * ticket[i] = 0;
187 : * }
188 : *
189 : * void SomeThread(int i)
190 : * {
191 : * while(true)
192 : * {
193 : * [...]
194 : * // non-critical section...
195 : * lock(i);
196 : * // The critical section code goes here...
197 : * unlock(i);
198 : * // non-critical section...
199 : * [...]
200 : * }
201 : * }
202 : * \endcode
203 : *
204 : * Note that there are two possible optimizations when actually
205 : * implementing the algorithm:
206 : *
207 : * \li You can enter (entering[i] = true), get your ticket,
208 : * exit (entering[i] = false) and then get the list of
209 : * still existing 'entering' processes. Once that list
210 : * goes empty, we do not need to test the entering[j]
211 : * anymore because any further entering[j] will be about
212 : * processes with a larger ticket number and thus
213 : * processes that will appear later in the list of tickets.
214 : *
215 : * \li By sorting (and they are) our ticket requests by ticket,
216 : * server name, and process pid, we do not have to search
217 : * for the smallest ticket. The smallest ticket is automatically
218 : * first in that list! So all we have to do is: if not first,
219 : * sleep() some more.
220 : *
221 : * \section implementation Our implementation in cluck
222 : *
223 : * Locks are given a name by our users. This is used to lock just
224 : * one small thing for any amount of time as required by your
225 : * implementation.
226 : *
227 : * That name is used as an index to the f_tickets object in the
228 : * snaplock class. Within such a ticket, you have one entry per
229 : * process trying to obtain that lock.
230 : *
231 : * For example, the users plugin generates a unique user identifier
232 : * which is a number starting at 1. When a process needs to do this,
233 : * we need a lock to prevent any other processes to do it at the
234 : * same time. We also use a QUORUM consistency in Cassandra to
235 : * load/increment/save the user number.
236 : *
237 : * In this example, all we need to lock is an object named something
238 : * like "user number". Actually, if the number is specific to a
239 : * website, we can use the website URI. In this case, we can use a
240 : * name like this: "http://www.example.com/user#number". This says
241 : * we are managing an atomic "#number" at address
242 : * "http://www.example.com/user". This also means we do not need
243 : * to block anyone if the other people need to lock a completely
244 : * different field (so process A can lock the user unique number
245 : * while process B could lock an invoice unique number.)
246 : *
247 : * As a result, the locking mechanism manages the locks on a per
248 : * lock name basis. In other words, if only two processes request
249 : * a lock simultaneously and the object_name parameter are not equal,
250 : * they both get their lock instantaneously (at least very quickly.)
251 : *
252 : * \subsection message_sequence Message Sequence Chart
253 : *
254 : * \msc
255 : * Client,SnapLockA,SnapLockB,SnapLockC;
256 : *
257 : * Client->SnapLockA [label="LOCK"];
258 : *
259 : * SnapLockA->SnapLockA [label="LOCK_ENTERING"];
260 : * SnapLockA->SnapLockB [label="LOCK_ENTERING"];
261 : * SnapLockA->SnapLockC [label="LOCK_ENTERING"];
262 : *
263 : * SnapLockA->SnapLockA [label="LOCK_ENTERED"];
264 : * SnapLockB->SnapLockA [label="LOCK_ENTERED"];
265 : * SnapLockC->SnapLockA [label="LOCK_ENTERED"];
266 : *
267 : * SnapLockA->SnapLockA [label="GET_MAX_TICKET"];
268 : * SnapLockA->SnapLockB [label="GET_MAX_TICKET"];
269 : * SnapLockA->SnapLockC [label="GET_MAX_TICKET"];
270 : *
271 : * SnapLockA->SnapLockA [label="MAX_TICKET"];
272 : * SnapLockB->SnapLockA [label="MAX_TICKET"];
273 : * SnapLockC->SnapLockA [label="MAX_TICKET"];
274 : *
275 : * SnapLockA->SnapLockA [label="ADD_TICKET"];
276 : * SnapLockA->SnapLockB [label="ADD_TICKET"];
277 : * SnapLockA->SnapLockC [label="ADD_TICKET"];
278 : *
279 : * SnapLockA->SnapLockA [label="TICKET_ADDED"];
280 : * SnapLockB->SnapLockA [label="TICKET_ADDED"];
281 : * SnapLockC->SnapLockA [label="TICKET_ADDED"];
282 : *
283 : * SnapLockA->SnapLockA [label="LOCK_EXITING"];
284 : * SnapLockA->SnapLockB [label="LOCK_EXITING"];
285 : * SnapLockA->SnapLockC [label="LOCK_EXITING"];
286 : *
287 : * SnapLockA->Client [label="LOCKED"];
288 : * \endmsc
289 : *
290 : *
291 : * \section drawback Any drawback?
292 : *
293 : * \subsection timeouts Timeouts
294 : *
295 : * All our locks come with a timeout. The default is defined in
296 : * CLUCK_LOCK_DURATION_DEFAULT_TIMEOUT, which is 5 seconds.
297 : * (5 seconds, which for a front end hit to a website is very
298 : * long already!) If that timeout is too short (i.e. a backend
299 : * does heavy lifting work on the data), then you can make it
300 : * larger. Our backends are given 4h by default.
301 : *
302 : * \subsection deadlock Deadlock
303 : *
304 : * Like with any lock, if you have two processes that both try
305 : * two distinct locks each in the other order, you get a deadlock:
306 : *
307 : * P1 tries to get L1, and gets it;
308 : *
309 : * P2 tries to get L2, and gets it;
310 : *
311 : * P1 tries to get L2, and has to wait on P2;
312 : *
313 : * P2 tries to get L1, and creates a deadlock.
314 : *
315 : * The deadlock itself will be resolved once a lock times out,
316 : * but P2 will "never" have a chance to work on L1 if that sequence
317 : * always happens.
318 : */
319 :
320 :
321 :
322 :
323 : /** \brief Initialize a ticket object.
324 : *
325 : * The constructor initializes a ticket object by creating a ticket
326 : * key and allocating an entering object.
327 : *
328 : * Once the entering object was acknowledged by QUORUM cluck daemon
329 : * instances (i.e. one other computer since we allow exactly 3 leaders,)
330 : * we can then create the ticket.
331 : *
332 : * \note
333 : * We create a key from the server name, client PID, and object
334 : * name for the entering process to run. This key is unique
335 : * among all computers assuming (1) your client PID is unique and
336 : * (2) your servers all have unique names and both of these conditions
337 : * are always true (i.e. we do not allow a cluckd to join a cluster if
338 : * its name was already registered).
339 : *
340 : * \note
341 : * If you use threads, or are likely to use threads, make sure to
342 : * use the gettid() function instead of getpid() to define a
343 : * unique client PID. (Note: this is done in the cluck library.)
344 : *
345 : * \param[in] c A pointer to the cluckd object.
346 : * \param[in] messenger A pointer to the messenger.
347 : * \param[in] object_name The name of the object getting locked.
348 : * \param[in] tag The tag from the message to distinct different attempts.
349 : * \param[in] entering_key The key (ticket) used to enter the bakery.
350 : * \param[in] obtention_timeout The time when the attempt to get the lock
351 : * times out in seconds.
352 : * \param[in] lock_duration The amount of time the lock lasts once obtained.
353 : * \param[in] server_name The name of the server generating the locked.
354 : * \param[in] service_name The service waiting for the LOCKED message.
355 : */
356 133 : ticket::ticket(
357 : cluckd * c
358 : , messenger::pointer_t messenger
359 : , std::string const & object_name
360 : , ed::dispatcher_match::tag_t tag
361 : , std::string const & entering_key
362 : , cluck::timeout_t obtention_timeout
363 : , cluck::timeout_t lock_duration
364 : , std::string const & server_name
365 133 : , std::string const & service_name)
366 133 : : f_cluckd(c)
367 133 : , f_messenger(messenger)
368 133 : , f_object_name(object_name)
369 133 : , f_tag(tag)
370 133 : , f_obtention_timeout(obtention_timeout)
371 133 : , f_lock_duration(std::clamp(
372 : lock_duration
373 : , cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT
374 : , cluck::CLUCK_MAXIMUM_TIMEOUT))
375 133 : , f_server_name(server_name)
376 133 : , f_service_name(service_name)
377 133 : , f_owner(f_cluckd->get_server_name())
378 532 : , f_entering_key(entering_key)
379 : {
380 133 : set_unlock_duration(f_lock_duration);
381 :
382 : // TODO: see how to not say "attempting a lock" when we are deserializing
383 : // an existing lock.
384 665 : SNAP_LOG_TRACE
385 : << "Attempting to lock \""
386 133 : << f_object_name
387 133 : << "\" ("
388 133 : << f_tag
389 : << ") on \""
390 133 : << f_entering_key
391 : << "\" for \""
392 133 : << f_server_name
393 : << '/'
394 133 : << f_service_name
395 : << "\" (timeout: "
396 : << f_obtention_timeout
397 : << ")."
398 : << SNAP_LOG_SEND;
399 133 : }
400 :
401 :
402 : /** \brief Send a message to the other two leaders.
403 : *
404 : * The \p msg is "broadcast" to the other two leaders.
405 : *
406 : * This is a safe guard so if one of our three leaders fails, we have
407 : * a backup of the lock status.
408 : *
409 : * The locking system also works if there are only two or even just one
410 : * computer. In those cases, special care has to be taken to get things
411 : * to work as expected.
412 : *
413 : * \param[in] msg The message to send to the other two leaders.
414 : *
415 : * \return true if the message was forwarded at least once, false otherwise.
416 : */
417 803 : bool ticket::send_message_to_leaders(ed::message & msg)
418 : {
419 : // finish the message initialization
420 : //
421 2409 : msg.set_service(cluck::g_name_cluck_service_name);
422 2409 : msg.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
423 2409 : msg.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
424 :
425 803 : computer::pointer_t leader(f_cluckd->get_leader_a());
426 803 : if(leader != nullptr)
427 : {
428 : // there are at least two leaders
429 : //
430 784 : int count(0);
431 784 : msg.set_server(leader->get_name());
432 784 : if(f_messenger->send_message(msg))
433 : {
434 784 : ++count;
435 : }
436 :
437 : // check for a third leader
438 : //
439 784 : leader = f_cluckd->get_leader_b();
440 784 : if(leader != nullptr)
441 : {
442 70 : msg.set_server(leader->get_name());
443 70 : if(f_messenger->send_message(msg))
444 : {
445 70 : ++count;
446 : }
447 : }
448 :
449 : // we have to wait for at least one reply if we were able to send
450 : // at least one message
451 : //
452 784 : return count > 0;
453 : }
454 :
455 : // there is only one leader (ourselves)
456 : //
457 : // call the one_leader() function to verify that this is indeed correct
458 : // otherwise we would mess up the algorithm
459 : //
460 19 : return one_leader();
461 803 : }
462 :
463 :
464 : /** \brief Enter the mode that lets us retrieve our ticket number.
465 : *
466 : * In order to make sure we can get the current largest ticket number
467 : * in a unique enough way, cluck has to enter the lock loop. This
468 : * process starts by sending a `LOCK_ENTERING` message to all the
469 : * other cluckd leaders.
470 : */
471 119 : void ticket::entering()
472 : {
473 : // TODO implement the special case when there is only 1 leader
474 : // (on the other hand, that should be rather rare)
475 : //computer::pointer_t leader(f_cluckd->get_leader_a());
476 : //if(leader == nullptr)
477 : //{
478 : // -- do the necessary to obtain the lock --
479 : // return;
480 : //}
481 :
482 119 : ed::message entering_message;
483 357 : entering_message.set_command(cluck::g_name_cluck_cmd_lock_entering);
484 357 : entering_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
485 357 : entering_message.add_parameter(cluck::g_name_cluck_param_timeout, f_obtention_timeout);
486 357 : entering_message.add_parameter(cluck::g_name_cluck_param_duration, f_lock_duration);
487 119 : if(f_lock_duration != f_unlock_duration)
488 : {
489 12 : entering_message.add_parameter(cluck::g_name_cluck_param_unlock_duration, f_unlock_duration);
490 : }
491 357 : entering_message.add_parameter(cluck::g_name_cluck_param_source, f_server_name + "/" + f_service_name);
492 357 : entering_message.add_parameter(cluck::g_name_cluck_param_serial, f_serial);
493 119 : if(send_message_to_leaders(entering_message))
494 : {
495 119 : if(one_leader())
496 : {
497 : // there are no other leaders, make sure the algorithm progresses
498 : //
499 3 : entered();
500 : }
501 : }
502 238 : }
503 :
504 :
505 : /** \brief Tell this entering that we received a LOCKENTERED message.
506 : *
507 : * This function gets called each time we receive a `LOCKENTERED`
508 : * message with this ticket entering key.
509 : *
510 : * Since we have 1 to 3 leaders, the quorum and thus consensus is reached
511 : * as soon as we receive one `LOCKENTERED` message. So as a result this
512 : * function sends `GETMAXTICKET` the first time it gets called. The
513 : * `GETMAXTICKET` message allows us to determine the ticket number for
514 : * the concerned object.
515 : *
516 : * \note
517 : * The msg_lockentered() function first checked whether the
518 : * `LOCKENTERED` message had anything to do with this ticket.
519 : * If not, the message was just ignored.
520 : */
521 126 : void ticket::entered()
522 : {
523 : // is this ticket concerned?
524 : //
525 126 : if(!f_get_max_ticket)
526 : {
527 : // with 2 or 3 leaders, quorum is obtain with one
528 : // single acknowledgement
529 : //
530 116 : f_get_max_ticket = true;
531 :
532 : // calculate this instance max. ticket number
533 : //
534 116 : f_our_ticket = f_cluckd->get_last_ticket(f_object_name);
535 :
536 116 : ed::message get_max_ticket_message;
537 348 : get_max_ticket_message.set_command(cluck::g_name_cluck_cmd_get_max_ticket);
538 348 : get_max_ticket_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
539 116 : if(send_message_to_leaders(get_max_ticket_message))
540 : {
541 116 : if(one_leader())
542 : {
543 : // there are no other leaders, make sure the algorithm progresses
544 : //
545 3 : max_ticket(f_our_ticket);
546 : }
547 : }
548 116 : }
549 126 : }
550 :
551 :
552 : /** \brief Called whenever a MAX_TICKET is received.
553 : *
554 : * This function registers the largest ticket number. Once we reach
555 : * QUORUM, then we have the largest number and we can move on to the
556 : * next stage, which is to add the ticket.
557 : *
558 : * \note
559 : * We reach quorum immediately in our current implementation since we
560 : * have 1, 2, or 3 leaders. So this function takes the input in account
561 : * once, calls add_ticket() immediately and if the 3rd leader does send
562 : * a reply too, it gets ignored.
563 : *
564 : * \param[in] new_max_ticket Another possibly larger ticket.
565 : */
566 128 : void ticket::max_ticket(ticket_id_t new_max_ticket)
567 : {
568 128 : if(!f_added_ticket)
569 : {
570 117 : if(new_max_ticket > f_our_ticket)
571 : {
572 110 : f_our_ticket = new_max_ticket;
573 : }
574 :
575 117 : ++f_our_ticket;
576 117 : if(f_our_ticket == NO_TICKET)
577 : {
578 : // f_out_ticket is a 32 bit number, this can happen only if you
579 : // created over 4 billion locks back to back--i.e. created a new
580 : // one before the previous one was released; or put in a different
581 : // way: the list of tickets with that "object name" never went
582 : // back to being empty for that long...
583 : //
584 6 : throw cluck::out_of_range("ticket::max_ticket() tried to generate the next ticket and got a wrapping around number.");
585 : }
586 :
587 115 : add_ticket();
588 : }
589 126 : }
590 :
591 :
592 : /** \brief Send the ADD_TICKET message.
593 : *
594 : * This function sends the ADD_TICKET message to all the cluckd
595 : * instances currently known.
596 : *
597 : * \exception logic_error
598 : * This exception is raised if the function gets called twice or more.
599 : * Since it is considered an internal function, it should not be an issue.
600 : */
601 115 : void ticket::add_ticket()
602 : {
603 : // we expect exactly one call to this function
604 : //
605 115 : if(f_added_ticket)
606 : {
607 : throw cluck::logic_error("ticket::add_ticket() called more than once."); // LCOV_EXCL_LINE
608 : }
609 115 : f_added_ticket = true;
610 :
611 : //
612 : // WARNING: the ticket key MUST be properly sorted by:
613 : //
614 : // ticket number
615 : // server name
616 : // client pid
617 : //
618 : // The client PID does not need to be sorted numerically, just be sorted
619 : // so one client is before the other.
620 : //
621 : // However, the ticket number MUST be numerically sorted. For this reason,
622 : // since the key is a string, we must add introducing zeroes.
623 : //
624 345 : f_ticket_key = snapdev::int_to_hex(f_our_ticket, false, 8)
625 460 : + '/'
626 345 : + f_entering_key;
627 :
628 115 : f_cluckd->set_ticket(f_object_name, f_ticket_key, shared_from_this());
629 :
630 115 : ed::message add_ticket_message;
631 345 : add_ticket_message.set_command(cluck::g_name_cluck_cmd_add_ticket);
632 345 : add_ticket_message.add_parameter(cluck::g_name_cluck_param_key, f_ticket_key);
633 345 : add_ticket_message.add_parameter(cluck::g_name_cluck_param_timeout, f_obtention_timeout);
634 115 : if(send_message_to_leaders(add_ticket_message))
635 : {
636 115 : if(one_leader())
637 : {
638 3 : ticket_added(f_cluckd->get_entering_tickets(f_object_name));
639 : }
640 : }
641 230 : }
642 :
643 :
644 : /** \brief Called whenever a TICKET_ADDED is received.
645 : *
646 : * This function sends a LOCK_EXITING if the ticket reached the total number
647 : * of TICKET_ADDED required to get a quorum (which is just one with 1 to 3
648 : * leaders.)
649 : *
650 : * The \p still_entering paramater defines the list of tickets that are
651 : * still trying to enter the same object. This is very important. It needs
652 : * to be completely drained before we can proceed and mark the ticket as
653 : * assigned.
654 : *
655 : * \param[in] still_entering The list of still entering processes
656 : */
657 116 : void ticket::ticket_added(key_map_t const & still_entering)
658 : {
659 116 : if(!f_added_ticket_quorum)
660 : {
661 : // when we have 2 or 3 leaders, quorum is obtain with one
662 : // single acknowledgement
663 : //
664 114 : f_added_ticket_quorum = true;
665 :
666 114 : f_still_entering = still_entering;
667 :
668 : // okay, the ticket was added on all cluck daemons
669 : // now we can forget about the entering flag
670 : // (equivalent to setting it to false)
671 : //
672 114 : ed::message exiting_message;
673 342 : exiting_message.set_command(cluck::g_name_cluck_cmd_lock_exiting);
674 342 : exiting_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
675 114 : snapdev::NOT_USED(send_message_to_leaders(exiting_message));
676 :
677 114 : f_cluckd->lock_exiting(exiting_message);
678 114 : }
679 116 : }
680 :
681 :
682 : /** \brief Call any time time an entering flag is reset.
683 : *
684 : * This function gets called whenever an entering flag gets set
685 : * back to false (i.e. removed in our implementation).
686 : *
687 : * This function knows whether this ticket received its number
688 : * and is not yet ready. In both of these circumstances, we
689 : * are waiting for all entering flags that got created while
690 : * we determined the largest ticket number to be removed.
691 : *
692 : * \param[in] key The key of the ticket that was entered.
693 : */
694 10021 : void ticket::remove_entering(std::string const & key)
695 : {
696 10021 : if(f_added_ticket_quorum
697 5068 : && !f_ticket_ready)
698 : {
699 5067 : auto it(f_still_entering.find(key));
700 5067 : if(it != f_still_entering.end())
701 : {
702 5067 : f_still_entering.erase(it);
703 :
704 : // just like the quorum computation, we compute the
705 : // remaining list of entering tickets dynamically at
706 : // the time we check the value
707 : //
708 171721 : for(auto key_entering(f_still_entering.begin()); key_entering != f_still_entering.end(); )
709 : {
710 166654 : if(key_entering->second->timed_out())
711 : {
712 0 : key_entering = f_still_entering.erase(key_entering);
713 : }
714 : else
715 : {
716 166654 : ++key_entering;
717 : }
718 : }
719 :
720 : // once all removed, our ticket is ready!
721 : //
722 5067 : if(f_still_entering.empty())
723 : {
724 114 : f_ticket_ready = true;
725 :
726 : // let the other two leaders know that the ticket is ready
727 : //
728 114 : ed::message ticket_ready_message;
729 342 : ticket_ready_message.set_command(cluck::g_name_cluck_cmd_ticket_ready);
730 342 : ticket_ready_message.add_parameter(cluck::g_name_cluck_param_key, f_ticket_key);
731 114 : snapdev::NOT_USED(send_message_to_leaders(ticket_ready_message));
732 114 : }
733 : }
734 : }
735 10021 : }
736 :
737 :
738 : /** \brief Check whether this ticket can be activated and do so if so.
739 : *
740 : * This function checks whether the ticket is ready to be activated.
741 : * This means it got a ticket and the ticket is ready. If so, then
742 : * it sends the LOCKED message back to the system that required it.
743 : *
744 : * This function can be called multiple times. It will send
745 : * the ACTIVATE_LOCK message only once.
746 : *
747 : * On a system with only one computer, it will also send the LOCKED
748 : * message immediately.
749 : */
750 217 : void ticket::activate_lock()
751 : {
752 217 : if(f_ticket_ready
753 115 : && !f_locked
754 114 : && f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
755 : {
756 114 : ed::message activate_lock_message;
757 342 : activate_lock_message.set_command(cluck::g_name_cluck_cmd_activate_lock);
758 342 : activate_lock_message.add_parameter(cluck::g_name_cluck_param_key, f_ticket_key);
759 114 : if(send_message_to_leaders(activate_lock_message))
760 : {
761 114 : if(one_leader())
762 : {
763 3 : lock_activated();
764 : }
765 : }
766 114 : }
767 217 : }
768 :
769 :
770 : /** \brief Check whether this ticket can be activated and do so if so.
771 : *
772 : * This function checks whether the ticket is ready to be activated.
773 : * This means it got a ticket and the ticket is ready. If so, then
774 : * it sends the LOCKED message back to the system that required it.
775 : *
776 : * This function can be called multiple times. It will send
777 : * the LOCKED message only once.
778 : */
779 134 : void ticket::lock_activated()
780 : {
781 134 : if(f_ticket_ready
782 134 : && !f_locked
783 114 : && f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
784 : {
785 114 : f_locked = true;
786 114 : f_lock_timeout_date = snapdev::now() + f_lock_duration;
787 114 : f_unlocked_timeout_date = f_lock_timeout_date + f_unlock_duration;
788 :
789 114 : if(f_owner == f_cluckd->get_server_name())
790 : {
791 113 : ed::message locked_message;
792 339 : locked_message.set_command(cluck::g_name_cluck_cmd_locked);
793 113 : locked_message.set_server(f_server_name);
794 113 : locked_message.set_service(f_service_name);
795 339 : locked_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
796 339 : locked_message.add_parameter(cluck::g_name_cluck_param_timeout_date, f_lock_timeout_date);
797 339 : locked_message.add_parameter(cluck::g_name_cluck_param_unlocked_date, f_unlocked_timeout_date);
798 339 : locked_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
799 113 : f_messenger->send_message(locked_message);
800 113 : }
801 : }
802 134 : }
803 :
804 :
805 : /** \brief We are done with the ticket.
806 : *
807 : * This function sends the DROP_TICKET message to get rid of a ticket
808 : * from another leader's list of tickets.
809 : *
810 : * Another leader has a list of tickets as it receives LOCK and ADDTICKET
811 : * messages.
812 : */
813 111 : void ticket::drop_ticket()
814 : {
815 333 : SNAP_LOG_TRACE
816 : << "Unlock on \""
817 111 : << f_object_name
818 111 : << "\" ("
819 111 : << f_tag
820 : << ") with key \""
821 111 : << f_entering_key
822 : << "\"."
823 : << SNAP_LOG_SEND;
824 :
825 111 : ed::message drop_ticket_message;
826 333 : drop_ticket_message.set_command(cluck::g_name_cluck_cmd_drop_ticket);
827 333 : drop_ticket_message.add_parameter(
828 : cluck::g_name_cluck_param_key
829 111 : , f_ticket_key.empty() ? f_entering_key : f_ticket_key);
830 111 : send_message_to_leaders(drop_ticket_message);
831 :
832 111 : if(f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
833 : {
834 109 : f_lock_failed = lock_failure_t::LOCK_FAILURE_UNLOCKING;
835 :
836 : //if(f_owner == f_cluckd->get_server_name()) -- this can happen with any leader so we have to send the UNLOCKED
837 : // the other leaders won't call this function they receive DROP_TICKET
838 : // instead and as mentioned in the TODO below, we should get a QUORUM
839 : // instead...
840 : {
841 : // we can immediately say it got unlocked...
842 : //
843 : // TODO: this is true ONLY if you lock the same object no more than
844 : // once within a session, which is not unlikely false (it is
845 : // true for what I can remember of Snap!, but long term this
846 : // is not safe.) Like the LOCK, we need a quorum and then
847 : // send the UNLOCK... At this point, I'm not too sure how
848 : // we implement such because the drop_ticket function ends
849 : // up deleting the ticket from memory and thus no counting
850 : // can happen after that... (i.e. we need a special case
851 : // of the receiver for the UNLOCK, argh!)
852 : //
853 109 : ed::message unlocked_message;
854 327 : unlocked_message.set_command(cluck::g_name_cluck_cmd_unlocked);
855 109 : unlocked_message.set_server(f_server_name);
856 109 : unlocked_message.set_service(f_service_name);
857 327 : unlocked_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
858 327 : unlocked_message.add_parameter(cluck::g_name_cluck_param_unlocked_date, snapdev::now());
859 327 : unlocked_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
860 109 : f_messenger->send_message(unlocked_message);
861 109 : }
862 : }
863 222 : }
864 :
865 :
866 : /** \brief Let the service that wanted this lock know that it failed.
867 : *
868 : * This function sends a reply to the server that requested the lock to
869 : * let it know that it somehow failed.
870 : *
871 : * The function replies with a LOCK_FAILED when the lock was never
872 : * obtained. In this case the origin server cannot access the resources.
873 : *
874 : * The function replies with UNLOCKING when the lock timed out. The
875 : * server is expected to send an UNLOCK reply to acknowledge the
876 : * failure and fully release the lock. The lock will remain in place
877 : * until that acknowledgement is received or an amount of time
878 : * equal to the lock duration by default with a minimum of 1 minute.
879 : *
880 : * The UNLOCKING acknowledgement timeout is set to the same amount as
881 : * the LOCK duration if the `unlock_duration` parameter is not specified
882 : * in the LOCK message. When the `unlock_duration` parameter is specified,
883 : * then that amount is used instead.
884 : *
885 : * \note
886 : * The function may get called multiple times. The failure message
887 : * is sent only on the first call.
888 : *
889 : * \note
890 : * If the ticket was created on another cluck daemon (not the one that
891 : * received the LOCK event in the first place) then this ticket is
892 : * not marked as being owned by this cluck daemon and as a result this
893 : * function only marks the ticket as failed.
894 : *
895 : * \param[in] reason A reason for the failure (i.e. "timed out")
896 : */
897 14 : void ticket::lock_failed(std::string const & reason)
898 : {
899 : enum send_msg_t
900 : {
901 : SEND_MSG_NONE,
902 : SEND_MSG_UNLOCKING,
903 : SEND_MSG_UNLOCKED,
904 : SEND_MSG_FAILED,
905 : };
906 :
907 14 : send_msg_t send(SEND_MSG_NONE);
908 :
909 14 : switch(f_lock_failed)
910 : {
911 7 : case lock_failure_t::LOCK_FAILURE_NONE:
912 : // send that message at most once
913 : //
914 7 : f_lock_failed = lock_failure_t::LOCK_FAILURE_LOCK;
915 :
916 7 : if(f_locked)
917 : {
918 : // now we have to extend the lock timeout to make sure that
919 : // the UNLOCKING has a chance to be acknowledged
920 : //
921 5 : f_lock_timeout_date += f_unlock_duration;
922 5 : if(timed_out())
923 : {
924 : // this case is logical here, but I don't think it can
925 : // happen because the f_locked is true and thus the only
926 : // value we can use is f_lock_timeout_date and we just
927 : // increased that value by at least 3 seconds
928 : //
929 : send = SEND_MSG_UNLOCKED; // LCOV_EXCL_LINE
930 : }
931 : else
932 : {
933 5 : send = SEND_MSG_UNLOCKING;
934 : }
935 : }
936 : else
937 : {
938 2 : send = SEND_MSG_FAILED;
939 : }
940 7 : break;
941 :
942 5 : case lock_failure_t::LOCK_FAILURE_LOCK:
943 5 : f_lock_failed = lock_failure_t::LOCK_FAILURE_UNLOCKING;
944 :
945 5 : if(f_locked)
946 : {
947 3 : send = SEND_MSG_UNLOCKED;
948 : }
949 5 : break;
950 :
951 2 : case lock_failure_t::LOCK_FAILURE_UNLOCKING:
952 : // we already sent all the possible messages
953 2 : break;
954 :
955 : }
956 :
957 : // we want the f_lock_failed and f_lock_timeout_date set before returning
958 : //
959 14 : if(f_owner != f_cluckd->get_server_name())
960 : {
961 2 : return;
962 : }
963 :
964 12 : switch(send)
965 : {
966 3 : case SEND_MSG_NONE:
967 : // don't send another message
968 3 : break;
969 :
970 5 : case SEND_MSG_UNLOCKING:
971 : {
972 : // if we were locked and reach here, then the lock
973 : // timed out while locked but the unlock timeout was
974 : // not yet reached so just send an UNLOCKING message
975 : //
976 15 : SNAP_LOG_IMPORTANT
977 : << "Lock on \""
978 5 : << f_object_name
979 5 : << "\" ("
980 5 : << f_tag
981 : << ") with key \""
982 5 : << f_entering_key
983 : << "\" timed out its lock allowed time."
984 : << SNAP_LOG_SEND;
985 :
986 5 : ed::message lock_failed_message;
987 15 : lock_failed_message.set_command(cluck::g_name_cluck_cmd_unlocking);
988 5 : lock_failed_message.set_server(f_server_name);
989 5 : lock_failed_message.set_service(f_service_name);
990 15 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
991 15 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
992 25 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_error, cluck::g_name_cluck_value_timedout);
993 5 : f_messenger->send_message(lock_failed_message);
994 5 : }
995 : break;
996 :
997 3 : case SEND_MSG_UNLOCKED:
998 : {
999 : // if we were locked and/or unlocking and we reach here,
1000 : // then the lock completely timed out and we immediately
1001 : // completely unlock with an UNLOCKED message
1002 : //
1003 : // IMPORTANT: that means the service should stop using the
1004 : // shared resources but there is absoltely no
1005 : // guarantee about that; however, this situation
1006 : // should only occur when a service somehow does
1007 : // not properly UNLOCK its lock
1008 : //
1009 9 : SNAP_LOG_IMPORTANT
1010 : << "Lock on \""
1011 3 : << f_object_name
1012 3 : << "\" ("
1013 3 : << f_tag
1014 : << ") with key \""
1015 3 : << f_entering_key
1016 : << "\" timed out its unlocking allowed time."
1017 : << SNAP_LOG_SEND;
1018 :
1019 3 : ed::message lock_failed_message;
1020 9 : lock_failed_message.set_command(cluck::g_name_cluck_cmd_unlocked);
1021 3 : lock_failed_message.set_server(f_server_name);
1022 3 : lock_failed_message.set_service(f_service_name);
1023 9 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
1024 9 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
1025 15 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_error, cluck::g_name_cluck_value_timedout);
1026 3 : f_messenger->send_message(lock_failed_message);
1027 3 : }
1028 : break;
1029 :
1030 1 : case SEND_MSG_FAILED:
1031 : {
1032 3 : SNAP_LOG_IMPORTANT
1033 : << "Lock on \""
1034 1 : << f_object_name
1035 1 : << "\" ("
1036 1 : << f_tag
1037 : << ") with key \""
1038 1 : << f_entering_key
1039 : << "\" failed."
1040 : << SNAP_LOG_SEND;
1041 :
1042 1 : ed::message lock_failed_message;
1043 3 : lock_failed_message.set_command(cluck::g_name_cluck_cmd_lock_failed);
1044 1 : lock_failed_message.set_server(f_server_name);
1045 1 : lock_failed_message.set_service(f_service_name);
1046 3 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
1047 3 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
1048 3 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
1049 5 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_error, cluck::g_name_cluck_value_failed);
1050 4 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_description,
1051 : "ticket failed before or after the lock was obtained ("
1052 2 : + reason
1053 4 : + ")");
1054 1 : f_messenger->send_message(lock_failed_message);
1055 1 : }
1056 : break;
1057 :
1058 : }
1059 : }
1060 :
1061 :
1062 : /** \brief Define whether this ticket is the owner of that lock.
1063 : *
1064 : * Whenever comes time to send the LOCK, UNLOCK, or LOCK_FAILED messages,
1065 : * only the owner is expected to send it. This flag tells us who the
1066 : * owner is and thus who is responsible for sending that message.
1067 : *
1068 : * \todo
1069 : * The ownership has to travel to others whenever a leader disappears.
1070 : *
1071 : * \param[in] owner The name of this ticket owner.
1072 : */
1073 3 : void ticket::set_owner(std::string const & owner)
1074 : {
1075 3 : f_owner = owner;
1076 3 : }
1077 :
1078 :
1079 : /** \brief Return the name of this ticket's owner.
1080 : *
1081 : * This function returns the name of the owner of this ticket. When a
1082 : * leader dies out, its name stick around until a new leader gets
1083 : * assigned to it.
1084 : *
1085 : * The owner is actually the name of the sending server. So if leader 1
1086 : * is named "alfred" and it sends a ticket message (i.e. LOCK_ENTERING),
1087 : * then the ticket owner parameter will be set "alfred".
1088 : *
1089 : * The owner name is set when you create a ticket or by unserializing
1090 : * a ticket dump. Serialization is used to share tickets between
1091 : * cluck daemon when we lose a leader and a new computer becomes a
1092 : * new leader.
1093 : *
1094 : * \return The name of this ticket owner.
1095 : */
1096 9 : std::string const & ticket::get_owner() const
1097 : {
1098 9 : return f_owner;
1099 : }
1100 :
1101 :
1102 : /** \brief Retrieve the client process identifier.
1103 : *
1104 : * This function splits the entering key and return the process identifier.
1105 : * This is primarily used to resend a LOCK message since in most cases
1106 : * this information should not be required.
1107 : *
1108 : * \note
1109 : * This is not really information that the ticket is supposed to know about
1110 : * but well... there is now a case where we need to know this.
1111 : *
1112 : * \return The process identifier of this ticket owner.
1113 : */
1114 3 : pid_t ticket::get_client_pid() const
1115 : {
1116 3 : std::vector<std::string> segments;
1117 9 : if(snapdev::tokenize_string(segments, f_entering_key, "/") != 2)
1118 : {
1119 : throw cluck::invalid_parameter(
1120 : "ticket::get_client_pid() split f_entering_key \""
1121 2 : + f_entering_key
1122 3 : + "\" and did not get exactly two segments.");
1123 : }
1124 2 : std::int64_t value;
1125 2 : advgetopt::validator_integer::convert_string(segments[1], value);
1126 2 : return static_cast<pid_t>(value);
1127 3 : }
1128 :
1129 :
1130 : /** \brief Give the lock a serial number for some form of unicity.
1131 : *
1132 : * When we lose a leader, the unicity of the ticket may be required as we
1133 : * start sharing the tickets between the surviving leaders. This is done
1134 : * for the RELOCK message which attempts to restart the an old LOCK. In
1135 : * that case, two leaders end up attempt a RELOCK on the same ticket.
1136 : * To make sure that we can easily ignore the second attempt, we use
1137 : * the serial number to see that the exact same message is getting there
1138 : * twice.
1139 : *
1140 : * The cluck daemon uses the leader number as part of the serial
1141 : * number (bits 24 and 25) so it is unique among all the instances,
1142 : * at least until a cluck deamon dies and its unique numbers get
1143 : * mingled (and the old leaders may change their own number too...)
1144 : *
1145 : * \param[in] serial The serial number of the ticket.
1146 : */
1147 122 : void ticket::set_serial(serial_t serial)
1148 : {
1149 122 : f_serial = serial;
1150 122 : }
1151 :
1152 :
1153 : /** \brief Return the serial number of this ticket.
1154 : *
1155 : * This function returns the serial number of this ticket. See the
1156 : * set_serial() function for additional information about this number.
1157 : *
1158 : * \return The serial number of the ticket.
1159 : */
1160 4 : ticket::serial_t ticket::get_serial() const
1161 : {
1162 4 : return f_serial;
1163 : }
1164 :
1165 :
1166 : /** \brief Change the unlock duration to the specified value.
1167 : *
1168 : * If the service requesting a lock fails to acknowledge an unlock, then
1169 : * the lock still gets unlocked after this \p duration.
1170 : *
1171 : * By default, this parameter gets set to the same value as duration with
1172 : * a minimum of 3 seconds. When the message includes an `unlock_duration`
1173 : * parameter then that value is used instead.
1174 : *
1175 : * \note
1176 : * If \p duration is less than cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT,
1177 : * then cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT is used. At time of writing
1178 : * cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT is 3 seconds.
1179 : *
1180 : * \warning
1181 : * It is important to understand that as soon as an UNLOCKED event arrives,
1182 : * you should acknowledge it. Not doing so increases the risk that two or
1183 : * more processes access the same resource simultaneously.
1184 : *
1185 : * \param[in] duration The amount of time to acknowledge an UNLOCKED
1186 : * event; after that the lock is released no matter what.
1187 : */
1188 255 : void ticket::set_unlock_duration(cluck::timeout_t duration)
1189 : {
1190 255 : if(duration == cluck::CLUCK_DEFAULT_TIMEOUT)
1191 : {
1192 115 : duration = f_lock_duration;
1193 : }
1194 :
1195 255 : f_unlock_duration = std::clamp(
1196 : duration
1197 : , cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT
1198 255 : , cluck::CLUCK_MAXIMUM_TIMEOUT);
1199 255 : }
1200 :
1201 :
1202 : /** \brief Get unlock duration.
1203 : *
1204 : * The unlock duration is used in case the lock times out. It extends
1205 : * the lock duration for that much longer until the client acknowledge
1206 : * the locks or the lock really times out.
1207 : *
1208 : * \note
1209 : * If not yet set, this function returns zero (a null timestamp).
1210 : *
1211 : * \return The unlock acknowledgement timeout duration.
1212 : */
1213 3 : cluck::timeout_t ticket::get_unlock_duration() const
1214 : {
1215 3 : return f_unlock_duration;
1216 : }
1217 :
1218 :
1219 : /** \brief Mark the ticket as being ready.
1220 : *
1221 : * This ticket is marked as being ready.
1222 : *
1223 : * A ticket is ready when all the entering tickets were removed from it
1224 : * on the owning leader. On the other two leaders, the ticket gets marked
1225 : * as being ready once they receive the LOCKEXITING message.
1226 : */
1227 103 : void ticket::set_ready()
1228 : {
1229 103 : f_ticket_ready = true;
1230 103 : }
1231 :
1232 :
1233 : /** \brief Set the ticket number.
1234 : *
1235 : * The other two leaders receive the ticket number in the ADDTICKET
1236 : * message. That number must be saved in the ticket, somehow. This
1237 : * is the function we use to do that.
1238 : *
1239 : * It is very important to have the correct number (by default it is
1240 : * zero) since the algorithm asks for the maximum ticket number
1241 : * currently available and without that information that request
1242 : * cannot be answered properly.
1243 : *
1244 : * \param[in] number The ticket number to save in f_our_ticket.
1245 : */
1246 6 : void ticket::set_ticket_number(ticket_id_t const number)
1247 : {
1248 6 : if(f_our_ticket != NO_TICKET
1249 4 : || f_added_ticket)
1250 : {
1251 : throw cluck::logic_error("ticket::set_ticket_number() called with "
1252 4 : + std::to_string(number)
1253 8 : + " when f_our_ticket is already set to "
1254 8 : + std::to_string(f_our_ticket)
1255 6 : + ".");
1256 : }
1257 4 : f_added_ticket = true;
1258 :
1259 4 : f_our_ticket = number;
1260 12 : f_ticket_key = snapdev::int_to_hex(f_our_ticket, false, 8)
1261 16 : + '/'
1262 12 : + f_entering_key;
1263 4 : }
1264 :
1265 :
1266 : /** \brief Return the ticket number of this ticket.
1267 : *
1268 : * This function returns the ticket number of this ticket. This
1269 : * is generally used to determine the largest ticket number
1270 : * currently in use in order to attach a new ticket number
1271 : * to a lock object.
1272 : *
1273 : * By default the value is NO_TICKET meaning that no ticket number was
1274 : * yet assigned to that ticket object.
1275 : *
1276 : * \return The current ticket number.
1277 : */
1278 16 : ticket::ticket_id_t ticket::get_ticket_number() const
1279 : {
1280 16 : return f_our_ticket;
1281 : }
1282 :
1283 :
1284 : /** \brief Check whether this ticket is locked or not.
1285 : *
1286 : * This function returns true if the ticket is currently locked.
1287 : *
1288 : * \return true when the ticket was successfully locked at some point.
1289 : */
1290 5 : bool ticket::is_locked() const
1291 : {
1292 5 : return f_locked;
1293 : }
1294 :
1295 :
1296 : /** \brief Check whether the system only has one leader.
1297 : *
1298 : * The function check the number of known leaders. If just one, then it
1299 : * returns true. This is important for our algorithm to work properly
1300 : * in that one specific case.
1301 : *
1302 : * \return true if there is only one leader (i.e. one single computer in
1303 : * your whole cluster).
1304 : */
1305 483 : bool ticket::one_leader() const
1306 : {
1307 483 : return f_cluckd->get_computer_count() == 1;
1308 : }
1309 :
1310 :
1311 : /** \brief Get the obtention timeout date.
1312 : *
1313 : * This function returns the obtention timeout. Note that if the lock
1314 : * was already obtained, then this date may be in the past. You can test
1315 : * that by checking the get_lock_timeout() function first.
1316 : *
1317 : * \return The date when the obtention of the ticket timeouts.
1318 : */
1319 10 : cluck::timeout_t ticket::get_obtention_timeout() const
1320 : {
1321 10 : return f_obtention_timeout;
1322 : }
1323 :
1324 :
1325 : /** \brief Define a time when the ticket times out while waiting.
1326 : *
1327 : * This function defines the time threshold when to timeout this
1328 : * ticket in case a service does not reply to an ALIVE message.
1329 : *
1330 : * Whenever a leader dies, a ticket which is not locked yet may be
1331 : * transferred to another leader. To not attempt to lock a ticket
1332 : * for nothing, the new leader first checks that the service
1333 : * which requested that lock is indeed still alive by send an
1334 : * ALIVE message to it. In return, it expects an ABSOLUTELY
1335 : * reply.
1336 : *
1337 : * If the ABSOLUTELY reply does not make it in time (at this time
1338 : * we limit this to 5 seconds) then we consider that this service
1339 : * is not responsive and we cancel the lock altogether.
1340 : *
1341 : * To cancel this timeout, call the function with cluck::timeout_t()
1342 : * in \p timeout (i.e. zero duration).
1343 : *
1344 : * \note
1345 : * Since that message should happen while the cluck daemon
1346 : * is waiting for the LOCK event, the reply should be close to
1347 : * instantaneous. So 5 seconds is plenty until somehow your
1348 : * network is really busy or really large and the time for
1349 : * the message to travel is too long.
1350 : *
1351 : * \param[in] timeout The time when the ALIVE message times out.
1352 : */
1353 8 : void ticket::set_alive_timeout(cluck::timeout_t timeout)
1354 : {
1355 8 : if(timeout < cluck::timeout_t())
1356 : {
1357 1 : timeout = cluck::timeout_t();
1358 : }
1359 :
1360 8 : if(timeout < f_obtention_timeout)
1361 : {
1362 6 : f_alive_timeout = timeout;
1363 : }
1364 : else
1365 : {
1366 : // use the obtention timeout if smaller because that was the
1367 : // first premise that the client asked about
1368 : //
1369 2 : f_alive_timeout = f_obtention_timeout;
1370 : }
1371 8 : }
1372 :
1373 :
1374 : /** \brief Retrieve the lock duration.
1375 : *
1376 : * This function returns the lock duration in seconds as defined with
1377 : * the constructor.
1378 : *
1379 : * \return The lock duration in seconds.
1380 : */
1381 2 : cluck::timeout_t ticket::get_lock_duration() const
1382 : {
1383 2 : return f_lock_duration;
1384 : }
1385 :
1386 :
1387 : /** \brief Get the lock timeout date.
1388 : *
1389 : * This function returns the lock timeout. If not yet defined, the
1390 : * function will return zero.
1391 : *
1392 : * \note
1393 : * The ticket will immediately be assigned a timeout date when it
1394 : * gets activated.
1395 : *
1396 : * \return The date when the ticket will timeout or zero.
1397 : */
1398 11 : cluck::timeout_t ticket::get_lock_timeout_date() const
1399 : {
1400 11 : return f_lock_timeout_date;
1401 : }
1402 :
1403 :
1404 : /** \brief Get the current lock timeout date.
1405 : *
1406 : * This function returns the "current" lock timeout.
1407 : *
1408 : * The "current" timeout is one of:
1409 : *
1410 : * \li If the lock is being re-requested (after the loss of a leader) then
1411 : * the ALIVE timeout may be returned for a short period of time.
1412 : *
1413 : * \li If the lock was not yet obtained, this function returns the obtention
1414 : * timeout timestamp.
1415 : *
1416 : * \li Once the lock was obtained, the lock timeout gets defined and that
1417 : * one is returned instead.
1418 : *
1419 : * \li When the UNLOCK is received or the timeout happens and cluckd sends
1420 : * the UNLOCKING message, the function returns the unlock timeout. In
1421 : * this case, the \em f_lock_time_date field is still used.
1422 : *
1423 : * \note
1424 : * This is the date used in the timed_out() function.
1425 : *
1426 : * \return The date when the ticket will timeout or zero.
1427 : */
1428 259806 : cluck::timeout_t ticket::get_current_timeout_date() const
1429 : {
1430 259806 : if(f_alive_timeout > cluck::timeout_t())
1431 : {
1432 7 : return f_alive_timeout;
1433 : }
1434 :
1435 259799 : if(f_locked)
1436 : {
1437 451 : return f_lock_timeout_date;
1438 : }
1439 :
1440 259348 : return f_obtention_timeout;
1441 : }
1442 :
1443 :
1444 : /** \brief Check whether this ticket timed out.
1445 : *
1446 : * This function returns true if the ticket timed out in its current
1447 : * state and should be moved to its next state.
1448 : *
1449 : * The function calls the get_current_timeout_date() to select the correct
1450 : * date. This depends on the current state of the ticket (i.e. maybe we
1451 : * sent the ALIVE message and are using the alive time out value).
1452 : *
1453 : * There are five timeout dates that can happen:
1454 : *
1455 : * 1. Time to obtain a lock
1456 : * 2. Time to keep the lock alive
1457 : * 3. Time to wait for a reply after an UNLOCKING message
1458 : * 4. Time to wait for the UNLOCK message
1459 : * 5. Time to wait for the ALIVE reply (i.e. the ABSOLUTELY message)
1460 : *
1461 : * \return true if the ticket timed out in its current state.
1462 : */
1463 222942 : bool ticket::timed_out() const
1464 : {
1465 222942 : return get_current_timeout_date() <= snapdev::now();
1466 : }
1467 :
1468 :
1469 : /** \brief Retrieve the object name of this ticket.
1470 : *
1471 : * This function returns the name of the object associated with this
1472 : * lock (i.e. what is being locked).
1473 : *
1474 : * \return The object name of the ticket.
1475 : */
1476 13 : std::string const & ticket::get_object_name() const
1477 : {
1478 13 : return f_object_name;
1479 : }
1480 :
1481 :
1482 : /** \brief Retrieve the tag of this ticket.
1483 : *
1484 : * This function returns the tag of the object associated with this
1485 : * lock (i.e. the specific instance of the lock being locked).
1486 : *
1487 : * \return The tag associated with this ticket.
1488 : */
1489 2 : ed::dispatcher_match::tag_t ticket::get_tag() const
1490 : {
1491 2 : return f_tag;
1492 : }
1493 :
1494 :
1495 : /** \brief Retrieve the server name of this ticket.
1496 : *
1497 : * This function returns the name of the server associated with this
1498 : * lock, i.e. the server to which the LOCKED and UNLOCKED commands are to
1499 : * be sent back to.
1500 : *
1501 : * This name is also used in case of an error to send the LOCKFAILED back
1502 : * to the service that requested the lock.
1503 : *
1504 : * \return The server name of the ticket.
1505 : */
1506 7 : std::string const & ticket::get_server_name() const
1507 : {
1508 7 : return f_server_name;
1509 : }
1510 :
1511 :
1512 : /** \brief Retrieve the service name of this ticket.
1513 : *
1514 : * This function returns the name of the service associated with this
1515 : * lock. This is the service to which the LOCKED and UNLOCKED messages
1516 : * are sent.
1517 : *
1518 : * This name is also used in case of an error to send the LOCKFAILED back
1519 : * to the service that requested the lock.
1520 : *
1521 : * \return The service name of the ticket.
1522 : */
1523 7 : std::string const & ticket::get_service_name() const
1524 : {
1525 7 : return f_service_name;
1526 : }
1527 :
1528 :
1529 : /** \brief Retrieve a reference to the entering key of this ticket.
1530 : *
1531 : * This function returns the entering key of this ticket. The
1532 : * entering key is defined on instantiation so it is always available.
1533 : *
1534 : * \note
1535 : * By contrast, the ticket key is not available up until the time the
1536 : * ticket number is marked as valid.
1537 : *
1538 : * \return The entering key of this ticket.
1539 : */
1540 130 : std::string const & ticket::get_entering_key() const
1541 : {
1542 130 : return f_entering_key;
1543 : }
1544 :
1545 :
1546 : /** \brief Retrieve a reference to the ticket key.
1547 : *
1548 : * This function returns the ticket key of this ticket. The
1549 : * ticket key is only defined at a later time when the ticket has
1550 : * properly entered the bakery. It includes three parameters:
1551 : *
1552 : * \li Ticket number as a hexadecimal number of 8 digits,
1553 : * \li Server name of the server asking for the lock,
1554 : * \li Process Identifier (PID) of the service daemon asking for the lock.
1555 : *
1556 : * \note
1557 : * This function returns an empty string until the ticket key is available.
1558 : *
1559 : * \return The ticket key.
1560 : */
1561 118 : std::string const & ticket::get_ticket_key() const
1562 : {
1563 118 : return f_ticket_key;
1564 : }
1565 :
1566 :
1567 : /** \brief Serialize a ticket to send it over to another leader.
1568 : *
1569 : * This function serialize a ticket to share it with the other
1570 : * leaders. This is important when a new leader gets elected
1571 : * as it would not otherwise have any idea of what the existing
1572 : * tickets are, although it is not 100% important, if another
1573 : * of the two snaplock was to go down, it becomes primordial
1574 : * for the tickets to be known in the other leaders.
1575 : *
1576 : * This is used at the start before a leader starts accepting new
1577 : * lock requests.
1578 : *
1579 : * \return This ticket as a serialized string.
1580 : *
1581 : * \sa unserialize()
1582 : */
1583 9 : std::string ticket::serialize() const
1584 : {
1585 9 : std::map<std::string, std::string> data;
1586 :
1587 27 : data["object_name"] = f_object_name;
1588 27 : data["tag"] = std::to_string(static_cast<int>(f_tag));
1589 27 : data["obtention_timeout"] = f_obtention_timeout.to_timestamp(true);
1590 : //data["alive_timeout"] = f_alive_timeout.to_timestamp(true); -- we do not want to transfer this one
1591 27 : data["lock_duration"] = f_lock_duration.to_timestamp(true);
1592 27 : data["unlock_duration"] = f_unlock_duration.to_timestamp(true);
1593 27 : data["server_name"] = f_server_name;
1594 27 : data["service_name"] = f_service_name;
1595 27 : data["owner"] = f_owner;
1596 9 : if(f_serial != NO_SERIAL)
1597 : {
1598 15 : data["serial"] = std::to_string(f_serial);
1599 : }
1600 27 : data["entering_key"] = f_entering_key;
1601 27 : data["get_max_ticket"] = f_get_max_ticket ? "true" : "false";
1602 27 : data["our_ticket"] = std::to_string(f_our_ticket);
1603 27 : data["added_ticket"] = f_added_ticket ? "true" : "false";
1604 27 : data["ticket_key"] = f_ticket_key;
1605 27 : data["added_ticket_quorum"] = f_added_ticket_quorum ? "true" : "false";
1606 :
1607 : // this is a map
1608 : //data["still_entering"] = f_still_entering;
1609 : //ticket::key_map_t f_still_entering = key_map_t();
1610 :
1611 27 : data["ticket_ready"] = f_ticket_ready ? "true" : "false";
1612 27 : data["locked"] = f_locked ? "true" : "false";
1613 27 : data["lock_timeout_date"] = f_lock_timeout_date.to_timestamp(true);
1614 :
1615 9 : switch(f_lock_failed)
1616 : {
1617 5 : case lock_failure_t::LOCK_FAILURE_NONE:
1618 15 : data["lock_failed"] = "none";
1619 5 : break;
1620 :
1621 1 : case lock_failure_t::LOCK_FAILURE_LOCK:
1622 3 : data["lock_failed"] = "lock";
1623 1 : break;
1624 :
1625 3 : case lock_failure_t::LOCK_FAILURE_UNLOCKING:
1626 9 : data["lock_failed"] = "unlocking";
1627 3 : break;
1628 :
1629 : }
1630 :
1631 9 : std::string result;
1632 176 : for(auto & it : data)
1633 : {
1634 167 : result += it.first;
1635 167 : result += '=';
1636 : // make sure the value does not include any '|'
1637 668 : result += snapdev::string_replace_many(it.second, {{"|", "%7C"}});
1638 167 : result += '|';
1639 : }
1640 9 : result.pop_back();
1641 :
1642 18 : return result;
1643 176 : }
1644 :
1645 :
1646 : /** \brief Unserialize a ticket string back to a ticket object.
1647 : *
1648 : * This function unserialize a string that was generated using the
1649 : * serialize() function.
1650 : *
1651 : * Note that unknown fields are ignored and none of the fields are
1652 : * considered mandatory. Actually the function generates no errors.
1653 : * This means it should be forward compatible.
1654 : *
1655 : * The data gets unserialized in `this` object.
1656 : *
1657 : * \param[in] data The serialized data.
1658 : */
1659 7 : void ticket::unserialize(std::string const & data)
1660 : {
1661 7 : std::vector<std::string> vars;
1662 21 : snapdev::NOT_USED(snapdev::tokenize_string(vars, data, "|"));
1663 136 : for(auto const & d : vars)
1664 : {
1665 129 : std::string::size_type const pos(d.find('='));
1666 129 : std::string const name(d.substr(0, pos));
1667 129 : std::string const value(d.substr(pos + 1));
1668 129 : switch(name[0])
1669 : {
1670 14 : case 'a':
1671 14 : if(name == "added_ticket")
1672 : {
1673 7 : f_added_ticket = f_added_ticket || value == "true";
1674 : }
1675 7 : else if(name == "added_ticket_quorum")
1676 : {
1677 7 : f_added_ticket_quorum = f_added_ticket_quorum || value == "true";
1678 : }
1679 : //else if(name == "alive_timeout") -- we do not transfer this one (not required, and could actually cause problems)
1680 : //{
1681 : // f_alive_timeout = cluck::timeout_t(value);
1682 : //}
1683 14 : break;
1684 :
1685 7 : case 'e':
1686 7 : if(name == "entering_key")
1687 : {
1688 : #ifdef _DEBUG
1689 7 : if(f_entering_key != value)
1690 : {
1691 : // LCOV_EXCL_START
1692 : throw cluck::logic_error(
1693 : "ticket::unserialize() not unserializing entering key \""
1694 : + value
1695 : + "\" over itself \""
1696 : + f_entering_key
1697 : + "\" (entering key mismatch).");
1698 : // LCOV_EXCL_STOP
1699 : }
1700 : #endif
1701 7 : f_entering_key = value;
1702 : }
1703 7 : break;
1704 :
1705 7 : case 'g':
1706 7 : if(name == "get_max_ticket")
1707 : {
1708 7 : f_get_max_ticket = f_get_max_ticket || value == "true";
1709 : }
1710 7 : break;
1711 :
1712 28 : case 'l':
1713 28 : if(name == "lock_duration")
1714 : {
1715 7 : f_lock_duration = cluck::timeout_t(value);
1716 : }
1717 21 : else if(name == "locked")
1718 : {
1719 7 : f_locked = f_locked || value == "true";
1720 : }
1721 14 : else if(name == "lock_timeout_date")
1722 : {
1723 : // the time may be larger because of an UNLOCK so we keep
1724 : // the largest value
1725 : //
1726 7 : cluck::timeout_t const timeout_date(value);
1727 7 : if(timeout_date > f_lock_timeout_date)
1728 : {
1729 1 : f_lock_timeout_date = timeout_date;
1730 : }
1731 : }
1732 7 : else if(name == "lock_failed")
1733 : {
1734 : // in this case, we avoid reducing the error level
1735 : //
1736 7 : if(value == "unlocking")
1737 : {
1738 3 : f_lock_failed = lock_failure_t::LOCK_FAILURE_UNLOCKING;
1739 : }
1740 4 : else if(value == "lock" && f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
1741 : {
1742 1 : f_lock_failed = lock_failure_t::LOCK_FAILURE_LOCK;
1743 : }
1744 : }
1745 28 : break;
1746 :
1747 28 : case 'o':
1748 28 : if(name == "object_name")
1749 : {
1750 : #ifdef _DEBUG
1751 7 : if(f_object_name != value)
1752 : {
1753 : // LCOV_EXCL_START
1754 : throw cluck::logic_error(
1755 : "ticket::unserialize() not unserializing object name \""
1756 : + value
1757 : + "\" over itself \""
1758 : + f_object_name
1759 : + "\" (object name mismatch).");
1760 : // LCOV_EXCL_STOP
1761 : }
1762 : #endif
1763 7 : f_object_name = value;
1764 : }
1765 21 : else if(name == "obtention_timeout")
1766 : {
1767 7 : f_obtention_timeout = cluck::timeout_t(value);
1768 : }
1769 14 : else if(name == "owner")
1770 : {
1771 7 : f_owner = value;
1772 : }
1773 7 : else if(name == "our_ticket")
1774 : {
1775 7 : std::int64_t v;
1776 7 : advgetopt::validator_integer::convert_string(value, v);
1777 7 : f_our_ticket = v;
1778 : }
1779 28 : break;
1780 :
1781 17 : case 's':
1782 17 : if(name == "server_name")
1783 : {
1784 7 : f_server_name = value;
1785 : }
1786 10 : else if(name == "service_name")
1787 : {
1788 7 : f_service_name = value;
1789 : }
1790 3 : else if(name == "serial")
1791 : {
1792 3 : std::int64_t v;
1793 3 : advgetopt::validator_integer::convert_string(value, v);
1794 3 : f_serial = v;
1795 : }
1796 17 : break;
1797 :
1798 21 : case 't':
1799 21 : if(name == "tag")
1800 : {
1801 7 : std::int64_t v;
1802 7 : advgetopt::validator_integer::convert_string(value, v);
1803 7 : f_tag = v;
1804 : }
1805 14 : else if(name == "ticket_key")
1806 : {
1807 7 : f_ticket_key = value;
1808 : }
1809 7 : else if(name == "ticket_ready")
1810 : {
1811 7 : f_ticket_ready = f_ticket_ready || value == "true";
1812 : }
1813 21 : break;
1814 :
1815 7 : case 'u':
1816 7 : if(name == "unlock_duration")
1817 : {
1818 7 : f_unlock_duration = cluck::timeout_t(value);
1819 : }
1820 7 : break;
1821 :
1822 : }
1823 129 : }
1824 14 : }
1825 :
1826 :
1827 :
1828 : } // namespace cluck_daemon
1829 : // vim: ts=4 sw=4 et
|