Line data Source code
1 : // Copyright (c) 2016-2024 Made to Order Software Corp. All Rights Reserved
2 : //
3 : // https://snapwebsites.org/project/cluck
4 : // contact@m2osw.com
5 : //
6 : // This program is free software: you can redistribute it and/or modify
7 : // it under the terms of the GNU General Public License as published by
8 : // the Free Software Foundation, either version 3 of the License, or
9 : // (at your option) any later version.
10 : //
11 : // This program is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 : //
16 : // You should have received a copy of the GNU General Public License
17 : // along with this program. If not, see <https://www.gnu.org/licenses/>.
18 :
19 :
20 : // self
21 : //
22 : #include "ticket.h"
23 :
24 : #include "cluckd.h"
25 :
26 :
27 : // cluck
28 : //
29 : #include <cluck/exception.h>
30 : #include <cluck/names.h>
31 :
32 :
33 : // advgetopt
34 : //
35 : #include <advgetopt/validator_integer.h>
36 :
37 :
38 : // snapdev
39 : //
40 : #include <snapdev/hexadecimal_string.h>
41 : #include <snapdev/string_replace_many.h>
42 : #include <snapdev/tokenize_string.h>
43 :
44 :
45 : // snaplogger
46 : //
47 : #include <snaplogger/message.h>
48 :
49 :
50 : // last include
51 : //
52 : #include <snapdev/poison.h>
53 :
54 :
55 :
56 : namespace cluck_daemon
57 : {
58 :
59 :
60 :
61 : /** \class ticket
62 : * \brief Handle the ticket messages.
63 : *
64 : * \section introduction Introduction
65 : *
66 : * This class implements Leslie Lamport's Bakery Algorithm (1974) lock
67 : * mechanism (a critical section that we can get between any number
68 : * of threads, processes, computers.) Details of this algorithm can
69 : * be found here:
70 : *
71 : * http://en.wikipedia.org/wiki/Lamport's_bakery_algorithm
72 : *
73 : * The algorithm requires:
74 : *
75 : * \li A unique name for each computer (server_name)
76 : * \li A unique number for the process attempting the lock
77 : * (see gettid(2) manual)
78 : * \li A user supplied object name (the name of the lock)
79 : * \li A ticket number (use the largest existing ticket number + 1)
80 : *
81 : * We also include a timeout on any one lock so we can give up on the
82 : * lock if it cannot be obtained within a reasonable amount
83 : * of time. The timeout is specified as an absolute time in the
84 : * future (now + X seconds.) The timeout is given in seconds (a
85 : * standard time_t value).
86 : *
87 : * This class sends various messages to manage the locks.
88 : *
89 : *
90 : * \section bakery_algorithm The Bakery Algorithm Explained
91 : *
92 : * The bakery algorithm is based on the basic idea that a large number
93 : * of customers go to one bakery to buy bread. In order to make sure
94 : * they all are served in the order they come in, they are given a ticket
95 : * with a number. The ticket numbers increase by one for each new customer.
96 : * The person still in line with the smallest ticket number is served next.
97 : * Once served, the ticket is destroyed.
98 : *
99 : * \note
100 : * The ticket numbers can restart at one whenever the queue of customers
101 : * goes empty. Otherwise it only increases. From our usage in Snap, it is
102 : * really rare that the ticket numbers would not quickly be reset,
103 : * especially because we have such numbers on a per object_name basis
104 : * and thus many times the number will actually be one.
105 : *
106 : * On a computer without any synchronization mechanism available (our case)
107 : * two customers may enter the bakery simultaneously (especially since we
108 : * are working with processes that may run on different computers.) This
109 : * means two customers may end up with the exact same ticket number and
110 : * there are no real means to avoid that problem. However, each customer
111 : * is also assigned two unique numbers on creation: its "host number"
112 : * (its server name, we use a string to simplify things) and its process
113 : * number (we actually use gettid() so each thread gets a unique number
114 : * which is an equivalent to a pid_t number for every single thread.)
115 : * These two numbers are used to further order processes and make sure
116 : * we can tell who will get the lock first.
117 : *
118 : * So, the basic bakery algorithm looks like this in C++. This algorithm
119 : * expects memory to be guarded (shared or "volatile"; always visible by
120 : * all threads.) In our case, we send the data over the network to
121 : * all the snaplock processes. This is definitely guarded.
122 : *
123 : * \code
124 : * // declaration and initial values of global variables
125 : * namespace {
126 : * int num_threads = 100;
127 : * std::vector<bool> entering;
128 : * std::vector<uint32_t> tickets;
129 : * }
130 : *
131 : * // initialize the vectors (all flags false, all tickets 0)
132 : * void init()
133 : * {
134 : * entering.resize(num_threads);
135 : * tickets.resize(num_threads);
136 : * }
137 : *
138 : * // i is a thread "number" (0 to 99)
139 : * void lock(int i)
140 : * {
141 : * // get the next ticket
142 : * entering[i] = true;
143 : * uint32_t my_ticket(0);
144 : * for(int j(0); j < num_threads; ++j)
145 : * {
146 : * if(tickets[j] > my_ticket)
147 : * {
148 : * my_ticket = tickets[j];
149 : * }
150 : * }
151 : * tickets[i] = my_ticket + 1; // add 1, we want the next ticket
152 : * entering[i] = false;
153 : *
154 : * for(int j(0); j < num_threads; ++j)
155 : * {
156 : * // wait until thread j receives its ticket number
157 : * while(entering[j])
158 : * {
159 : * sleep();
160 : * }
161 : *
162 : * // there are several cases:
163 : * //
164 : * // (1) tickets that are 0 are not assigned so we can just go
165 : * // through
166 : * //
167 : * // (2) smaller tickets win over us (have a higher priority,)
168 : * // so if there is another thread with a smaller ticket
169 : * // sleep a little and try again; that ticket must go to
170 : * // zero to let us through that guard
171 : * //
172 : * // (3) if tickets are equal, compare the thread numbers and
173 : * // like the tickets, the smallest thread wins
174 : * //
175 : * while(tickets[j] != 0 && (tickets[j] < tickets[i] || (tickets[j] == tickets[i] && j < i)))
176 : * {
177 : * sleep();
178 : * }
179 : * }
180 : * }
181 : *
182 : * // i is the thread number
183 : * void unlock(int i)
184 : * {
185 : * // release our ticket
186 : * tickets[i] = 0;
187 : * }
188 : *
189 : * void SomeThread(int i)
190 : * {
191 : * while(true)
192 : * {
193 : * [...]
194 : * // non-critical section...
195 : * lock(i);
196 : * // The critical section code goes here...
197 : * unlock(i);
198 : * // non-critical section...
199 : * [...]
200 : * }
201 : * }
202 : * \endcode
203 : *
204 : * Note that there are two possible optimizations when actually
205 : * implementing the algorithm:
206 : *
207 : * \li You can enter (entering[i] = true), get your ticket,
208 : * exit (entering[i] = false) and then get the list of
209 : * still existing 'entering' processes. Once that list
210 : * goes empty, we do not need to test the entering[j]
211 : * anymore because any further entering[j] will be about
212 : * processes with a larger ticket number and thus
213 : * processes that will appear later in the list of tickets.
214 : *
215 : * \li By sorting (and they are) our ticket requests by ticket,
216 : * server name, and process pid, we do not have to search
217 : * for the smallest ticket. The smallest ticket is automatically
218 : * first in that list! So all we have to do is: if not first,
219 : * sleep() some more.
220 : *
221 : * \section implementation Our implementation in cluck
222 : *
223 : * Locks are given a name by our users. This is used to lock just
224 : * one small thing for any amount of time as required by your
225 : * implementation.
226 : *
227 : * That name is used as an index to the f_tickets object in the
228 : * snaplock class. Within such a ticket, you have one entry per
229 : * process trying to obtain that lock.
230 : *
231 : * For example, the users plugin generates a unique user identifier
232 : * which is a number starting at 1. When a process needs to do this,
233 : * we need a lock to prevent any other process from doing it at the
234 : * same time. We also use a QUORUM consistency in Cassandra to
235 : * load/increment/save the user number.
236 : *
237 : * In this example, all we need to lock is an object named something
238 : * like "user number". Actually, if the number is specific to a
239 : * website, we can use the website URI. In this case, we can use a
240 : * name like this: "http://www.example.com/user#number". This says
241 : * we are managing an atomic "#number" at address
242 : * "http://www.example.com/user". This also means we do not need
243 : * to block anyone if the other people need to lock a completely
244 : * different field (so process A can lock the user unique number
245 : * while process B could lock an invoice unique number.)
246 : *
247 : * As a result, the locking mechanism manages the locks on a per
248 : * lock name basis. In other words, if two processes request
249 : * a lock simultaneously and their object_name parameters are not equal,
250 : * they both get their lock instantaneously (at least very quickly.)
251 : *
252 : * \subsection message_sequence Message Sequence Chart
253 : *
254 : * \msc
255 : * Client,SnapLockA,SnapLockB,SnapLockC;
256 : *
257 : * Client->SnapLockA [label="LOCK"];
258 : *
259 : * SnapLockA->SnapLockA [label="LOCK_ENTERING"];
260 : * SnapLockA->SnapLockB [label="LOCK_ENTERING"];
261 : * SnapLockA->SnapLockC [label="LOCK_ENTERING"];
262 : *
263 : * SnapLockA->SnapLockA [label="LOCK_ENTERED"];
264 : * SnapLockB->SnapLockA [label="LOCK_ENTERED"];
265 : * SnapLockC->SnapLockA [label="LOCK_ENTERED"];
266 : *
267 : * SnapLockA->SnapLockA [label="GET_MAX_TICKET"];
268 : * SnapLockA->SnapLockB [label="GET_MAX_TICKET"];
269 : * SnapLockA->SnapLockC [label="GET_MAX_TICKET"];
270 : *
271 : * SnapLockA->SnapLockA [label="MAX_TICKET"];
272 : * SnapLockB->SnapLockA [label="MAX_TICKET"];
273 : * SnapLockC->SnapLockA [label="MAX_TICKET"];
274 : *
275 : * SnapLockA->SnapLockA [label="ADD_TICKET"];
276 : * SnapLockA->SnapLockB [label="ADD_TICKET"];
277 : * SnapLockA->SnapLockC [label="ADD_TICKET"];
278 : *
279 : * SnapLockA->SnapLockA [label="TICKET_ADDED"];
280 : * SnapLockB->SnapLockA [label="TICKET_ADDED"];
281 : * SnapLockC->SnapLockA [label="TICKET_ADDED"];
282 : *
283 : * SnapLockA->SnapLockA [label="LOCK_EXITING"];
284 : * SnapLockA->SnapLockB [label="LOCK_EXITING"];
285 : * SnapLockA->SnapLockC [label="LOCK_EXITING"];
286 : *
287 : * SnapLockA->Client [label="LOCKED"];
288 : * \endmsc
289 : *
290 : *
291 : * \section drawback Any drawback?
292 : *
293 : * \subsection timeouts Timeouts
294 : *
295 : * All our locks come with a timeout. The default is defined in
296 : * CLUCK_LOCK_DURATION_DEFAULT_TIMEOUT, which is 5 seconds.
297 : * (5 seconds, which for a front end hit to a website is very
298 : * long already!) If that timeout is too short (i.e. a backend
299 : * does heavy lifting work on the data), then you can make it
300 : * larger. Our backends are given 4h by default.
301 : *
302 : * \subsection deadlock Deadlock
303 : *
304 : * Like with any lock, if you have two processes that each try to obtain
305 : * the same two locks, but in opposite order, you get a deadlock:
306 : *
307 : * P1 tries to get L1, and gets it;
308 : *
309 : * P2 tries to get L2, and gets it;
310 : *
311 : * P1 tries to get L2, and has to wait on P2;
312 : *
313 : * P2 tries to get L1, and creates a deadlock.
314 : *
315 : * The deadlock itself will be resolved once a lock times out,
316 : * but P2 will "never" have a chance to work on L1 if that sequence
317 : * always happens.
318 : */
319 :
320 :
321 :
322 :
323 : /** \brief Initialize a ticket object.
324 : *
325 : * The constructor saves the parameters of the lock request in the new
326 : * ticket. The ticket key itself is only generated later, by add_ticket().
327 : *
328 : * Once the entering object was acknowledged by QUORUM cluck daemon
329 : * instances (i.e. one other computer since we have up to 3 leaders,)
330 : * we can then create the ticket.
331 : *
332 : * \note
333 : * We create a key from the server name, client PID, and object
334 : * name for the entering process to run. This key is unique
335 : * among all computers assuming (1) your client PID is unique and
336 : * (2) your servers all have unique names and both of these conditions
337 : * are always true (i.e. we do not allow a cluckd to join a cluster if
338 : * its name was already registered).
339 : *
340 : * \note
341 : * If you use threads, or are likely to use threads, make sure to
342 : * use the gettid() function instead of getpid() to define a
343 : * unique client PID. (Note: this is done in the cluck library.)
344 : *
345 : * \param[in] c A pointer to the cluckd object.
346 : * \param[in] messenger A pointer to the messenger.
347 : * \param[in] object_name The name of the object getting locked.
348 : * \param[in] tag The tag of the request; it is added to the messages we send.
349 : * \param[in] entering_key The key (ticket) used to enter the bakery.
350 : * \param[in] obtention_timeout The time when the lock attempt times out, in seconds.
351 : * \param[in] lock_duration The lock duration; clamped between CLUCK_UNLOCK_MINIMUM_TIMEOUT and CLUCK_MAXIMUM_TIMEOUT.
352 : * \param[in] server_name The name of the server the LOCKED message is sent to.
353 : * \param[in] service_name The service waiting for the LOCKED message.
354 : */
355 133 : ticket::ticket(
356 : cluckd * c
357 : , messenger::pointer_t messenger
358 : , std::string const & object_name
359 : , ed::dispatcher_match::tag_t tag
360 : , std::string const & entering_key
361 : , cluck::timeout_t obtention_timeout
362 : , cluck::timeout_t lock_duration
363 : , std::string const & server_name
364 133 : , std::string const & service_name)
365 133 : : f_cluckd(c)
366 133 : , f_messenger(messenger)
367 133 : , f_object_name(object_name)
368 133 : , f_tag(tag)
369 133 : , f_obtention_timeout(obtention_timeout)
370 133 : , f_lock_duration(std::clamp(
371 : lock_duration
372 : , cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT
373 : , cluck::CLUCK_MAXIMUM_TIMEOUT))
374 133 : , f_server_name(server_name)
375 133 : , f_service_name(service_name)
376 133 : , f_owner(f_cluckd->get_server_name())
377 532 : , f_entering_key(entering_key)
378 : {
379 133 : set_unlock_duration(f_lock_duration);
380 :
381 : // TODO: see how to not say "attempting a lock" when we are deserializing
382 : // an existing lock.
383 532 : SNAP_LOG_TRACE
384 : << "Attempting to lock \""
385 133 : << f_object_name
386 133 : << "\" ("
387 133 : << f_tag
388 : << ") on \""
389 133 : << f_entering_key
390 : << "\" for \""
391 133 : << f_server_name
392 : << '/'
393 133 : << f_service_name
394 : << "\" (timeout: "
395 : << f_obtention_timeout
396 : << ")."
397 : << SNAP_LOG_SEND;
398 133 : }
399 :
400 :
400 : /** \brief Send a message to the other two leaders.
401 : *
402 : * The \p msg is "broadcast" to the other two leaders.
403 : *
404 : * This is a safe guard so if one of our three leaders fails, we have
405 : * a backup of the lock status.
406 : *
407 : * The locking system also works if there are only two or even just one
408 : * computer. When get_leader_a() returns a null pointer, nothing is sent
409 : * and the function returns the result of one_leader() instead.
410 : *
411 : * \param[in] msg The message to send; its service, object name, and tag get set here.
412 : *
413 : * \return true if the message was forwarded at least once, false otherwise.
414 : */
415 803 : bool ticket::send_message_to_leaders(ed::message & msg)
416 : {
417 : // finish the message initialization (service, object name, and tag)
418 : //
419 803 : msg.set_service(cluck::g_name_cluck_service_name);
420 803 : msg.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
421 803 : msg.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
422 :
423 803 : computer::pointer_t leader(f_cluckd->get_leader_a());
424 803 : if(leader != nullptr)
425 : {
426 : // there are at least two leaders; count successful sends
427 : //
428 784 : int count(0);
429 784 : msg.set_server(leader->get_name());
430 784 : if(f_messenger->send_message(msg))
431 : {
432 784 : ++count;
433 : }
434 :
435 : // check for a third leader
436 : //
437 784 : leader = f_cluckd->get_leader_b();
438 784 : if(leader != nullptr)
439 : {
440 70 : msg.set_server(leader->get_name());
441 70 : if(f_messenger->send_message(msg))
442 : {
443 70 : ++count;
444 : }
445 : }
446 :
447 : // we have to wait for at least one reply if we were able to send
448 : // at least one message
449 : //
450 784 : return count > 0;
451 : }
452 :
453 : // there is only one leader (ourselves)
454 : //
455 : // call the one_leader() function to verify that this is indeed correct
456 : // otherwise we would mess up the algorithm
457 : //
458 19 : return one_leader();
459 803 : }
461 :
462 :
462 : /** \brief Enter the mode that lets us retrieve our ticket number.
463 : *
464 : * In order to make sure we can get the current largest ticket number
465 : * in a unique enough way, cluck has to enter the lock loop. This
466 : * process starts by sending a `LOCK_ENTERING` message to all the
467 : * other cluckd leaders. If one_leader() is true, entered() is called at once.
468 : */
469 119 : void ticket::entering()
470 : {
471 : // TODO implement the special case when there is only 1 leader
472 : // (on the other hand, that should be rather rare)
473 : //computer::pointer_t leader(f_cluckd->get_leader_a());
474 : //if(leader == nullptr)
475 : //{
476 : // -- do the necessary to obtain the lock --
477 : // return;
478 : //}
479 :
480 119 : ed::message entering_message;
481 119 : entering_message.set_command(cluck::g_name_cluck_cmd_lock_entering);
482 119 : entering_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
483 119 : entering_message.add_parameter(cluck::g_name_cluck_param_timeout, f_obtention_timeout);
484 119 : entering_message.add_parameter(cluck::g_name_cluck_param_duration, f_lock_duration);
485 119 : if(f_lock_duration != f_unlock_duration)
486 : {
487 4 : entering_message.add_parameter(cluck::g_name_cluck_param_unlock_duration, f_unlock_duration);
488 : }
489 119 : entering_message.add_parameter(cluck::g_name_cluck_param_source, f_server_name + "/" + f_service_name);
490 119 : entering_message.add_parameter(cluck::g_name_cluck_param_serial, f_serial);
491 119 : if(send_message_to_leaders(entering_message))
492 : {
493 119 : if(one_leader())
494 : {
495 : // there are no other leaders so no reply will come back;
496 : // move to the next step ourselves so the algorithm progresses
497 3 : entered();
498 : }
499 : }
500 238 : }
502 :
503 :
504 : /** \brief Tell this entering that we received a LOCK_ENTERED message.
505 : *
506 : * This function gets called each time we receive a `LOCK_ENTERED`
507 : * message with this ticket entering key.
508 : *
509 : * Since we have 1 to 3 leaders, the quorum and thus consensus is reached
510 : * as soon as we receive one `LOCK_ENTERED` message. So as a result this
511 : * function sends `GET_MAX_TICKET` the first time it gets called. The
512 : * `GET_MAX_TICKET` message allows us to determine the ticket number for
513 : * the concerned object.
514 : *
515 : * \note
516 : * The msg_lockentered() function first checked whether the
517 : * `LOCK_ENTERED` message had anything to do with this ticket.
518 : * If not, the message was just ignored.
519 : */
520 126 : void ticket::entered()
521 : {
522 : // only the first call sends GET_MAX_TICKET, later calls are ignored
523 : //
524 126 : if(!f_get_max_ticket)
525 : {
526 : // with 2 or 3 leaders, quorum is obtained with one
527 : // single acknowledgement
528 : //
529 116 : f_get_max_ticket = true;
530 :
531 : // calculate this instance max. ticket number
532 : //
533 116 : f_our_ticket = f_cluckd->get_last_ticket(f_object_name);
534 :
535 116 : ed::message get_max_ticket_message;
536 116 : get_max_ticket_message.set_command(cluck::g_name_cluck_cmd_get_max_ticket);
537 116 : get_max_ticket_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
538 116 : if(send_message_to_leaders(get_max_ticket_message))
539 : {
540 116 : if(one_leader())
541 : {
542 : // there are no other leaders, make sure the algorithm progresses
543 : // by using our own last ticket number as the maximum
544 3 : max_ticket(f_our_ticket);
545 : }
546 : }
547 116 : }
548 126 : }
549 :
550 :
551 : /** \brief Called whenever a MAX_TICKET is received.
552 : *
553 : * This function registers the largest ticket number. Once we reach
554 : * QUORUM, then we have the largest number and we can move on to the
555 : * next stage, which is to add the ticket.
556 : *
557 : * \note
558 : * With 1 to 3 leaders the first reply is a quorum; once the ticket was added, later replies are ignored.
559 : *
560 : * \param[in] new_max_ticket Another possibly larger ticket.
561 : *
562 : * \exception cluck::out_of_range
563 : * Raised when incrementing the ticket number wraps around to NO_TICKET.
564 : */
565 128 : void ticket::max_ticket(ticket_id_t new_max_ticket)
566 : {
567 128 : if(!f_added_ticket)
568 : {
569 117 : if(new_max_ticket > f_our_ticket)
570 : {
571 110 : f_our_ticket = new_max_ticket;
572 : }
573 :
574 117 : ++f_our_ticket;
575 117 : if(f_our_ticket == NO_TICKET)
576 : {
577 : // f_our_ticket is a 32 bit number, this can happen only if you
578 : // created over 4 billion locks back to back--i.e. created a new
579 : // one before the previous one was released; or put in a different
580 : // way: the list of tickets with that "object name" never went
581 : // back to being empty for that long...
582 : //
583 2 : throw cluck::out_of_range("ticket::max_ticket() tried to generate the next ticket and got a wrapping around number.");
584 : }
585 :
586 115 : add_ticket();
587 : }
588 126 : }
589 :
590 :
590 : /** \brief Send the ADD_TICKET message.
591 : *
592 : * This function generates the ticket key, registers the ticket with the
593 : * cluckd object, and sends the ADD_TICKET message to the other leaders.
594 : *
595 : * \exception logic_error
596 : * This exception is raised if the function gets called twice or more.
597 : * Since it is considered an internal function, it should not be an issue.
598 : */
599 115 : void ticket::add_ticket()
600 : {
601 : // we expect exactly one call to this function
602 : //
603 115 : if(f_added_ticket)
604 : {
605 : throw cluck::logic_error("ticket::add_ticket() called more than once."); // LCOV_EXCL_LINE
606 : }
607 115 : f_added_ticket = true;
608 :
609 : //
610 : // WARNING: the ticket key MUST be properly sorted by:
611 : //
612 : // ticket number
613 : // server name
614 : // client pid
615 : //
616 : // The client PID does not need to be sorted numerically, just be sorted
617 : // so one client is before the other.
618 : //
619 : // However, the ticket number MUST be numerically sorted. For this reason,
620 : // since the key is a string, we must add leading zeroes (8 hex digits).
621 : //
622 345 : f_ticket_key = snapdev::int_to_hex(f_our_ticket, false, 8)
623 345 : + '/'
624 230 : + f_entering_key;
625 :
626 115 : f_cluckd->set_ticket(f_object_name, f_ticket_key, shared_from_this());
627 :
628 115 : ed::message add_ticket_message;
629 115 : add_ticket_message.set_command(cluck::g_name_cluck_cmd_add_ticket);
630 115 : add_ticket_message.add_parameter(cluck::g_name_cluck_param_key, f_ticket_key);
631 115 : add_ticket_message.add_parameter(cluck::g_name_cluck_param_timeout, f_obtention_timeout);
632 115 : if(send_message_to_leaders(add_ticket_message))
633 : {
634 115 : if(one_leader())
635 : {
636 3 : ticket_added(f_cluckd->get_entering_tickets(f_object_name));
637 : }
638 : }
639 230 : }
641 :
642 :
643 : /** \brief Called whenever a TICKET_ADDED is received.
644 : *
645 : * This function sends a LOCK_EXITING if the ticket reached the total number
646 : * of TICKET_ADDED required to get a quorum (which is just one with 1 to 3
647 : * leaders.) Only the first call has an effect; later calls are ignored.
648 : *
649 : * The \p still_entering parameter defines the list of tickets that are
650 : * still trying to enter the same object. This is very important. It needs
651 : * to be completely drained before we can proceed and mark the ticket as
652 : * assigned.
653 : *
654 : * \param[in] still_entering The list of still entering processes
655 : */
656 116 : void ticket::ticket_added(key_map_t const & still_entering)
657 : {
658 116 : if(!f_added_ticket_quorum)
659 : {
660 : // when we have 2 or 3 leaders, quorum is obtained with one
661 : // single acknowledgement
662 : //
663 114 : f_added_ticket_quorum = true;
664 :
665 114 : f_still_entering = still_entering;
666 :
667 : // okay, the ticket was added on all cluck daemons
668 : // now we can forget about the entering flag
669 : // (equivalent to setting it to false)
670 : //
671 114 : ed::message exiting_message;
672 114 : exiting_message.set_command(cluck::g_name_cluck_cmd_lock_exiting);
673 114 : exiting_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
674 114 : snapdev::NOT_USED(send_message_to_leaders(exiting_message));
675 :
676 114 : f_cluckd->lock_exiting(exiting_message);
677 114 : }
678 116 : }
679 :
680 :
681 : /** \brief Called any time an entering flag is reset.
682 : *
683 : * This function gets called whenever an entering flag gets set
684 : * back to false (i.e. removed in our implementation).
685 : *
686 : * This function only acts once this ticket received its number
687 : * and while it is not yet ready. In that circumstance, we
688 : * are waiting for all entering flags that got created while we determined
689 : * the largest ticket number to be removed; then TICKET_READY is sent.
690 : *
691 : * \param[in] key The key of the ticket that was entered.
692 : */
693 10021 : void ticket::remove_entering(std::string const & key)
694 : {
695 10021 : if(f_added_ticket_quorum
696 5068 : && !f_ticket_ready)
697 : {
698 5067 : auto it(f_still_entering.find(key));
699 5067 : if(it != f_still_entering.end())
700 : {
701 5067 : f_still_entering.erase(it);
702 :
703 : // just like the quorum computation, we compute the
704 : // remaining list of entering tickets dynamically at
705 : // the time we check the value (i.e. drop entries that timed out)
706 : //
707 171721 : for(auto key_entering(f_still_entering.begin()); key_entering != f_still_entering.end(); )
708 : {
709 166654 : if(key_entering->second->timed_out())
710 : {
711 0 : key_entering = f_still_entering.erase(key_entering);
712 : }
713 : else
714 : {
715 166654 : ++key_entering;
716 : }
717 : }
718 :
719 : // once all removed, our ticket is ready!
720 : //
721 5067 : if(f_still_entering.empty())
722 : {
723 114 : f_ticket_ready = true;
724 :
725 : // let the other two leaders know that the ticket is ready
726 : //
727 114 : ed::message ticket_ready_message;
728 114 : ticket_ready_message.set_command(cluck::g_name_cluck_cmd_ticket_ready);
729 114 : ticket_ready_message.add_parameter(cluck::g_name_cluck_param_key, f_ticket_key);
730 114 : snapdev::NOT_USED(send_message_to_leaders(ticket_ready_message));
731 114 : }
732 : }
733 : }
734 10021 : }
735 :
736 :
737 : /** \brief Check whether this ticket can be activated and do so if so.
738 : *
739 : * This function checks whether the ticket is ready to be activated.
740 : * This means the ticket is ready, not yet locked, and did not fail. If so,
741 : * then it sends the ACTIVATE_LOCK message to the other leaders.
742 : *
743 : * This function can be called multiple times. It stops sending
744 : * the ACTIVATE_LOCK message once the ticket is locked or marked as failed.
745 : *
746 : * When one_leader() returns true, lock_activated() is called
747 : * immediately after the message was handled by send_message_to_leaders().
748 : */
749 217 : void ticket::activate_lock()
750 : {
751 217 : if(f_ticket_ready
752 115 : && !f_locked
753 114 : && f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
754 : {
755 114 : ed::message activate_lock_message;
756 114 : activate_lock_message.set_command(cluck::g_name_cluck_cmd_activate_lock);
757 114 : activate_lock_message.add_parameter(cluck::g_name_cluck_param_key, f_ticket_key);
758 114 : if(send_message_to_leaders(activate_lock_message))
759 : {
760 114 : if(one_leader())
761 : {
762 3 : lock_activated();
763 : }
764 : }
765 114 : }
766 217 : }
767 :
768 :
769 : /** \brief Mark the ticket as locked and send the LOCKED message.
770 : *
771 : * This function checks whether the ticket is ready, not yet locked, and
772 : * not marked as failed. If so, it marks the ticket as locked, computes the
773 : * timeout dates, and (if we own the ticket) sends LOCKED to the requester.
774 : *
775 : * This function can be called multiple times. It will send
776 : * the LOCKED message only once.
777 : */
778 134 : void ticket::lock_activated()
779 : {
780 134 : if(f_ticket_ready
781 134 : && !f_locked
782 114 : && f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
783 : {
784 114 : f_locked = true;
785 114 : f_lock_timeout_date = snapdev::now() + f_lock_duration;
786 114 : f_unlocked_timeout_date = f_lock_timeout_date + f_unlock_duration;
787 :
788 114 : if(f_owner == f_cluckd->get_server_name())
789 : {
790 113 : ed::message locked_message;
791 113 : locked_message.set_command(cluck::g_name_cluck_cmd_locked);
792 113 : locked_message.set_server(f_server_name);
793 113 : locked_message.set_service(f_service_name);
794 113 : locked_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
795 113 : locked_message.add_parameter(cluck::g_name_cluck_param_timeout_date, f_lock_timeout_date);
796 113 : locked_message.add_parameter(cluck::g_name_cluck_param_unlocked_date, f_unlocked_timeout_date);
797 113 : locked_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
798 113 : f_messenger->send_message(locked_message);
799 113 : }
800 : }
801 134 : }
802 :
803 :
804 : /** \brief We are done with the ticket.
805 : *
806 : * This function sends the DROP_TICKET message to get rid of a ticket
807 : * from another leader's list of tickets.
808 : *
809 : * Another leader has a list of tickets as it receives LOCK and ADD_TICKET
810 : * messages. If the lock did not fail earlier, UNLOCKED is sent to the requester.
811 : */
812 111 : void ticket::drop_ticket()
813 : {
814 222 : SNAP_LOG_TRACE
815 : << "Unlock on \""
816 111 : << f_object_name
817 111 : << "\" ("
818 111 : << f_tag
819 : << ") with key \""
820 111 : << f_entering_key
821 : << "\"."
822 : << SNAP_LOG_SEND;
823 :
824 111 : ed::message drop_ticket_message;
825 111 : drop_ticket_message.set_command(cluck::g_name_cluck_cmd_drop_ticket);
826 111 : drop_ticket_message.add_parameter(
827 : cluck::g_name_cluck_param_key
828 111 : , f_ticket_key.empty() ? f_entering_key : f_ticket_key); // no ticket key yet? use the entering key
829 111 : send_message_to_leaders(drop_ticket_message);
830 :
831 111 : if(f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
832 : {
833 109 : f_lock_failed = lock_failure_t::LOCK_FAILURE_UNLOCKING;
834 :
835 : //if(f_owner == f_cluckd->get_server_name()) -- this can happen with any leader so we have to send the UNLOCKED
836 : // the other leaders won't call this function, they receive DROP_TICKET
837 : // instead and as mentioned in the TODO below, we should get a QUORUM
838 : // instead...
839 : {
840 : // we can immediately say it got unlocked...
841 : //
842 : // TODO: this is true ONLY if you lock the same object no more than
843 : // once within a session, which may well be false (it is
844 : // true for what I can remember of Snap!, but long term this
845 : // is not safe.) Like the LOCK, we need a quorum and then
846 : // send the UNLOCK... At this point, I'm not too sure how
847 : // we implement such because the drop_ticket function ends
848 : // up deleting the ticket from memory and thus no counting
849 : // can happen after that... (i.e. we need a special case
850 : // of the receiver for the UNLOCK, argh!)
851 : //
852 109 : ed::message unlocked_message;
853 109 : unlocked_message.set_command(cluck::g_name_cluck_cmd_unlocked);
854 109 : unlocked_message.set_server(f_server_name);
855 109 : unlocked_message.set_service(f_service_name);
856 109 : unlocked_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
857 109 : unlocked_message.add_parameter(cluck::g_name_cluck_param_unlocked_date, snapdev::now());
858 109 : unlocked_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
859 109 : f_messenger->send_message(unlocked_message);
860 109 : }
861 : }
862 222 : }
863 :
864 :
865 : /** \brief Let the service that wanted this lock know that it failed.
866 : *
867 : * This function sends a reply to the server that requested the lock to
868 : * let it know that it somehow failed.
869 : *
870 : * The function replies with a LOCK_FAILED when the lock was never
871 : * obtained. In this case the origin server cannot access the resources.
872 : *
873 : * The function replies with UNLOCKING when the lock timed out. The
874 : * server is expected to send an UNLOCK reply to acknowledge the
875 : * failure and fully release the lock. The lock will remain in place
876 : * until that acknowledgement is received or an amount of time
877 : * equal to the lock duration by default with a minimum of 1 minute.
878 : *
879 : * The UNLOCKING acknowledgement timeout is set to the same amount as
880 : * the LOCK duration if the `unlock_duration` parameter is not specified
881 : * in the LOCK message. When the `unlock_duration` parameter is specified,
882 : * then that amount is used instead.
883 : *
884 : * \note
885 : * The function may get called multiple times. The failure message
886 : * is sent only on the first call.
887 : *
888 : * \note
889 : * If the ticket was created on another cluck daemon (not the one that
890 : * received the LOCK event in the first place) then this ticket is
891 : * not marked as being owned by this cluck daemon and as a result this
892 : * function only marks the ticket as failed.
893 : *
894 : * \param[in] reason A reason for the failure (i.e. "timed out")
895 : */
896 14 : void ticket::lock_failed(std::string const & reason)
897 : {
898 : enum send_msg_t
899 : {
900 : SEND_MSG_NONE,
901 : SEND_MSG_UNLOCKING,
902 : SEND_MSG_UNLOCKED,
903 : SEND_MSG_FAILED,
904 : };
905 :
906 14 : send_msg_t send(SEND_MSG_NONE);
907 :
908 14 : switch(f_lock_failed)
909 : {
910 7 : case lock_failure_t::LOCK_FAILURE_NONE:
911 : // send that message at most once
912 : //
913 7 : f_lock_failed = lock_failure_t::LOCK_FAILURE_LOCK;
914 :
915 7 : if(f_locked)
916 : {
917 : // now we have to extend the lock timeout to make sure that
918 : // the UNLOCKING has a chance to be acknowledged
919 : //
920 5 : f_lock_timeout_date += f_unlock_duration;
921 5 : if(timed_out())
922 : {
923 : // this case is logical here, but I don't think it can
924 : // happen because the f_locked is true and thus the only
925 : // value we can use is f_lock_timeout_date and we just
926 : // increased that value by at least 3 seconds
927 : //
928 : send = SEND_MSG_UNLOCKED; // LCOV_EXCL_LINE
929 : }
930 : else
931 : {
932 5 : send = SEND_MSG_UNLOCKING;
933 : }
934 : }
935 : else
936 : {
937 2 : send = SEND_MSG_FAILED;
938 : }
939 7 : break;
940 :
941 5 : case lock_failure_t::LOCK_FAILURE_LOCK:
942 5 : f_lock_failed = lock_failure_t::LOCK_FAILURE_UNLOCKING;
943 :
944 5 : if(f_locked)
945 : {
946 3 : send = SEND_MSG_UNLOCKED;
947 : }
948 5 : break;
949 :
950 2 : case lock_failure_t::LOCK_FAILURE_UNLOCKING:
951 : // we already sent all the possible messages
952 2 : break;
953 :
954 : }
955 :
956 : // we want the f_lock_failed and f_lock_timeout_date set before returning
957 : //
958 14 : if(f_owner != f_cluckd->get_server_name())
959 : {
960 2 : return;
961 : }
962 :
963 12 : switch(send)
964 : {
965 3 : case SEND_MSG_NONE:
966 : // don't send another message
967 3 : break;
968 :
969 5 : case SEND_MSG_UNLOCKING:
970 : {
971 : // if we were locked and reach here, then the lock
972 : // timed out while locked but the unlock timeout was
973 : // not yet reached so just send an UNLOCKING message
974 : //
975 10 : SNAP_LOG_IMPORTANT
976 : << "Lock on \""
977 5 : << f_object_name
978 5 : << "\" ("
979 5 : << f_tag
980 : << ") with key \""
981 5 : << f_entering_key
982 : << "\" timed out its lock allowed time."
983 : << SNAP_LOG_SEND;
984 :
985 5 : ed::message lock_failed_message;
986 5 : lock_failed_message.set_command(cluck::g_name_cluck_cmd_unlocking);
987 5 : lock_failed_message.set_server(f_server_name);
988 5 : lock_failed_message.set_service(f_service_name);
989 5 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
990 5 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
991 5 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_error, cluck::g_name_cluck_value_timedout);
992 5 : f_messenger->send_message(lock_failed_message);
993 5 : }
994 : break;
995 :
996 3 : case SEND_MSG_UNLOCKED:
997 : {
998 : // if we were locked and/or unlocking and we reach here,
999 : // then the lock completely timed out and we immediately
1000 : // completely unlock with an UNLOCKED message
1001 : //
1002 : // IMPORTANT: that means the service should stop using the
1003 : // shared resources but there is absoltely no
1004 : // guarantee about that; however, this situation
1005 : // should only occur when a service somehow does
1006 : // not properly UNLOCK its lock
1007 : //
1008 6 : SNAP_LOG_IMPORTANT
1009 : << "Lock on \""
1010 3 : << f_object_name
1011 3 : << "\" ("
1012 3 : << f_tag
1013 : << ") with key \""
1014 3 : << f_entering_key
1015 : << "\" timed out its unlocking allowed time."
1016 : << SNAP_LOG_SEND;
1017 :
1018 3 : ed::message lock_failed_message;
1019 3 : lock_failed_message.set_command(cluck::g_name_cluck_cmd_unlocked);
1020 3 : lock_failed_message.set_server(f_server_name);
1021 3 : lock_failed_message.set_service(f_service_name);
1022 3 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
1023 3 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
1024 3 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_error, cluck::g_name_cluck_value_timedout);
1025 3 : f_messenger->send_message(lock_failed_message);
1026 3 : }
1027 : break;
1028 :
1029 1 : case SEND_MSG_FAILED:
1030 : {
1031 2 : SNAP_LOG_IMPORTANT
1032 : << "Lock on \""
1033 1 : << f_object_name
1034 1 : << "\" ("
1035 1 : << f_tag
1036 : << ") with key \""
1037 1 : << f_entering_key
1038 : << "\" failed."
1039 : << SNAP_LOG_SEND;
1040 :
1041 1 : ed::message lock_failed_message;
1042 1 : lock_failed_message.set_command(cluck::g_name_cluck_cmd_lock_failed);
1043 1 : lock_failed_message.set_server(f_server_name);
1044 1 : lock_failed_message.set_service(f_service_name);
1045 1 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_object_name, f_object_name);
1046 1 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_tag, f_tag);
1047 1 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_key, f_entering_key);
1048 1 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_error, cluck::g_name_cluck_value_failed);
1049 2 : lock_failed_message.add_parameter(cluck::g_name_cluck_param_description,
1050 : "ticket failed before or after the lock was obtained ("
1051 2 : + reason
1052 3 : + ")");
1053 1 : f_messenger->send_message(lock_failed_message);
1054 1 : }
1055 : break;
1056 :
1057 : }
1058 : }
1059 :
1060 :
1061 : /** \brief Define whether this ticket is the owner of that lock.
1062 : *
1063 : * Whenever comes time to send the LOCK, UNLOCK, or LOCK_FAILED messages,
1064 : * only the owner is expected to send it. This flag tells us who the
1065 : * owner is and thus who is responsible for sending that message.
1066 : *
1067 : * \todo
1068 : * The ownership has to travel to others whenever a leader disappears.
1069 : *
1070 : * \param[in] owner The name of this ticket owner.
1071 : */
1072 3 : void ticket::set_owner(std::string const & owner)
1073 : {
1074 3 : f_owner = owner;
1075 3 : }
1076 :
1077 :
1078 : /** \brief Return the name of this ticket's owner.
1079 : *
1080 : * This function returns the name of the owner of this ticket. When a
1081 : * leader dies out, its name stick around until a new leader gets
1082 : * assigned to it.
1083 : *
1084 : * The owner is actually the name of the sending server. So if leader 1
1085 : * is named "alfred" and it sends a ticket message (i.e. LOCK_ENTERING),
1086 : * then the ticket owner parameter will be set "alfred".
1087 : *
1088 : * The owner name is set when you create a ticket or by unserializing
1089 : * a ticket dump. Serialization is used to share tickets between
1090 : * cluck daemon when we lose a leader and a new computer becomes a
1091 : * new leader.
1092 : *
1093 : * \return The name of this ticket owner.
1094 : */
1095 9 : std::string const & ticket::get_owner() const
1096 : {
1097 9 : return f_owner;
1098 : }
1099 :
1100 :
1101 : /** \brief Retrieve the client process identifier.
1102 : *
1103 : * This function splits the entering key and return the process identifier.
1104 : * This is primarily used to resend a LOCK message since in most cases
1105 : * this information should not be required.
1106 : *
1107 : * \note
1108 : * This is not really information that the ticket is supposed to know about
1109 : * but well... there is now a case where we need to know this.
1110 : *
1111 : * \return The process identifier of this ticket owner.
1112 : */
1113 3 : pid_t ticket::get_client_pid() const
1114 : {
1115 3 : std::vector<std::string> segments;
1116 3 : if(snapdev::tokenize_string(segments, f_entering_key, "/") != 2)
1117 : {
1118 2 : throw cluck::invalid_parameter(
1119 : "ticket::get_client_pid() split f_entering_key \""
1120 2 : + f_entering_key
1121 5 : + "\" and did not get exactly two segments.");
1122 : }
1123 2 : std::int64_t value;
1124 2 : advgetopt::validator_integer::convert_string(segments[1], value);
1125 2 : return static_cast<pid_t>(value);
1126 3 : }
1127 :
1128 :
1129 : /** \brief Give the lock a serial number for some form of unicity.
1130 : *
1131 : * When we lose a leader, the unicity of the ticket may be required as we
1132 : * start sharing the tickets between the surviving leaders. This is done
1133 : * for the RELOCK message which attempts to restart the an old LOCK. In
1134 : * that case, two leaders end up attempt a RELOCK on the same ticket.
1135 : * To make sure that we can easily ignore the second attempt, we use
1136 : * the serial number to see that the exact same message is getting there
1137 : * twice.
1138 : *
1139 : * The cluck daemon uses the leader number as part of the serial
1140 : * number (bits 24 and 25) so it is unique among all the instances,
1141 : * at least until a cluck deamon dies and its unique numbers get
1142 : * mingled (and the old leaders may change their own number too...)
1143 : *
1144 : * \param[in] serial The serial number of the ticket.
1145 : */
1146 122 : void ticket::set_serial(serial_t serial)
1147 : {
1148 122 : f_serial = serial;
1149 122 : }
1150 :
1151 :
1152 : /** \brief Return the serial number of this ticket.
1153 : *
1154 : * This function returns the serial number of this ticket. See the
1155 : * set_serial() function for additional information about this number.
1156 : *
1157 : * \return The serial number of the ticket.
1158 : */
1159 4 : ticket::serial_t ticket::get_serial() const
1160 : {
1161 4 : return f_serial;
1162 : }
1163 :
1164 :
1165 : /** \brief Change the unlock duration to the specified value.
1166 : *
1167 : * If the service requesting a lock fails to acknowledge an unlock, then
1168 : * the lock still gets unlocked after this \p duration.
1169 : *
1170 : * By default, this parameter gets set to the same value as duration with
1171 : * a minimum of 3 seconds. When the message includes an `unlock_duration`
1172 : * parameter then that value is used instead.
1173 : *
1174 : * \note
1175 : * If \p duration is less than cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT,
1176 : * then cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT is used. At time of writing
1177 : * cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT is 3 seconds.
1178 : *
1179 : * \warning
1180 : * It is important to understand that as soon as an UNLOCKED event arrives,
1181 : * you should acknowledge it. Not doing so increases the risk that two or
1182 : * more processes access the same resource simultaneously.
1183 : *
1184 : * \param[in] duration The amount of time to acknowledge an UNLOCKED
1185 : * event; after that the lock is released no matter what.
1186 : */
1187 255 : void ticket::set_unlock_duration(cluck::timeout_t duration)
1188 : {
1189 255 : if(duration == cluck::CLUCK_DEFAULT_TIMEOUT)
1190 : {
1191 115 : duration = f_lock_duration;
1192 : }
1193 :
1194 255 : f_unlock_duration = std::clamp(
1195 : duration
1196 : , cluck::CLUCK_UNLOCK_MINIMUM_TIMEOUT
1197 255 : , cluck::CLUCK_MAXIMUM_TIMEOUT);
1198 255 : }
1199 :
1200 :
1201 : /** \brief Get unlock duration.
1202 : *
1203 : * The unlock duration is used in case the lock times out. It extends
1204 : * the lock duration for that much longer until the client acknowledge
1205 : * the locks or the lock really times out.
1206 : *
1207 : * \note
1208 : * If not yet set, this function returns zero (a null timestamp).
1209 : *
1210 : * \return The unlock acknowledgement timeout duration.
1211 : */
1212 3 : cluck::timeout_t ticket::get_unlock_duration() const
1213 : {
1214 3 : return f_unlock_duration;
1215 : }
1216 :
1217 :
1218 : /** \brief Mark the ticket as being ready.
1219 : *
1220 : * This ticket is marked as being ready.
1221 : *
1222 : * A ticket is ready when all the entering tickets were removed from it
1223 : * on the owning leader. On the other two leaders, the ticket gets marked
1224 : * as being ready once they receive the LOCKEXITING message.
1225 : */
1226 103 : void ticket::set_ready()
1227 : {
1228 103 : f_ticket_ready = true;
1229 103 : }
1230 :
1231 :
1232 : /** \brief Set the ticket number.
1233 : *
1234 : * The other two leaders receive the ticket number in the ADDTICKET
1235 : * message. That number must be saved in the ticket, somehow. This
1236 : * is the function we use to do that.
1237 : *
1238 : * It is very important to have the correct number (by default it is
1239 : * zero) since the algorithm asks for the maximum ticket number
1240 : * currently available and without that information that request
1241 : * cannot be answered properly.
1242 : *
1243 : * \param[in] number The ticket number to save in f_our_ticket.
1244 : */
1245 6 : void ticket::set_ticket_number(ticket_id_t const number)
1246 : {
1247 6 : if(f_our_ticket != NO_TICKET
1248 4 : || f_added_ticket)
1249 : {
1250 4 : throw cluck::logic_error("ticket::set_ticket_number() called with "
1251 4 : + std::to_string(number)
1252 6 : + " when f_our_ticket is already set to "
1253 8 : + std::to_string(f_our_ticket)
1254 10 : + ".");
1255 : }
1256 4 : f_added_ticket = true;
1257 :
1258 4 : f_our_ticket = number;
1259 12 : f_ticket_key = snapdev::int_to_hex(f_our_ticket, false, 8)
1260 12 : + '/'
1261 8 : + f_entering_key;
1262 4 : }
1263 :
1264 :
1265 : /** \brief Return the ticket number of this ticket.
1266 : *
1267 : * This function returns the ticket number of this ticket. This
1268 : * is generally used to determine the largest ticket number
1269 : * currently in use in order to attach a new ticket number
1270 : * to a lock object.
1271 : *
1272 : * By default the value is NO_TICKET meaning that no ticket number was
1273 : * yet assigned to that ticket object.
1274 : *
1275 : * \return The current ticket number.
1276 : */
1277 16 : ticket::ticket_id_t ticket::get_ticket_number() const
1278 : {
1279 16 : return f_our_ticket;
1280 : }
1281 :
1282 :
1283 : /** \brief Check whether this ticket is locked or not.
1284 : *
1285 : * This function returns true if the ticket is currently locked.
1286 : *
1287 : * \return true when the ticket was successfully locked at some point.
1288 : */
1289 5 : bool ticket::is_locked() const
1290 : {
1291 5 : return f_locked;
1292 : }
1293 :
1294 :
1295 : /** \brief Check whether the system only has one leader.
1296 : *
1297 : * The function check the number of known leaders. If just one, then it
1298 : * returns true. This is important for our algorithm to work properly
1299 : * in that one specific case.
1300 : *
1301 : * \return true if there is only one leader (i.e. one single computer in
1302 : * your whole cluster).
1303 : */
1304 483 : bool ticket::one_leader() const
1305 : {
1306 483 : return f_cluckd->get_computer_count() == 1;
1307 : }
1308 :
1309 :
1310 : /** \brief Get the obtention timeout date.
1311 : *
1312 : * This function returns the obtention timeout. Note that if the lock
1313 : * was already obtained, then this date may be in the past. You can test
1314 : * that by checking the get_lock_timeout() function first.
1315 : *
1316 : * \return The date when the obtention of the ticket timeouts.
1317 : */
1318 10 : cluck::timeout_t ticket::get_obtention_timeout() const
1319 : {
1320 10 : return f_obtention_timeout;
1321 : }
1322 :
1323 :
1324 : /** \brief Define a time when the ticket times out while waiting.
1325 : *
1326 : * This function defines the time threshold when to timeout this
1327 : * ticket in case a service does not reply to an ALIVE message.
1328 : *
1329 : * Whenever a leader dies, a ticket which is not locked yet may be
1330 : * transferred to another leader. To not attempt to lock a ticket
1331 : * for nothing, the new leader first checks that the service
1332 : * which requested that lock is indeed still alive by send an
1333 : * ALIVE message to it. In return, it expects an ABSOLUTELY
1334 : * reply.
1335 : *
1336 : * If the ABSOLUTELY reply does not make it in time (at this time
1337 : * we limit this to 5 seconds) then we consider that this service
1338 : * is not responsive and we cancel the lock altogether.
1339 : *
1340 : * To cancel this timeout, call the function with cluck::timeout_t()
1341 : * in \p timeout (i.e. zero duration).
1342 : *
1343 : * \note
1344 : * Since that message should happen while the cluck daemon
1345 : * is waiting for the LOCK event, the reply should be close to
1346 : * instantaneous. So 5 seconds is plenty until somehow your
1347 : * network is really busy or really large and the time for
1348 : * the message to travel is too long.
1349 : *
1350 : * \param[in] timeout The time when the ALIVE message times out.
1351 : */
1352 8 : void ticket::set_alive_timeout(cluck::timeout_t timeout)
1353 : {
1354 8 : if(timeout < cluck::timeout_t())
1355 : {
1356 1 : timeout = cluck::timeout_t();
1357 : }
1358 :
1359 8 : if(timeout < f_obtention_timeout)
1360 : {
1361 6 : f_alive_timeout = timeout;
1362 : }
1363 : else
1364 : {
1365 : // use the obtention timeout if smaller because that was the
1366 : // first premise that the client asked about
1367 : //
1368 2 : f_alive_timeout = f_obtention_timeout;
1369 : }
1370 8 : }
1371 :
1372 :
1373 : /** \brief Retrieve the lock duration.
1374 : *
1375 : * This function returns the lock duration in seconds as defined with
1376 : * the constructor.
1377 : *
1378 : * \return The lock duration in seconds.
1379 : */
1380 2 : cluck::timeout_t ticket::get_lock_duration() const
1381 : {
1382 2 : return f_lock_duration;
1383 : }
1384 :
1385 :
1386 : /** \brief Get the lock timeout date.
1387 : *
1388 : * This function returns the lock timeout. If not yet defined, the
1389 : * function will return zero.
1390 : *
1391 : * \note
1392 : * The ticket will immediately be assigned a timeout date when it
1393 : * gets activated.
1394 : *
1395 : * \return The date when the ticket will timeout or zero.
1396 : */
1397 11 : cluck::timeout_t ticket::get_lock_timeout_date() const
1398 : {
1399 11 : return f_lock_timeout_date;
1400 : }
1401 :
1402 :
1403 : /** \brief Get the current lock timeout date.
1404 : *
1405 : * This function returns the "current" lock timeout.
1406 : *
1407 : * The "current" timeout is one of:
1408 : *
1409 : * \li If the lock is being re-requested (after the loss of a leader) then
1410 : * the ALIVE timeout may be returned for a short period of time.
1411 : *
1412 : * \li If the lock was not yet obtained, this function returns the obtention
1413 : * timeout timestamp.
1414 : *
1415 : * \li Once the lock was obtained, the lock timeout gets defined and that
1416 : * one is returned instead.
1417 : *
1418 : * \li When the UNLOCK is received or the timeout happens and cluckd sends
1419 : * the UNLOCKING message, the function returns the unlock timeout. In
1420 : * this case, the \em f_lock_time_date field is still used.
1421 : *
1422 : * \note
1423 : * This is the date used in the timed_out() function.
1424 : *
1425 : * \return The date when the ticket will timeout or zero.
1426 : */
1427 259806 : cluck::timeout_t ticket::get_current_timeout_date() const
1428 : {
1429 259806 : if(f_alive_timeout > cluck::timeout_t())
1430 : {
1431 7 : return f_alive_timeout;
1432 : }
1433 :
1434 259799 : if(f_locked)
1435 : {
1436 451 : return f_lock_timeout_date;
1437 : }
1438 :
1439 259348 : return f_obtention_timeout;
1440 : }
1441 :
1442 :
1443 : /** \brief Check whether this ticket timed out.
1444 : *
1445 : * This function returns true if the ticket timed out in its current
1446 : * state and should be moved to its next state.
1447 : *
1448 : * The function calls the get_current_timeout_date() to select the correct
1449 : * date. This depends on the current state of the ticket (i.e. maybe we
1450 : * sent the ALIVE message and are using the alive time out value).
1451 : *
1452 : * There are five timeout dates that can happen:
1453 : *
1454 : * 1. Time to obtain a lock
1455 : * 2. Time to keep the lock alive
1456 : * 3. Time to wait for a reply after an UNLOCKING message
1457 : * 4. Time to wait for the UNLOCK message
1458 : * 5. Time to wait for the ALIVE reply (i.e. the ABSOLUTELY message)
1459 : *
1460 : * \return true if the ticket timed out in its current state.
1461 : */
1462 222942 : bool ticket::timed_out() const
1463 : {
1464 222942 : return get_current_timeout_date() <= snapdev::now();
1465 : }
1466 :
1467 :
1468 : /** \brief Retrieve the object name of this ticket.
1469 : *
1470 : * This function returns the name of the object associated with this
1471 : * lock (i.e. what is being locked).
1472 : *
1473 : * \return The object name of the ticket.
1474 : */
1475 13 : std::string const & ticket::get_object_name() const
1476 : {
1477 13 : return f_object_name;
1478 : }
1479 :
1480 :
1481 : /** \brief Retrieve the tag of this ticket.
1482 : *
1483 : * This function returns the tag of the object associated with this
1484 : * lock (i.e. the specific instance of the lock being locked).
1485 : *
1486 : * \return The tag associated with this ticket.
1487 : */
1488 2 : ed::dispatcher_match::tag_t ticket::get_tag() const
1489 : {
1490 2 : return f_tag;
1491 : }
1492 :
1493 :
1494 : /** \brief Retrieve the server name of this ticket.
1495 : *
1496 : * This function returns the name of the server associated with this
1497 : * lock, i.e. the server to which the LOCKED and UNLOCKED commands are to
1498 : * be sent back to.
1499 : *
1500 : * This name is also used in case of an error to send the LOCKFAILED back
1501 : * to the service that requested the lock.
1502 : *
1503 : * \return The server name of the ticket.
1504 : */
1505 7 : std::string const & ticket::get_server_name() const
1506 : {
1507 7 : return f_server_name;
1508 : }
1509 :
1510 :
1511 : /** \brief Retrieve the service name of this ticket.
1512 : *
1513 : * This function returns the name of the service associated with this
1514 : * lock. This is the service to which the LOCKED and UNLOCKED messages
1515 : * are sent.
1516 : *
1517 : * This name is also used in case of an error to send the LOCKFAILED back
1518 : * to the service that requested the lock.
1519 : *
1520 : * \return The service name of the ticket.
1521 : */
1522 7 : std::string const & ticket::get_service_name() const
1523 : {
1524 7 : return f_service_name;
1525 : }
1526 :
1527 :
1528 : /** \brief Retrieve a reference to the entering key of this ticket.
1529 : *
1530 : * This function returns the entering key of this ticket. The
1531 : * entering key is defined on instantiation so it is always available.
1532 : *
1533 : * \note
1534 : * By contrast, the ticket key is not available up until the time the
1535 : * ticket number is marked as valid.
1536 : *
1537 : * \return The entering key of this ticket.
1538 : */
1539 130 : std::string const & ticket::get_entering_key() const
1540 : {
1541 130 : return f_entering_key;
1542 : }
1543 :
1544 :
1545 : /** \brief Retrieve a reference to the ticket key.
1546 : *
1547 : * This function returns the ticket key of this ticket. The
1548 : * ticket key is only defined at a later time when the ticket has
1549 : * properly entered the bakery. It includes three parameters:
1550 : *
1551 : * \li Ticket number as a hexadecimal number of 8 digits,
1552 : * \li Server name of the server asking for the lock,
1553 : * \li Process Identifier (PID) of the service daemon asking for the lock.
1554 : *
1555 : * \note
1556 : * This function returns an empty string until the ticket key is available.
1557 : *
1558 : * \return The ticket key.
1559 : */
1560 118 : std::string const & ticket::get_ticket_key() const
1561 : {
1562 118 : return f_ticket_key;
1563 : }
1564 :
1565 :
1566 : /** \brief Serialize a ticket to send it over to another leader.
1567 : *
1568 : * This function serialize a ticket to share it with the other
1569 : * leaders. This is important when a new leader gets elected
1570 : * as it would not otherwise have any idea of what the existing
1571 : * tickets are, although it is not 100% important, if another
1572 : * of the two snaplock was to go down, it becomes primordial
1573 : * for the tickets to be known in the other leaders.
1574 : *
1575 : * This is used at the start before a leader starts accepting new
1576 : * lock requests.
1577 : *
1578 : * \return This ticket as a serialized string.
1579 : *
1580 : * \sa unserialize()
1581 : */
1582 9 : std::string ticket::serialize() const
1583 : {
1584 9 : std::map<std::string, std::string> data;
1585 :
1586 9 : data["object_name"] = f_object_name;
1587 9 : data["tag"] = std::to_string(static_cast<int>(f_tag));
1588 9 : data["obtention_timeout"] = f_obtention_timeout.to_timestamp(true);
1589 : //data["alive_timeout"] = f_alive_timeout.to_timestamp(true); -- we do not want to transfer this one
1590 9 : data["lock_duration"] = f_lock_duration.to_timestamp(true);
1591 9 : data["unlock_duration"] = f_unlock_duration.to_timestamp(true);
1592 9 : data["server_name"] = f_server_name;
1593 9 : data["service_name"] = f_service_name;
1594 9 : data["owner"] = f_owner;
1595 9 : if(f_serial != NO_SERIAL)
1596 : {
1597 5 : data["serial"] = std::to_string(f_serial);
1598 : }
1599 9 : data["entering_key"] = f_entering_key;
1600 9 : data["get_max_ticket"] = f_get_max_ticket ? "true" : "false";
1601 9 : data["our_ticket"] = std::to_string(f_our_ticket);
1602 9 : data["added_ticket"] = f_added_ticket ? "true" : "false";
1603 9 : data["ticket_key"] = f_ticket_key;
1604 9 : data["added_ticket_quorum"] = f_added_ticket_quorum ? "true" : "false";
1605 :
1606 : // this is a map
1607 : //data["still_entering"] = f_still_entering;
1608 : //ticket::key_map_t f_still_entering = key_map_t();
1609 :
1610 9 : data["ticket_ready"] = f_ticket_ready ? "true" : "false";
1611 9 : data["locked"] = f_locked ? "true" : "false";
1612 9 : data["lock_timeout_date"] = f_lock_timeout_date.to_timestamp(true);
1613 :
1614 9 : switch(f_lock_failed)
1615 : {
1616 5 : case lock_failure_t::LOCK_FAILURE_NONE:
1617 5 : data["lock_failed"] = "none";
1618 5 : break;
1619 :
1620 1 : case lock_failure_t::LOCK_FAILURE_LOCK:
1621 1 : data["lock_failed"] = "lock";
1622 1 : break;
1623 :
1624 3 : case lock_failure_t::LOCK_FAILURE_UNLOCKING:
1625 3 : data["lock_failed"] = "unlocking";
1626 3 : break;
1627 :
1628 : }
1629 :
1630 9 : std::string result;
1631 176 : for(auto & it : data)
1632 : {
1633 167 : result += it.first;
1634 167 : result += '=';
1635 : // make sure the value does not include any '|'
1636 334 : result += snapdev::string_replace_many(it.second, {{"|", "%7C"}});
1637 167 : result += '|';
1638 : }
1639 9 : result.pop_back();
1640 :
1641 18 : return result;
1642 9 : }
1643 :
1644 :
1645 : /** \brief Unserialize a ticket string back to a ticket object.
1646 : *
1647 : * This function unserialize a string that was generated using the
1648 : * serialize() function.
1649 : *
1650 : * Note that unknown fields are ignored and none of the fields are
1651 : * considered mandatory. Actually the function generates no errors.
1652 : * This means it should be forward compatible.
1653 : *
1654 : * The data gets unserialized in `this` object.
1655 : *
1656 : * \param[in] data The serialized data.
1657 : */
1658 7 : void ticket::unserialize(std::string const & data)
1659 : {
1660 7 : std::vector<std::string> vars;
1661 7 : snapdev::NOT_USED(snapdev::tokenize_string(vars, data, "|"));
1662 136 : for(auto const & d : vars)
1663 : {
1664 129 : std::string::size_type const pos(d.find('='));
1665 129 : std::string const name(d.substr(0, pos));
1666 129 : std::string const value(d.substr(pos + 1));
1667 129 : switch(name[0])
1668 : {
1669 14 : case 'a':
1670 14 : if(name == "added_ticket")
1671 : {
1672 7 : f_added_ticket = f_added_ticket || value == "true";
1673 : }
1674 7 : else if(name == "added_ticket_quorum")
1675 : {
1676 7 : f_added_ticket_quorum = f_added_ticket_quorum || value == "true";
1677 : }
1678 : //else if(name == "alive_timeout") -- we do not transfer this one (not required, and could actually cause problems)
1679 : //{
1680 : // f_alive_timeout = cluck::timeout_t(value);
1681 : //}
1682 14 : break;
1683 :
1684 7 : case 'e':
1685 7 : if(name == "entering_key")
1686 : {
1687 : #ifdef _DEBUG
1688 7 : if(f_entering_key != value)
1689 : {
1690 : // LCOV_EXCL_START
1691 : throw cluck::logic_error(
1692 : "ticket::unserialize() not unserializing entering key \""
1693 : + value
1694 : + "\" over itself \""
1695 : + f_entering_key
1696 : + "\" (entering key mismatch).");
1697 : // LCOV_EXCL_STOP
1698 : }
1699 : #endif
1700 7 : f_entering_key = value;
1701 : }
1702 7 : break;
1703 :
1704 7 : case 'g':
1705 7 : if(name == "get_max_ticket")
1706 : {
1707 7 : f_get_max_ticket = f_get_max_ticket || value == "true";
1708 : }
1709 7 : break;
1710 :
1711 28 : case 'l':
1712 28 : if(name == "lock_duration")
1713 : {
1714 7 : f_lock_duration = cluck::timeout_t(value);
1715 : }
1716 21 : else if(name == "locked")
1717 : {
1718 7 : f_locked = f_locked || value == "true";
1719 : }
1720 14 : else if(name == "lock_timeout_date")
1721 : {
1722 : // the time may be larger because of an UNLOCK so we keep
1723 : // the largest value
1724 : //
1725 7 : cluck::timeout_t const timeout_date(value);
1726 7 : if(timeout_date > f_lock_timeout_date)
1727 : {
1728 1 : f_lock_timeout_date = timeout_date;
1729 : }
1730 : }
1731 7 : else if(name == "lock_failed")
1732 : {
1733 : // in this case, we avoid reducing the error level
1734 : //
1735 7 : if(value == "unlocking")
1736 : {
1737 3 : f_lock_failed = lock_failure_t::LOCK_FAILURE_UNLOCKING;
1738 : }
1739 4 : else if(value == "lock" && f_lock_failed == lock_failure_t::LOCK_FAILURE_NONE)
1740 : {
1741 1 : f_lock_failed = lock_failure_t::LOCK_FAILURE_LOCK;
1742 : }
1743 : }
1744 28 : break;
1745 :
1746 28 : case 'o':
1747 28 : if(name == "object_name")
1748 : {
1749 : #ifdef _DEBUG
1750 7 : if(f_object_name != value)
1751 : {
1752 : // LCOV_EXCL_START
1753 : throw cluck::logic_error(
1754 : "ticket::unserialize() not unserializing object name \""
1755 : + value
1756 : + "\" over itself \""
1757 : + f_object_name
1758 : + "\" (object name mismatch).");
1759 : // LCOV_EXCL_STOP
1760 : }
1761 : #endif
1762 7 : f_object_name = value;
1763 : }
1764 21 : else if(name == "obtention_timeout")
1765 : {
1766 7 : f_obtention_timeout = cluck::timeout_t(value);
1767 : }
1768 14 : else if(name == "owner")
1769 : {
1770 7 : f_owner = value;
1771 : }
1772 7 : else if(name == "our_ticket")
1773 : {
1774 7 : std::int64_t v;
1775 7 : advgetopt::validator_integer::convert_string(value, v);
1776 7 : f_our_ticket = v;
1777 : }
1778 28 : break;
1779 :
1780 17 : case 's':
1781 17 : if(name == "server_name")
1782 : {
1783 7 : f_server_name = value;
1784 : }
1785 10 : else if(name == "service_name")
1786 : {
1787 7 : f_service_name = value;
1788 : }
1789 3 : else if(name == "serial")
1790 : {
1791 3 : std::int64_t v;
1792 3 : advgetopt::validator_integer::convert_string(value, v);
1793 3 : f_serial = v;
1794 : }
1795 17 : break;
1796 :
1797 21 : case 't':
1798 21 : if(name == "tag")
1799 : {
1800 7 : std::int64_t v;
1801 7 : advgetopt::validator_integer::convert_string(value, v);
1802 7 : f_tag = v;
1803 : }
1804 14 : else if(name == "ticket_key")
1805 : {
1806 7 : f_ticket_key = value;
1807 : }
1808 7 : else if(name == "ticket_ready")
1809 : {
1810 7 : f_ticket_ready = f_ticket_ready || value == "true";
1811 : }
1812 21 : break;
1813 :
1814 7 : case 'u':
1815 7 : if(name == "unlock_duration")
1816 : {
1817 7 : f_unlock_duration = cluck::timeout_t(value);
1818 : }
1819 7 : break;
1820 :
1821 : }
1822 129 : }
1823 14 : }
1824 :
1825 :
1826 :
1827 : } // namespace cluck_daemon
1828 : // vim: ts=4 sw=4 et
|