Line data Source code
1 : // Snap Websites Server -- snap websites serving children
2 : // Copyright (c) 2011-2019 Made to Order Software Corp. All Rights Reserved
3 : //
4 : // https://snapwebsites.org/
5 : // contact@m2osw.com
6 : //
7 : // This program is free software; you can redistribute it and/or modify
8 : // it under the terms of the GNU General Public License as published by
9 : // the Free Software Foundation; either version 2 of the License, or
10 : // (at your option) any later version.
11 : //
12 : // This program is distributed in the hope that it will be useful,
13 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 : // GNU General Public License for more details.
16 : //
17 : // You should have received a copy of the GNU General Public License
18 : // along with this program; if not, write to the Free Software
19 : // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 :
21 :
22 : // self
23 : //
24 : #include "snapwebsites/snap_backend.h"
25 :
26 :
27 : // snapwebsites lib
28 : //
29 : #include "snapwebsites/snapwebsites.h"
30 : #include "snapwebsites/log.h"
31 : #include "snapwebsites/snap_lock.h"
32 :
33 :
34 : // snapdev lib
35 : //
36 : #include <snapdev/not_used.h>
37 :
38 :
39 : // C lib
40 : //
41 : #include <wait.h>
42 : #include <fcntl.h>
43 : #include <sys/resource.h>
44 :
45 :
46 : // last include
47 : //
48 : #include <snapdev/poison.h>
49 :
50 :
51 :
52 :
53 :
54 : namespace snap
55 : {
56 :
57 : /** \class snap_backend
58 : * \brief Backend process class.
59 : *
60 : * This class handles backend processing for the snapserver.
61 : *
62 : * The process for backends works this way:
63 : *
64 : * \li Backend tool prepares the server
65 : * \li Backend tool creates a snap_backend object.
66 : * \li Backend tool calls run_backend()
67 : * \li run_backend() connects to the database
68 : * \li run_backend() checks whether the "sites" table exists
69 : * \li if not ready -- wait until the "sites" table exists
70 : * \li -- while waiting for the "sites" table, we also listen for
71 : * incoming messages such as STOP and LOG
72 : *
73 : * Note that the backend, like pretty much all other Snap processes
74 : * is event based. It receives messages from various sources and
75 : * deals with those as required. The following describes those
76 : * messages:
77 : *
78 : * \msc
79 : * hscale = 2;
80 : * a [label="snapcommunicator"],
81 : * b [label="snapbackend"],
82 : * c [label="child process"],
83 : * d [label="wakeup timer"],
84 : * e [label="cassandra"];
85 : *
86 : * #
87 : * # Register to the snap communicator
88 : * #
89 : * b=>b [label="open socket to snapcommunicator"];
90 : * b->a [label="REGISTER service=<service name>;version=<version>"];
91 : * a->b [label="READY"];
92 : * a->b [label="HELP"];
93 : * b->a [label="COMMANDS list=HELP,LOG,PING,READY,..."];
94 : *
95 : * #
96 : * # Start a child at a specified time
97 : * #
98 : * b=>b [label="wakeup timer timed out"];
99 : * b->e [label="lock website"];
100 : * b->c [label="start child"];
101 : * c->b [label="child died"];
102 : * b->e [label="unlock website"];
103 : *
104 : * #
105 : * # Start a child periodically
106 : * #
107 : * b=>b [label="tick timer timed out"];
108 : * b->e [label="lock website"];
109 : * b->c [label="start child"];
110 : * c->b [label="child died"];
111 : * b->e [label="unlock website"];
112 : *
113 : * #
114 : * # When the child dies
115 : * #
116 : * b=>b [label="another run is already scheduled"];
117 : * b->e [label="lock website"];
118 : * b->c [label="start child"];
119 : * c->b [label="child died"];
120 : * b->e [label="unlock website"];
121 : *
122 : * #
123 : * # PING is received
124 : * #
125 : * a->b [label="PING sent to backend"];
126 : * b=>b [label="register request in database"];
127 : * b->e [label="lock website"];
128 : * b->c [label="start child"];
129 : * c->b [label="child died"];
130 : * b->e [label="unlock website"];
131 : * \endmsc
132 : *
133 : * \note
134 : * If a child is already running, then it does not get started a
135 : * second time. This is quite important since if you have a large
136 : * number of websites (say 1,000) then you could otherwise get that
137 : * many processes running simultaneously... Instead we run at most
138 : * one child per instance of the snapbackend process. You may, however,
139 : * have one instance per computer in your cluster so as to alleviate
140 : * the load through multi-processing.
141 : *
142 : * \sa snap_child
143 : */
144 :
145 :
146 : // extern declarations only; these two variables are defined in another file
147 : namespace plugins
148 : {
149 : extern QString g_next_register_name;
150 : extern QString g_next_register_filename;
151 : }
152 :
153 : namespace
154 : {
155 :
156 :
157 : /** \brief The communicator.
158 : *
159 : * File-local pointer (snap_communicator::pointer_t) to the communicator.
160 : * Since the communicator is a singleton, we can keep a copy here.
161 : */
162 2 : snap_communicator::pointer_t g_communicator;
163 :
164 :
165 : /** \brief Handle the SIGINT that is expected to stop the server.
166 : *
167 : * This class is an implementation of the snap_signal that listens
168 : * on the SIGINT. Objects of this class can neither be copied nor assigned.
169 : */
170 0 : class backend_interrupt
171 : : public snap::snap_communicator::snap_signal
172 : {
173 : public:
174 : typedef std::shared_ptr<backend_interrupt> pointer_t;
175 :
176 : backend_interrupt(snap_backend * b);
177 : backend_interrupt(backend_interrupt const & rhs) = delete;
178 :
179 : backend_interrupt & operator = (backend_interrupt const & rhs) = delete;
180 :
181 : // snap::snap_communicator::snap_signal implementation
182 : virtual void process_signal() override;
183 :
184 : private:
185 : snap_backend * f_snap_backend = nullptr; // not owned; stop() is called on it from process_signal()
186 : };
187 :
188 : // file-local shared pointer to a backend_interrupt; null until assigned
189 2 : backend_interrupt::pointer_t g_interrupt;
190 :
191 :
192 : /** \brief The interrupt initialization.
193 : *
194 : * The interrupt uses the signalfd() function to obtain a way to listen on
195 : * incoming Unix signals.
196 : *
197 : * Specifically, it listens on the SIGINT signal, which is the equivalent
198 : * to the Ctrl-C. The connection is named "snap_backend interrupt".
199 : *
200 : * \param[in] b The snap_backend to stop when the SIGINT is received.
201 : */
202 0 : backend_interrupt::backend_interrupt(snap_backend * b)
203 : : snap_signal(SIGINT)
204 0 : , f_snap_backend(b)
205 : {
206 0 : unblock_signal_on_destruction();
207 0 : set_name("snap_backend interrupt");
208 0 : }
209 :
210 :
211 : /** \brief Call the stop function of the snap_backend object.
212 : *
213 : * When this function is called, the signal was received and thus we are
214 : * asked to quit as soon as possible.
215 : */
216 0 : void backend_interrupt::process_signal()
217 : {
218 : // we simulate the STOP, so pass 'false' (i.e. not quitting)
219 : //
220 0 : f_snap_backend->stop(false);
221 0 : }
222 :
223 :
224 :
225 :
226 :
227 : /** \brief Capture children death.
228 : *
229 : * This class is used to create a connection on startup that allows
230 : * us to know when a child dies. Whenever that happens, we get a call
231 : * to the process_signal() callback.
232 : */
233 0 : class signal_child_death
234 : : public snap_communicator::snap_signal
235 : {
236 : public:
237 : typedef std::shared_ptr<signal_child_death> pointer_t;
238 :
239 : signal_child_death(snap_backend * sb);
240 : signal_child_death(signal_child_death const & rhs) = delete;
241 :
242 : signal_child_death & operator = (signal_child_death const & rhs) = delete;
243 :
244 : // snap_communicator::snap_signal implementation
245 : virtual void process_signal() override;
246 :
247 : private:
248 : // TBD: should this be a weak pointer?
249 : snap_backend * f_snap_backend = nullptr;
250 : };
251 :
252 : // file-local shared pointer to a signal_child_death; null until assigned
253 2 : signal_child_death::pointer_t g_signal_child_death;
254 :
255 :
256 : /** \brief Initialize the child death signal.
257 : *
258 : * The function initializes the snap_signal to listen on the SIGCHLD
259 : * Unix signal. It also saves the pointer \p sb to the snap backend so
260 : * it can be used to call functions of the snap backend whenever
261 : * the signal occurs.
262 : *
263 : * \param[in] sb The snap backend pointer.
264 : */
265 0 : signal_child_death::signal_child_death(snap_backend * sb)
266 : : snap_signal(SIGCHLD)
267 0 : , f_snap_backend(sb)
268 : {
269 0 : set_name("snap_backend signal_child_death");
270 0 : }
271 :
272 :
273 : /** \brief Callback called each time the SIGCHLD signal occurs.
274 : *
275 : * This function gets called each time a child dies.
276 : *
277 : * It passes the pid from get_child_pid() to snap_backend::capture_zombies().
278 : */
279 0 : void signal_child_death::process_signal()
280 : {
281 : // check all our children and remove zombies
282 : //
283 0 : f_snap_backend->capture_zombies(get_child_pid());
284 0 : }
285 :
286 :
287 :
288 : /** \brief Time the CASSANDRASTATUS message.
289 : *
290 : * When the snapbackend process is started and it never receives a reply
291 : * to its CASSANDRASTATUS, it has to time out. This timer is used for that
292 : * purpose.
293 : */
294 0 : class cassandra_timer
295 : : public snap::snap_communicator::snap_timer
296 : {
297 : public:
298 : typedef std::shared_ptr<cassandra_timer> pointer_t;
299 :
300 : static int64_t const MAX_START_INTERVAL = 60LL * 1000000LL; // 1 minute in microseconds
301 :
302 : cassandra_timer(snap_backend * sb);
303 : cassandra_timer(cassandra_timer const & rhs) = delete;
304 :
305 : cassandra_timer & operator = (cassandra_timer const & rhs) = delete;
306 :
307 : // snap::snap_communicator::snap_timer implementation
308 : virtual void process_timeout();
309 :
310 : private:
311 : snap_backend * f_snap_backend = nullptr; // not owned; stop() is called on it when the timer times out
312 : };
313 :
314 :
315 : /** \brief The cassandra timer.
316 : *
317 : * We create one cassandra timer. It is saved in this variable if needed.
318 : */
319 2 : cassandra_timer::pointer_t g_cassandra_timer;
320 :
321 :
322 : /** \brief Initializes the timer with a pointer to the snap backend.
323 : *
324 : * The constructor saves the pointer of the snap_backend object so
325 : * it can later be used when the process times out.
326 : *
327 : * The timer is created with a timeout of -1; no delay is configured here.
328 : *
329 : * \param[in] sb A pointer to the snap_backend object.
330 : */
331 0 : cassandra_timer::cassandra_timer(snap_backend * sb)
332 : : snap_timer(-1)
333 0 : , f_snap_backend(sb)
334 : {
335 0 : set_name("snap_backend cassandra_timer");
336 0 : }
337 :
338 :
339 : /** \brief The timeout happened.
340 : *
341 : * This function gets called when the timer, once enabled, times out.
342 : * It allows us to exit the snapbackend tool if it never connects
343 : * to Cassandra.
344 : *
345 : * The function calls the stop() function of the snap_backend with false.
346 : */
347 0 : void cassandra_timer::process_timeout()
348 : {
349 0 : f_snap_backend->stop(false);
350 0 : }
351 :
352 :
353 :
354 :
355 : /** \brief The timer used when a connection to Cassandra fails.
356 : *
357 : * When we receive the CASSANDRAREADY event, the connection is likely to
358 : * work. However, over time, while reading data in various loops, we may
359 : * end up with an exception and that stops the connection right there.
360 : * In other words, on return the f_cassandra pointer will be reset back
361 : * to a null pointer.
362 : *
363 : * To allow for a little bit of time before reconnecting, we use this
364 : * timer. In most cases the failure happens when Cassandra is rather
365 : * overloaded, so trying to reconnect immediately at this stage is not
366 : * a good plan.
367 : *
368 : * At this time we set up the timer to 30 seconds. A snapbackend child
369 : * continues to be fully functional if its connection did not die, so
370 : * a longer pause should not be much of a problem.
371 : *
372 : * This will be much faster than the 5min to 1h auto-restart delay in
373 : * the various snapbackend service files.
374 : */
375 0 : class reconnect_timer
376 : : public snap::snap_communicator::snap_timer
377 : {
378 : public:
379 : typedef std::shared_ptr<reconnect_timer> pointer_t;
380 :
381 : reconnect_timer(snap_backend * sb);
382 : reconnect_timer(reconnect_timer const & rhs) = delete;
383 :
384 : reconnect_timer & operator = (reconnect_timer const & rhs) = delete;
385 :
386 : // snap::snap_communicator::snap_timer implementation
387 : virtual void process_timeout();
388 :
389 : private:
390 : snap_backend * f_snap_backend = nullptr; // not owned; process_reconnect() is called on it on timeout
391 : };
392 :
393 :
394 : /** \brief The reconnect timer.
395 : *
396 : * Shared pointer to the one reconnect timer; it stays null until needed.
397 : */
398 2 : reconnect_timer::pointer_t g_reconnect_timer;
399 :
400 :
401 : /** \brief Initializes the timer with a pointer to the snap backend.
402 : *
403 : * The constructor saves the pointer of the snap_backend object so
404 : * it can later be used when the process times out.
405 : *
406 : * The timer is created with a timeout of -1; no delay is configured here.
407 : *
408 : * \param[in] sb A pointer to the snap_backend object.
409 : */
410 0 : reconnect_timer::reconnect_timer(snap_backend * sb)
411 : : snap_timer(-1)
412 0 : , f_snap_backend(sb)
413 : {
414 0 : set_name("snap_backend reconnect_timer");
415 0 : }
416 :
417 :
418 : /** \brief The timeout happened.
419 : *
420 : * This function gets called when the timer, once enabled, times out. It
421 : * calls process_reconnect() so we can reconnect to the Cassandra database.
422 : */
423 0 : void reconnect_timer::process_timeout()
424 : {
425 0 : f_snap_backend->process_reconnect();
426 0 : }
427 :
428 :
429 :
430 :
431 : /** \brief The timer to produce ticks once every five minutes.
432 : *
433 : * This timer makes sure that every website is re-added to the
434 : * backend table once every five minutes. Whether the backend
435 : * needs to be run against that website is ignored in this case.
436 : */
437 0 : class tick_timer
438 : : public snap::snap_communicator::snap_timer
439 : {
440 : public:
441 : typedef std::shared_ptr<tick_timer> pointer_t;
442 :
443 : static int64_t const MAX_START_INTERVAL = 5LL * 60LL * 1000000LL; // 5 minutes in microseconds
444 :
445 : tick_timer(snap_backend * sb);
446 : tick_timer(tick_timer const & rhs) = delete;
447 :
448 : tick_timer & operator = (tick_timer const & rhs) = delete;
449 :
450 : void configure(QDomElement e, QString const & binary_path, bool const debug); // NOTE(review): no definition seen nearby -- confirm it is implemented
451 :
452 : // snap::snap_communicator::snap_timer implementation
453 : virtual void process_timeout();
454 :
455 : private:
456 : snap_backend * f_snap_backend = nullptr; // not owned; process_tick() is called on it on each tick
457 : };
458 :
459 :
460 : /** \brief The tick timer.
461 : *
462 : * Shared pointer to the one tick timer; it stays null until needed.
463 : */
464 2 : tick_timer::pointer_t g_tick_timer;
465 :
466 :
467 : /** \brief Initializes the timer with a pointer to the snap backend.
468 : *
469 : * The constructor saves the pointer of the snap_backend object so
470 : * it can later be used when the process times out.
471 : *
472 : * The timer is created disabled, with an interval of MAX_START_INTERVAL
473 : * and a timeout date set to now so it triggers as soon as it is enabled.
474 : *
475 : * \param[in] sb A pointer to the snap_backend object.
476 : */
477 0 : tick_timer::tick_timer(snap_backend * sb)
478 : : snap_timer(MAX_START_INTERVAL)
479 0 : , f_snap_backend(sb)
480 : {
481 0 : set_name("snap_backend tick_timer");
482 :
483 : // prevent tick_timer() from starting, we want the Cassandra
484 : // connection to be ready first
485 : //
486 0 : set_enable(false);
487 :
488 : // make sure it starts right away once we receive the CASSANDRAREADY
489 : // message
490 : //
491 : // we do not want to use snap_timer(0) because otherwise we will not
492 : // get ongoing ticks as expected
493 : //
494 0 : set_timeout_date(snap_communicator::get_current_date());
495 0 : }
496 :
497 :
498 : /** \brief The timeout happened.
499 : *
500 : * This function gets called once every five minutes, which is used to reset
501 : * the backend table so the backend processes are run against every website
502 : * over and over again. The work is done by snap_backend::process_tick().
503 : */
504 0 : void tick_timer::process_timeout()
505 : {
506 0 : f_snap_backend->process_tick();
507 0 : }
508 :
509 :
510 :
511 :
512 : /** \brief The timer to produce wake up calls once in a while.
513 : *
514 : * This timer is used to wake us once in a while as determined by other
515 : * features. The date feature is always used on this timer (i.e. wake up
516 : * the process at a specific date and time in microseconds.)
517 : */
518 0 : class wakeup_timer
519 : : public snap::snap_communicator::snap_timer
520 : {
521 : public:
522 : typedef std::shared_ptr<wakeup_timer> pointer_t;
523 :
524 : wakeup_timer(snap_backend * sb);
525 : wakeup_timer(wakeup_timer const & rhs) = delete;
526 :
527 : wakeup_timer & operator = (wakeup_timer const & rhs) = delete;
528 :
529 : void configure(QDomElement e, QString const & binary_path, bool const debug); // NOTE(review): no definition seen nearby -- confirm it is implemented
530 :
531 : // snap::snap_communicator::snap_timer implementation
532 : virtual void process_timeout();
533 :
534 : private:
535 : snap_backend * f_snap_backend = nullptr; // not owned; process_timeout() is called on it when we wake up
536 : };
537 :
538 :
539 : /** \brief The wakeup timer.
540 : *
541 : * Shared pointer to the one wakeup timer; it stays null until needed.
542 : */
543 2 : wakeup_timer::pointer_t g_wakeup_timer;
544 :
545 :
546 : /** \brief Initializes the timer with a pointer to the snap backend.
547 : *
548 : * The constructor saves the pointer of the snap_backend object so
549 : * it can later be used when the process times out.
550 : *
551 : * By default the timer is "off" (created with -1) meaning that it will
552 : * not trigger a process_timeout() call until you turn it on.
553 : *
554 : * \param[in] sb A pointer to the snap_backend object.
555 : */
556 0 : wakeup_timer::wakeup_timer(snap_backend * sb)
557 : : snap_timer(-1)
558 0 : , f_snap_backend(sb)
559 : {
560 0 : set_name("snap_backend wakeup_timer");
561 0 : }
562 :
563 :
564 : /** \brief The wake up timer timed out.
565 : *
566 : * The wake up timer is used to know when we can start another child.
567 : *
568 : * Whenever the current child dies, we check when the next child should
569 : * be started. If the backend table is empty, then the wake up timer is
570 : * not set and nothing happens. However, when the backend table has an
571 : * entry, we get the first one and use that date as the next trigger
572 : * (if the trigger is now or in the past, then it is not used, we
573 : * directly create the next child instance.)
574 : *
575 : * The messenger may receive a PING in between in which case the timer
576 : * may be reset to a different date and time at which to wake up.
577 : */
578 0 : void wakeup_timer::process_timeout()
579 : {
580 0 : f_snap_backend->process_timeout();
581 0 : }
582 :
583 :
584 :
585 :
586 :
587 : /** \brief Handle messages from the Snap Communicator server.
588 : *
589 : * This class is an implementation of the TCP client message connection
590 : * so we can handle incoming messages and connection status changes.
591 : */
592 0 : class messenger
593 : : public snap_communicator::snap_tcp_client_permanent_message_connection
594 : {
595 : public:
596 : typedef std::shared_ptr<messenger> pointer_t;
597 :
598 : messenger(snap_backend * sb, QString const & action, std::string const & addr, int port);
599 : messenger(messenger const & rhs) = delete;
600 :
601 : messenger & operator = (messenger const & rhs) = delete;
602 :
603 : // snap::snap_communicator::snap_tcp_client_permanent_message_connection implementation
604 : virtual void process_message(snap::snap_communicator_message const & message);
605 : virtual void process_connection_failed(std::string const & error_message);
606 : virtual void process_connected();
607 :
608 : private:
609 : // this is owned by a server function so no need for a smart pointer
610 : snap_backend * f_snap_backend = nullptr;
611 : QString f_action = QString(); // action name; process_connected() derives the service name from it
612 : };
613 :
614 :
615 : /** \brief The messenger.
616 : *
617 : * We create only one messenger. Its shared pointer is saved in this variable.
618 : */
619 2 : messenger::pointer_t g_messenger;
620 :
621 :
622 : /** \brief The messenger initialization.
623 : *
624 : * The messenger is a connection to the snapcommunicator server.
625 : *
626 : * In most cases we receive STOP and LOG messages from it. We implement
627 : * a few other messages too (HELP, READY...)
628 : *
629 : * We use a permanent connection so if the snapcommunicator restarts
630 : * for whatever reason, we reconnect automatically.
631 : *
632 : * \param[in] sb The snap backend server we are listening for.
633 : * \param[in] action The action for which this messenger is created, it is
634 : * sent to the snapcommunicator server when we REGISTER.
635 : * \param[in] addr The address to connect to. Most often it is 127.0.0.1.
636 : * \param[in] port The port to connect to (4040).
637 : */
638 0 : messenger::messenger(snap_backend * sb, QString const & action, std::string const & addr, int port)
639 : : snap_tcp_client_permanent_message_connection(
640 : addr,
641 : port,
642 : tcp_client_server::bio_client::mode_t::MODE_PLAIN,
643 : snap_communicator::snap_tcp_client_permanent_message_connection::DEFAULT_PAUSE_BEFORE_RECONNECTING,
644 : false) // do not use a separate thread, we do many fork()'s
645 : , f_snap_backend(sb)
646 0 : , f_action(action)
647 : {
648 0 : set_name("snap_backend messenger");
649 0 : }
650 :
651 :
652 : /** \brief Pass messages to the Snap Backend.
653 : *
654 : * This callback is called whenever a message is received from
655 : * Snap! Communicator. The message is immediately forwarded, unchanged,
656 : * to the snap_backend object which is expected to process it and reply
657 : * if required.
658 : *
659 : * \param[in] message The message we just received.
660 : */
661 0 : void messenger::process_message(snap::snap_communicator_message const & message)
662 : {
663 0 : f_snap_backend->process_message(message);
664 0 : }
665 :
666 :
667 : /** \brief The messenger could not connect to snapcommunicator.
668 : *
669 : * This function is called whenever the messenger fails to
670 : * connect to the snapcommunicator server. This could be
671 : * because snapcommunicator is not running or because the
672 : * information for the snapbackend is wrong...
673 : *
674 : * Note that it is not abnormal as snapcommunicator may not
675 : * have been started yet when snapbackend is started. This
676 : * is okay because we have a messenger system that is resilient.
677 : * However, in normal circumstances, this error should very
678 : * rarely if ever happen.
679 : *
680 : * \param[in] error_message An error message.
681 : */
682 0 : void messenger::process_connection_failed(std::string const & error_message)
683 : {
684 0 : SNAP_LOG_ERROR("connection to snapcommunicator failed (")(error_message)(")");
685 :
686 : // also call the default function, just in case
687 0 : snap_tcp_client_permanent_message_connection::process_connection_failed(error_message);
688 : // then let the snap_backend object react to the failure
689 0 : f_snap_backend->process_connection_failed();
690 0 : }
691 :
692 :
693 : /** \brief The connection was established with Snap! Communicator.
694 : *
695 : * Whenever the connection is established with the Snap! Communicator,
696 : * this callback function is called.
697 : *
698 : * The messenger reacts by REGISTERing the snap_backend with the Snap!
699 : * Communicator. The name of the service is the action it was created
700 : * with, without what precedes the first colon (i.e. the namespace).
701 : */
702 0 : void messenger::process_connected()
703 : {
704 0 : snap_tcp_client_permanent_message_connection::process_connected();
705 : // the +2 below skips two characters, so it assumes the separator is "::"
706 0 : QString action(f_action);
707 0 : int const pos(action.indexOf(':'));
708 0 : if(pos >= 0)
709 : {
710 0 : action = action.mid(pos + 2);
711 : }
712 :
713 0 : snap::snap_communicator_message register_backend;
714 0 : register_backend.set_command("REGISTER");
715 0 : register_backend.add_parameter("service", action);
716 0 : register_backend.add_parameter("version", snap::snap_communicator::VERSION);
717 0 : send_message(register_backend);
718 0 : }
719 :
720 :
721 :
722 : /** \brief A connection between the parent process and child.
723 : *
724 : * Whenever we fork() we want to keep a live connection between the
725 : * parent and the child. We use a Unix pair of sockets for the purpose
726 : * which is implemented with the snap_pipe_message_connection class.
727 : */
728 : class child_connection
729 : : public snap_communicator::snap_pipe_message_connection
730 : {
731 : public:
732 : typedef std::shared_ptr<child_connection> pointer_t;
733 :
734 : child_connection(snap_backend * sb);
735 : child_connection(child_connection const & rhs) = delete;
736 0 : virtual ~child_connection() override {}
737 :
738 : child_connection & operator = (child_connection const & rhs) = delete;
739 :
740 : bool lock(QString const & uri);
741 : void unlock();
742 :
743 : // snap::snap_communicator::snap_pipe_message_connection implementation
744 : virtual void process_message(snap_communicator_message const & message) override;
745 :
746 : private:
747 : snap_backend * f_snap_backend = nullptr;
748 : snap_lock::pointer_t f_lock = snap_lock::pointer_t(); // set by lock(), reset by unlock()
749 : };
750 :
751 :
752 : /** \brief The currently active child connection.
753 : *
754 : * Whenever we are ready to fork() a child to run a backend, we create
755 : * one of these child_connection objects and save it in this variable.
756 : *
757 : * Once the child process dies, we remove the g_child_connection from
758 : * the g_communicator. When we create a new child (fork() again), we
759 : * create a new child connection since the old one will not be valid
760 : * anymore.
761 : */
762 2 : child_connection::pointer_t g_child_connection;
763 :
764 :
765 : /** \brief Initialize the child_connection object.
766 : *
767 : * This function saves the snap backend pointer and names the
768 : * connection "child connection".
769 : *
770 : * \param[in] sb The snap backend pointer.
771 : */
772 0 : child_connection::child_connection(snap_backend * sb)
773 0 : : f_snap_backend(sb)
774 : {
775 0 : set_name("child connection");
776 0 : }
777 :
778 :
779 : /** \brief Lock the child website.
780 : *
781 : * While working on a certain website we want to lock it so only one
782 : * computer backend can work on that specific website at a time.
783 : * The lock is named "*backend* <uri>" and is requested for 4 hours.
784 : *
785 : * \param[in] uri The URI of the website to lock.
786 : *
787 : * \todo
788 : * At this time, because many of the snapcommunicator variables
789 : * are global variables, the child is affected (i.e. when it
790 : * calls exit() it wants to clean those global variables and
791 : * we may get some "weird" side effects--one of which is the
792 : * f_lock, since it sends the UNLOCK command to the snaplock
793 : * tool twice as a result.) We may want to look into completely
794 : * removing the use of global variables. I have done so in a
795 : * couple of tools (under src/) and it worked nicely.
796 : *
797 : * \return true if the lock succeeded.
798 : */
799 0 : bool child_connection::lock(QString const & uri)
800 : {
801 : // if the lock fails, it returns false; note that we want to get a 4h
802 : // lock, but we wait at most the default (5 sec.) to obtain the lock
803 : //
804 0 : f_lock.reset(new snap_lock(QString()));
805 0 : return f_lock->lock(QString("*backend* %1").arg(uri), 4 * 60 * 60);
806 : }
807 :
808 :
809 : /** \brief This function unlocks the child connection.
810 : *
811 : * This function is called whenever the child becomes a zombie. It
812 : * resets the f_lock pointer. Although the destructor would have a
813 : * similar effect, we cannot hope to get the destructor in time (i.e.
814 : * a copy of the connection shared pointer is held by the snapcommunicator
815 : * until we return from all the message processing functions.)
816 : */
817 0 : void child_connection::unlock()
818 : {
819 0 : f_lock.reset();
820 0 : }
821 :
822 :
823 : /** \brief The child sent us a message, process it.
824 : *
825 : * This callback is called whenever the child sends us a message.
826 : * In most cases this is to tell us that it is done with a date
827 : * when it wants to be awakened again. The message is forwarded as is
828 : * to snap_backend::process_child_message().
829 : * \note
830 : * At this point I do not foresee any reason for the child to
831 : * send us any messages. The connection is really for the parent
832 : * process (snap_backend) to be able to send a STOP message to
833 : * the child.
834 : *
835 : * \param[in] message The message we just received.
836 : */
837 0 : void child_connection::process_message(snap_communicator_message const & message)
838 : {
839 0 : f_snap_backend->process_child_message(message);
840 0 : }
841 :
842 :
843 :
844 :
845 : } // no name namespace
846 :
847 :
848 : // initialize the snap_child with the server and save the pid of the process creating this object
849 0 : snap_backend::snap_backend( server_pointer_t s )
850 : : snap_child(s)
851 0 : , f_parent_pid(getpid())
852 : {
853 0 : }
854 :
855 :
856 : /** \brief Clean up the snap backend.
857 : *
858 : * The body of this destructor is currently empty (see the TBD inside).
859 : */
860 0 : snap_backend::~snap_backend()
861 : {
862 : // TBD: should we make sure to delete all the global objects?
863 : // as far as I know they should already be cleared up since
864 : // we cannot exit the run() loop unless there were...
865 0 : }
866 :
867 :
868 : /** \brief Check whether the STOP signal was received.
869 : *
870 : * This function checks whether the parent snap_backend process
871 : * sent us a STOP message. If so the function returns true and
872 : * you are expected to return from your backend as soon as possible.
873 : *
874 : * \todo
875 : * We may eventually want to receive other messages, not just STOP. If so,
876 : * this function should become a pop_message() which returns true when a
877 : * message is indeed popped. Once the STOP is received, only the STOP can be
878 : * popped and it will be popped as many times as the function gets called.
879 : *
880 : * \exception snap_logic_exception Raised when called from the parent process.
881 : *
882 : * \return true if the backend thread received the STOP signal.
883 : */
884 0 : bool snap_backend::stop_received() const
885 : {
886 0 : if(getpid() == f_parent_pid)
887 : {
888 0 : throw snap_logic_exception("snap_backend::stop_received(): Function called from the parent process. It can only be used from the child.");
889 : }
890 :
891 : // make sure to process any pending messages
892 : //
893 : // Note: we definitively are in the child process, so the
894 : // g_child_connection exists
895 : //
896 0 : g_child_connection->process_read();
897 :
898 0 : return f_stop_received;
899 : }
900 :
901 :
902 : /** \brief Add a website URI to process on 'date'.
903 : *
904 : * This function is used to add the URI of a website that needs to be
905 : * processed on a certain date. URIs are organized by action, then by date.
906 : *
907 : * \warning
908 : * The action MUST include the namespace. So if you call from a plugin
909 : * named "list", for example, the action name must start with "list::"
910 : * as in "list::pagelist". Otherwise it will not match the f_action
911 : * parameter used in other places and the data will be ignored.
912 : *
913 : * \param[in] action The action concerned by this.
914 : * \param[in] date The date when this action should next be applied.
915 : * \param[in] website_uri The website on which \p action is applied on \p date.
916 : *
917 : * \return true if the entry exists or was saved; false on an exception.
918 : */
919 0 : bool snap_backend::add_uri_for_processing(QString const & action, int64_t date, QString const & website_uri)
920 : {
921 : try
922 : {
923 0 : QString const action_reference(QString("*%1*").arg(action));
924 0 : int64_t const previous_entry(f_backend_table->getRow(action_reference)->getCell(website_uri)->getValue().safeInt64Value(0, -1));
925 0 : if(previous_entry != -1)
926 : {
927 0 : QByteArray column_key;
928 0 : libdbproxy::appendInt64Value(column_key, previous_entry);
929 :
930 : // is the existing entry dated at or before the new date?
931 : //
932 0 : if(previous_entry <= date)
933 : {
934 : // make sure there is indeed an entry though because bugs
935 : // creep in and the other cell may not be in place anymore
936 : // and a "return" here would prevent further work on any
937 : // backend processing
938 : //
939 0 : if(f_backend_table->getRow(action)->exists(column_key))
940 : {
941 : // we already have that entry at the same date or earlier
942 : //
943 0 : return true;
944 : }
945 : }
946 :
947 : // make sure we drop the other reference to avoid
948 : // (generally useless) duplicates
949 : //
950 0 : f_backend_table->getRow(action)->dropCell(column_key);
951 : }
952 :
953 0 : QByteArray date_key;
954 0 : libdbproxy::appendInt64Value(date_key, date);
955 0 : f_backend_table->getRow(action)->getCell(date_key)->setValue(website_uri);
956 :
957 : // save a reference so we can drop the entry as required
958 : //
959 0 : f_backend_table->getRow(action_reference)->getCell(website_uri)->setValue(date);
960 :
961 0 : return true;
962 : }
963 0 : catch(std::exception const & e)
964 : {
965 0 : SNAP_LOG_WARNING("Got an exception while adding a URI for processing: ")(e.what());
966 :
967 : // pause for 30 seconds, then we will try again
968 : //
969 0 : request_cassandra_status();
970 :
971 0 : return false;
972 : }
973 : }
974 :
975 :
976 : /** \brief Remove a URI once it was processed.
977 : *
978 : * This function removes the specified website URI from the backend table.
979 : * The function makes use of the reference we save there.
980 : *
981 : * The function is called when the child process the specified URI dies.
982 : * Also, if the website is not ready when we are, we remove the URI from
983 : * the list since there is no need to have it there. It will be re-added
984 : * when we get a PING or within five minutes.
985 : *
986 : * \param[in] action The action where a website URI is to be removed.
987 : * \param[in] key The key to drop.
988 : * \param[in] website_uri The URI to be removed.
989 : *
990 : * \return true if the removal worked as expected; false if we lose the
991 : * connection to the database in the process.
992 : */
993 0 : bool snap_backend::remove_processed_uri(QString const & action, QByteArray const & key, QString const & website_uri)
994 : {
995 : try
996 : {
997 0 : QString const action_reference(QString("*%1*").arg(action));
998 0 : int64_t const previous_entry(f_backend_table->getRow(action_reference)->getCell(website_uri)->getValue().safeInt64Value(0, -1));
999 0 : if(previous_entry != -1)
1000 : {
1001 : // drop the actual entry and the reference
1002 0 : QByteArray column_key;
1003 0 : libdbproxy::appendInt64Value(column_key, previous_entry);
1004 0 : f_backend_table->getRow(action)->dropCell(column_key);
1005 : }
1006 :
1007 : // just in case, alway sforce a drop on this cell (it should not
1008 : // exist if previous_entry was -1)
1009 : //
1010 0 : f_backend_table->getRow(action_reference)->dropCell(website_uri);
1011 :
1012 : // also remove the processed entry (which is the one we really use
1013 : // to find what has to be worked on)
1014 : //
1015 0 : f_backend_table->getRow(action)->dropCell(key);
1016 :
1017 0 : return true;
1018 : }
1019 0 : catch(std::exception const & e)
1020 : {
1021 0 : SNAP_LOG_WARNING("Got an exception while adding a URI for processing: ")(e.what());
1022 :
1023 : // pause for 30 seconds, then we will try again
1024 : //
1025 0 : request_cassandra_status();
1026 :
1027 0 : return false;
1028 : }
1029 : }
1030 :
1031 :
1032 : /** \brief Execute the backend processes after initialization.
1033 : *
1034 : * This function is somewhat similar to the process() function. It is used
1035 : * to ready the server and then run the backend processes by sending a
1036 : * signal.
1037 : */
1038 0 : void snap_backend::run_backend()
1039 : {
1040 : // TBD: the calling main() function already has a try/catch, we could
1041 : // remove this one?
1042 : //
1043 : try
1044 : {
1045 0 : process_action();
1046 :
1047 0 : SNAP_LOG_INFO("snap_backend::run_backend(): exiting normally.");
1048 :
1049 : // return normally if no exception occurred
1050 : //
1051 0 : return;
1052 : }
1053 0 : catch( snap_exception const & e )
1054 : {
1055 0 : SNAP_LOG_FATAL("snap_backend::run_backend(): snap_exception caught: ")(e.what());
1056 : }
1057 0 : catch( std::exception const & e )
1058 : {
1059 0 : SNAP_LOG_FATAL("snap_backend::run_backend(): std::exception caught: ")(e.what());
1060 : }
1061 0 : catch( ... )
1062 : {
1063 0 : SNAP_LOG_FATAL("snap_backend::run_backend(): unknown exception caught!");
1064 : }
1065 0 : exit(1);
1066 0 : NOTREACHED();
1067 : }
1068 :
1069 :
1070 0 : void snap_backend::process_action()
1071 : {
1072 0 : auto p_server = f_server.lock();
1073 0 : if(!p_server)
1074 : {
1075 0 : throw snap_child_exception_no_server("snap_backend::process_action(): The p_server weak pointer could not be locked");
1076 : }
1077 :
1078 0 : init_start_date();
1079 :
1080 : // somewhat fake being a child (we are not here)
1081 0 : f_is_child = true;
1082 0 : f_child_pid = getpid();
1083 0 : f_client.reset();
1084 :
1085 : // define a User-Agent for all backends
1086 : //
1087 : // TBD: should that be a parameter in the .conf file?
1088 : //
1089 0 : f_env[snap::get_name(name_t::SNAP_NAME_CORE_HTTP_USER_AGENT)] = "Snap! Backend (" SNAPWEBSITES_VERSION_STRING ")";
1090 :
1091 : // define the action and whether it is a CRON action
1092 : //
1093 0 : f_action = p_server->get_parameter("__BACKEND_ACTION");
1094 0 : if(f_action.isEmpty())
1095 : {
1096 0 : f_action = p_server->get_parameter("__BACKEND_CRON_ACTION");
1097 0 : if(f_action.isEmpty())
1098 : {
1099 : // the default action is "snapbackend", which is not a CRON
1100 : // action and runs the backend_process() signal
1101 : // (see plugins/content/backend.cpp where we do the call)
1102 : // It is part of the content plugin to avoid having to
1103 : // carry a special case all around
1104 : //
1105 0 : f_action = QString("content::%1").arg(get_name(name_t::SNAP_NAME_CORE_SNAPBACKEND));
1106 : }
1107 : else
1108 : {
1109 0 : f_cron_action = true;
1110 : }
1111 : }
1112 :
1113 : // get the URI, since it does not change over time within one
1114 : // run, we save it in a variable member
1115 : //
1116 0 : f_website = p_server->get_parameter("__BACKEND_URI");
1117 :
1118 : // check whether this action should use a global lock when running
1119 : // (this is for those actions that cannot be run simultaneously
1120 : // against more than one website at a time; i.e. the sendmail
1121 : // backend is website agnostic, for example.)
1122 : //
1123 0 : f_global_lock = !p_server->get_parameter("GLOBAL_LOCK").isEmpty();
1124 :
1125 : // get the snap_communicator singleton
1126 : //
1127 0 : g_communicator = snap_communicator::instance();
1128 :
1129 : // create a TCP messenger connected to the Snap! Communicator server
1130 : //
1131 : {
1132 0 : QString tcp_addr("127.0.0.1");
1133 0 : int tcp_port(4040);
1134 0 : snap_config parameters("snapcommunicator");
1135 0 : tcp_client_server::get_addr_port(QString(parameters["local_listen"]), tcp_addr, tcp_port, "tcp");
1136 0 : g_messenger.reset(new messenger(this, f_action, tcp_addr.toUtf8().data(), tcp_port));
1137 0 : g_communicator->add_connection(g_messenger);
1138 :
1139 0 : snap::snap_lock::initialize_snapcommunicator(tcp_addr.toUtf8().data(), tcp_port);
1140 :
1141 0 : p_server->configure_messenger_logging( g_messenger );
1142 : }
1143 :
1144 0 : g_interrupt.reset(new backend_interrupt(this));
1145 0 : g_communicator->add_connection(g_interrupt);
1146 :
1147 : // create a Cassandra timer; we use it in the "READY" and snapbackend
1148 : // is not called with a specific action (i.e. a CRON backend); if the
1149 : // timer times out, then we force an exit with a failure status
1150 : //
1151 : {
1152 0 : g_cassandra_timer.reset(new cassandra_timer(this));
1153 0 : g_communicator->add_connection(g_cassandra_timer);
1154 : }
1155 :
1156 : // create a reconnect timer; if we lose the connection to the Cassandra
1157 : // cluster (more precisely, the snapdbproxy local daemon which cuts us
1158 : // off on a throw by the libQtCassandra library), then we want to
1159 : // reconnect after a little while which is handled by this timer;
1160 : // note that the reconnect is actually sending a new CASSANDRASTATUS
1161 : // message and the rest is done as before
1162 : //
1163 : {
1164 0 : g_reconnect_timer.reset(new reconnect_timer(this));
1165 0 : g_communicator->add_connection(g_reconnect_timer);
1166 : }
1167 :
1168 : // create a tick timer; every five minutes we add work to our
1169 : // backend table which is in turn processed whenever the wake up
1170 : // timer happens
1171 : //
1172 : {
1173 0 : g_tick_timer.reset(new tick_timer(this));
1174 0 : g_communicator->add_connection(g_tick_timer);
1175 : }
1176 :
1177 : // create a wake up timer; whenever we have work to do, this timer
1178 : // is used to run the next entry at its specified date and time
1179 : //
1180 : {
1181 0 : g_wakeup_timer.reset(new wakeup_timer(this));
1182 0 : g_communicator->add_connection(g_wakeup_timer);
1183 : }
1184 :
1185 : // we want to immediately be signaled whenever a child process dies
1186 : // so we can move to work on the next one
1187 : //
1188 : {
1189 0 : g_signal_child_death.reset(new signal_child_death(this));
1190 0 : g_communicator->add_connection(g_signal_child_death);
1191 : }
1192 :
1193 0 : SNAP_LOG_INFO("------------------------------------ CRON backend ")(f_action)(" started.");
1194 :
1195 0 : p_server->server_loop_ready();
1196 :
1197 : // start our event loop
1198 : //
1199 : //SNAP_LOG_WARNING("entering run() loop with action: ")(f_action);
1200 0 : g_communicator->run();
1201 0 : }
1202 :
1203 :
1204 : /** \brief Called once on startup and then once every 5 minutes.
1205 : *
1206 : * This function is called once immediately (we set a timeout date
1207 : * of 'now' on initialization) and then once every five minutes.
1208 : * This is used for CRON actions where the backend process needs
1209 : * to be repeated once in a while to ensure proper functioning
1210 : * of the websites as a whole.
1211 : *
1212 : * \note
1213 : * The direct backend processing (snapbackend) and specific website
1214 : * backend processing (snapbackend https://snapwebsites.org/) are
1215 : * also directed here as both of these are also processed in a
1216 : * similar way.
1217 : */
1218 0 : void snap_backend::process_tick()
1219 : {
1220 : // STOP received?
1221 : //
1222 0 : if(f_stop_received)
1223 : {
1224 0 : return;
1225 : }
1226 :
1227 : // if the user gave us a specific website to process, we cannot add
1228 : // the URI to the backend table
1229 : //
1230 0 : if( f_website.isEmpty() )
1231 : {
1232 : // if the "sites" table does not even exists, then either wait
1233 : // or quit immediately
1234 : //
1235 0 : if(!is_ready(""))
1236 : {
1237 0 : if(!f_cron_action)
1238 : {
1239 : // one reason for is_ready() to not return true is if snaplock
1240 : // is not up yet
1241 : //
1242 0 : if(!f_snaplock)
1243 : {
1244 0 : SNAP_LOG_FATAL("snap_backend::process_tick(): The Snap! Lock daemon is not available.");
1245 0 : exit(1);
1246 0 : snap::NOTREACHED();
1247 : }
1248 :
1249 : // if we are connected to cassandra but are not marked ready
1250 : // that means the "sites" table is not yet defined
1251 : //
1252 0 : if(f_cassandra != nullptr)
1253 : {
1254 0 : SNAP_LOG_FATAL("snap_backend::process_tick(): The \"sites\" table does not even exist, we cannot yet run a backend action.");
1255 0 : exit(1);
1256 0 : snap::NOTREACHED();
1257 : }
1258 :
1259 : // The CRON behavior ends up here all the time because we
1260 : // now wait for the CASSANDRAREADY event before the
1261 : // is_ready() function returns true... so we have to
1262 : // wait a bit before we exit with a fatal error
1263 : //
1264 0 : ++f_not_ready_counter;
1265 0 : if(f_not_ready_counter > 3) // 3 represent a total of 30 seconds of wait at this time (see the 10 second wait below)
1266 : {
1267 0 : SNAP_LOG_FATAL("snap_backend::process_tick(): We could not connect to snapdbproxy within 30 seconds.");
1268 0 : exit(1);
1269 0 : snap::NOTREACHED();
1270 : }
1271 : }
1272 :
1273 0 : if(f_cassandra == nullptr)
1274 : {
1275 0 : SNAP_LOG_WARNING("snap_backend::process_tick(): not yet connected to snapdbproxy.");
1276 : }
1277 0 : else if(f_emit_warning_about_missing_sites)
1278 : {
1279 0 : f_emit_warning_about_missing_sites = false;
1280 :
1281 : // the whole table is still missing after 5 minutes!
1282 : // in this case it is an error instead of a fatal error
1283 0 : SNAP_LOG_WARNING("snap_backend::process_tick(): The \"sites\" table is still empty or nonexistent! Waiting before starting the \"")(f_action)("\" backend processing (a CRON action).");
1284 : }
1285 :
1286 : // TBD: keep this the way it is or use the CASSANDRAREADY signal?
1287 : //
1288 : // the website is not ready, wait another 10 seconds and try
1289 : // again; if I'm correct, this should not happen anymore with
1290 : // the current installation process...
1291 : //
1292 : // here we use the timeout date to not have to change the
1293 : // ticking clock
1294 : //
1295 : // TBD: should we instead slide the ticking clock?
1296 : //
1297 0 : g_tick_timer->set_timeout_date(snap_communicator::get_current_date() + 10LL * 1000000LL);
1298 0 : return;
1299 : }
1300 :
1301 : // make sure we reset the "not ready counter" once ready
1302 : //
1303 0 : f_not_ready_counter = 0;
1304 :
1305 : try
1306 : {
1307 : // if a site exists then it has a "core::last_updated" entry
1308 : //
1309 0 : f_sites_table->clearCache(); // just in case, make sure we do not have a query still laying around
1310 0 : auto column_predicate(std::make_shared<libdbproxy::cell_key_predicate>());
1311 0 : column_predicate->setCellKey(get_name(name_t::SNAP_NAME_CORE_LAST_UPDATED));
1312 0 : auto row_predicate(std::make_shared<libdbproxy::row_predicate>());
1313 0 : row_predicate->setCellPredicate(column_predicate);
1314 : for(;;)
1315 : {
1316 : // WARNING: at this point the f_sites_table may be NULL
1317 : // because we call add_uri_for_processing()
1318 : // and that may throw and call
1319 : // request_cassandra_status() which clears
1320 : // everything (notice that we have two for loops
1321 : // and the inner loop breaks on error instead
1322 : // of "goto exit" or something of the sort...
1323 : // because the add_uri_for_processing() may
1324 : // return false for other reasons than a throw.)
1325 : //
1326 0 : if(f_sites_table == nullptr
1327 0 : || f_sites_table->readRows(row_predicate) == 0)
1328 : {
1329 : // no more websites to process
1330 0 : break;
1331 : }
1332 :
1333 : // got some websites
1334 : //
1335 0 : libdbproxy::rows const rows(f_sites_table->getRows());
1336 0 : for(libdbproxy::rows::const_iterator it(rows.begin());
1337 0 : it != rows.end();
1338 : ++it)
1339 : {
1340 0 : QString const key(QString::fromUtf8(it.key().data()));
1341 0 : if(!add_uri_for_processing(f_action, get_current_date(), key))
1342 : {
1343 : // this happens if an error occurs while working with
1344 : // the database; in that case we cannot go any further
1345 : //
1346 0 : break;
1347 : }
1348 : }
1349 0 : }
1350 : }
1351 0 : catch(std::exception const & e)
1352 : {
1353 0 : SNAP_LOG_WARNING("Got an exception while adding a URI for processing: ")(e.what());
1354 :
1355 : // pause for 30 seconds, then we will try again
1356 : //
1357 0 : request_cassandra_status();
1358 : }
1359 : }
1360 :
1361 : // if no child is currently running, wake up the messenger ASAP
1362 : //
1363 0 : if(g_child_connection == nullptr)
1364 : {
1365 : #ifdef DEBUG
1366 0 : SNAP_LOG_TRACE("Immediately tick the wakeup_timer from the last tick timeout.");
1367 : #endif
1368 0 : g_wakeup_timer->set_timeout_date(snap_communicator::get_current_date());
1369 : }
1370 : }
1371 :
1372 :
1373 : /** \brief Timeout is called whenever a child process needs to be started.
1374 : *
1375 : * This function is called when the Snap! Communicator main messenger
1376 : * connection times out. This generally means the child process needs
1377 : * to be started with a URI.
1378 : *
1379 : * \return true if a new backend was started on this call.
1380 : */
1381 0 : bool snap_backend::process_timeout()
1382 : {
1383 : // STOP received?
1384 : // Child still running? (our timer should never be on when we have
1385 : // a child running, but it is way safer this way)
1386 : //
1387 0 : if(f_stop_received
1388 0 : || g_child_connection != nullptr)
1389 : {
1390 0 : return false;
1391 : }
1392 :
1393 0 : if( f_website.isEmpty() )
1394 : {
1395 : // if we reach here f_sites_tables and f_backend_table should
1396 : // both be defined, but just in case since we have rather lose
1397 : // event agreggations...
1398 : //
1399 0 : if(f_sites_table == nullptr
1400 0 : || f_backend_table == nullptr)
1401 : {
1402 0 : return false;
1403 : }
1404 :
1405 : // the connection to snapdbproxy may be severed while attempting
1406 : // to read more data; here we do a try catch so we can have a
1407 : // pause and attempt to reconnect later (30 seconds later)
1408 : //
1409 : // See SNAP-529 for details
1410 : //
1411 : try
1412 : {
1413 : // if the user did not give us a specific website to work on
1414 : // we want to check for the next entry in our backend table
1415 : //
1416 0 : libdbproxy::row::pointer_t row(f_backend_table->getRow(f_action));
1417 0 : row->clearCache(); // just in case, make sure we do not have a query laying around
1418 0 : row->setTimeout(60LL * 1000LL); // wait up to 1 min. to load the cells
1419 0 : auto column_predicate(std::make_shared<libdbproxy::cell_range_predicate>());
1420 0 : column_predicate->setCount(1); // read only the first row -- WARNING: if you increase that number you MUST add a sub-loop
1421 0 : column_predicate->setIndex(); // behave like an index
1422 : for(;;)
1423 : {
1424 0 : row->readCells(column_predicate);
1425 0 : libdbproxy::cells const cells(row->getCells());
1426 0 : if(cells.isEmpty())
1427 : {
1428 : // it looks like we are done
1429 0 : break;
1430 : }
1431 :
1432 : // check whether the time is past, if it is in more than 10ms
1433 : // then we want to go to sleep again, otherwise we start
1434 : // processing that website now
1435 : //
1436 0 : QByteArray const key(cells.begin().key());
1437 0 : int64_t const time_limit(libdbproxy::safeInt64Value(key, 0, 0));
1438 0 : if(time_limit <= get_current_date() + 10000LL)
1439 : {
1440 : // note how we remove the URI from the backend table before
1441 : // we processed it: this is much safer, if that website
1442 : // (currently) has a problem, then we just end up skipping
1443 : // it and we will just try again later.
1444 : //
1445 0 : libdbproxy::cell::pointer_t cell(*cells.begin());
1446 0 : QString const website_uri(cell->getValue().stringValue());
1447 0 : remove_processed_uri(f_action, key, website_uri);
1448 0 : if(process_backend_uri(website_uri))
1449 : {
1450 0 : return true;
1451 : }
1452 : }
1453 : else
1454 : {
1455 : // we found one that needs to be started in the future
1456 : // we can exit the loop now after we stamped the timer
1457 : // for when we want to wake up next
1458 : //
1459 0 : g_wakeup_timer->set_timeout_date(time_limit);
1460 0 : break;
1461 : }
1462 0 : }
1463 : }
1464 0 : catch(std::exception const & e)
1465 : {
1466 0 : SNAP_LOG_WARNING("Got an exception while searching for the next website to work on: ")(e.what());
1467 :
1468 : // pause for 30 seconds, then we will try again
1469 : //
1470 0 : request_cassandra_status();
1471 : }
1472 : }
1473 : else
1474 : {
1475 0 : process_backend_uri(f_website);
1476 0 : return true;
1477 : }
1478 :
1479 0 : return false;
1480 : }
1481 :
1482 :
1483 : /** \brief Process a message received from Snap! Communicator.
1484 : *
1485 : * This function gets called whenever the Snap! Communicator sends
1486 : * us a message. This includes the READY and HELP commands, although
1487 : * the most important one is certainly the STOP command used to request
1488 : * this process to STOP as soon as possible.
1489 : *
1490 : * \param[in] message The message we just received.
1491 : */
1492 0 : void snap_backend::process_message(snap::snap_communicator_message const & message)
1493 : {
1494 0 : QString const command(message.get_command());
1495 :
1496 : // STATUS is sent too many times, so do not trace them all...
1497 0 : if(command != "STATUS")
1498 : {
1499 0 : SNAP_LOG_TRACE("received messenger message [")(message.to_message())("] for ")(f_action);
1500 : }
1501 :
1502 0 : if(command == "PING")
1503 : {
1504 : // only CRON actions accept PING messages
1505 : //
1506 0 : if(!f_cron_action)
1507 : {
1508 0 : SNAP_LOG_ERROR("PING sent to a backend which is not a CRON action. PING will be ignored.");
1509 0 : return;
1510 : }
1511 :
1512 : // someone is asking us to restart the child for a specific URI
1513 : //
1514 0 : QString uri;
1515 0 : if(message.has_parameter("uri"))
1516 : {
1517 0 : uri = message.get_parameter("uri");
1518 : }
1519 0 : if(uri.isEmpty())
1520 : {
1521 0 : SNAP_LOG_ERROR("PING sent to \"")(f_action)("\" backend without a URI. PING will be ignored.");
1522 0 : return;
1523 : }
1524 :
1525 0 : if(f_website.isEmpty()
1526 0 : && is_ready(""))
1527 : {
1528 0 : if(add_uri_for_processing(f_action, get_current_date(), uri))
1529 : {
1530 : // if no child is currently running, wake up the messenger ASAP
1531 : //
1532 0 : if(g_child_connection == nullptr)
1533 : {
1534 : #ifdef DEBUG
1535 0 : SNAP_LOG_TRACE("Run the child now since it was not running.");
1536 : #endif
1537 0 : g_wakeup_timer->set_timeout_date(snap_communicator::get_current_date());
1538 : }
1539 : }
1540 : }
1541 : else
1542 : {
1543 0 : f_pinged = true;
1544 : }
1545 0 : return;
1546 : }
1547 :
1548 0 : if(command == "LOG")
1549 : {
1550 : // logrotate just rotated the logs, we have to reconfigure
1551 : //
1552 0 : SNAP_LOG_INFO("Logging reconfiguration.");
1553 0 : logging::reconfigure();
1554 0 : return;
1555 : }
1556 :
1557 0 : if(command == "STOP")
1558 : {
1559 : // Someone is asking us to leave
1560 : //
1561 0 : stop(false);
1562 0 : return;
1563 : }
1564 0 : if(command == "QUITTING")
1565 : {
1566 : // If we received the QUITTING command, then somehow we sent
1567 : // a message to Snap! Communicator, which is already in the
1568 : // process of quitting... we should get a STOP too, but we
1569 : // can just quit ASAP too
1570 : //
1571 0 : stop(true);
1572 0 : return;
1573 : }
1574 :
1575 0 : if(command == "READY")
1576 : {
1577 : // Snap! Communicator received our REGISTER command
1578 : //
1579 :
1580 : // if the user called snapbackend as the CRON action (i.e. no --action
1581 : // specified on the command line) then we want to be able to time out
1582 : // if snapdbproxy never sends us a CASSANDRAREADY message
1583 : //
1584 0 : if(!f_cron_action)
1585 : {
1586 0 : g_cassandra_timer->set_enable(true);
1587 0 : g_cassandra_timer->set_timeout_date(snap_communicator::get_current_date() + cassandra_timer::MAX_START_INTERVAL);
1588 : }
1589 :
1590 : // request snapdbproxy to send us a status signal about
1591 : // Cassandra, after that one call, we will receive the
1592 : // statuses just because we understand them.
1593 : //
1594 0 : process_reconnect(); // simulate a process_reconnect() timeout
1595 :
1596 : // request snapcommunicator to send us a STATUS message
1597 : // about the current status of the snaplock service
1598 : //
1599 0 : snap::snap_communicator_message islockready_message;
1600 0 : islockready_message.set_command("SERVICESTATUS");
1601 0 : islockready_message.add_parameter("service", "snaplock");
1602 0 : g_messenger->send_message(islockready_message);
1603 :
1604 0 : return;
1605 : }
1606 :
1607 0 : if(command == "NOCASSANDRA")
1608 : {
1609 : // we lost Cassandra, disconnect from snapdbproxy until we
1610 : // get CASSANDRAREADY again
1611 : //
1612 0 : f_auto_retry_cassandra = false;
1613 0 : disconnect_cassandra();
1614 :
1615 0 : return;
1616 : }
1617 :
1618 0 : if(command == "CASSANDRAREADY")
1619 : {
1620 : // cancel timeouts
1621 : //
1622 0 : if(!f_cron_action
1623 0 : && g_cassandra_timer != nullptr)
1624 : {
1625 0 : g_cassandra_timer->set_enable(false);
1626 : }
1627 0 : if(g_reconnect_timer != nullptr)
1628 : {
1629 : // WARNING: this one we do not disable, instead we avoid the
1630 : // timeout by setting the date to -1
1631 : //
1632 : // because the CASSANDRAREADY message can happen
1633 : // back to back, the timer can be started at this
1634 : // point; this can happens on startup when
1635 : // snapdbproxy broadcasts its CASSANDRAREADY message
1636 : // and the snapbackend process already sent a
1637 : // CASSANDRASTATUS message and the first CASSANDRAREADY
1638 : // message processing ended up with an error
1639 : //
1640 0 : g_reconnect_timer->set_timeout_date(-1);
1641 : }
1642 :
1643 : // connect to Cassandra
1644 : //
1645 : // IMPORTANT NOTE: We are likely to receive two of these events
1646 : // in a raw (i.e. the broadcast version and the
1647 : // one specifically sent to this or that
1648 : // specifically running backend)
1649 : //
1650 0 : f_auto_retry_cassandra = true;
1651 0 : if(!connect_cassandra(false))
1652 : {
1653 0 : SNAP_LOG_WARNING("snapwebsites failed to connect to snapdbproxy (process_message())");
1654 :
1655 0 : disconnect_cassandra();
1656 : }
1657 0 : else if(g_tick_timer != nullptr && !f_stop_received)
1658 : {
1659 : // we are now ready to try running a child process
1660 : //
1661 0 : g_tick_timer->set_enable(true);
1662 0 : g_tick_timer->set_timeout_date(snap_communicator::get_current_date());
1663 : }
1664 :
1665 0 : return;
1666 : }
1667 :
1668 0 : if(command == "STATUS")
1669 : {
1670 0 : if(message.get_parameter("service") == "snaplock")
1671 : {
1672 : // show the one STATUS that we manage here
1673 : //
1674 0 : SNAP_LOG_TRACE("received messenger message [")(message.to_message())("] for ")(f_action);
1675 :
1676 0 : f_snaplock = message.has_parameter("status")
1677 0 : && message.get_parameter("status") == "up";
1678 : }
1679 : // else -- ignore all others
1680 :
1681 0 : return;
1682 : }
1683 :
1684 0 : if(command == "HELP")
1685 : {
1686 : // Snap! Communicator is asking us about the commands that we support
1687 : //
1688 0 : snap::snap_communicator_message reply;
1689 0 : reply.set_command("COMMANDS");
1690 :
1691 : // list of commands understood by service
1692 : //
1693 0 : reply.add_parameter("list", "CASSANDRAREADY,HELP,LOG,NOCASSANDRA,PING,QUITTING,READY,STATUS,STOP,UNKNOWN");
1694 :
1695 0 : g_messenger->send_message(reply);
1696 0 : return;
1697 : }
1698 :
1699 0 : if(command == "UNKNOWN")
1700 : {
1701 : // we sent a command that Snap! Communicator did not understand
1702 : //
1703 0 : SNAP_LOG_ERROR("we sent unknown command \"")(message.get_parameter("command"))("\" and probably did not get the expected result.");
1704 0 : return;
1705 : }
1706 :
1707 : // unknown command is reported and process goes on
1708 : //
1709 0 : SNAP_LOG_ERROR("unsupported command \"")(command)("\" was received on the connection with Snap! Communicator.");
1710 : {
1711 0 : snap::snap_communicator_message reply;
1712 0 : reply.set_command("UNKNOWN");
1713 0 : reply.add_parameter("command", command);
1714 0 : g_messenger->send_message(reply);
1715 : }
1716 :
1717 0 : return;
1718 : }
1719 :
1720 :
1721 0 : void snap_backend::disconnect_cassandra()
1722 : {
1723 : // we are in control of the backend table
1724 : //
1725 0 : f_backend_table.reset();
1726 :
1727 : // we have our own f_sites_table variable
1728 : // (TBD: maybe we could share the snap_child one? right now it is private.)
1729 : //
1730 0 : f_sites_table.reset();
1731 :
1732 : // the disconnect_cassandra() in snap_child already takes care of
1733 : //
1734 : // f_sites_table
1735 : // f_context
1736 : // f_cassandra
1737 : //
1738 0 : snap_child::disconnect_cassandra();
1739 0 : }
1740 :
1741 :
1742 0 : void snap_backend::request_cassandra_status()
1743 : {
1744 0 : SNAP_LOG_TRACE("requesting a CASSANDRASTATUS message because we got an error from our connection with snapdbproxy");
1745 :
1746 : // since we are going to disconnect, there is no need for this timer
1747 : // so we can as well disable it; it will be re-enabled when we
1748 : // receive the CASSANDRAREADY message
1749 : //
1750 0 : g_tick_timer->set_enable(false);
1751 :
1752 : // make sure the rest of the class knows that the current state
1753 : // is viewed as "not good"--okay, just kidding, it is viewed as
1754 : // not connected to Cassandra so we need to reset the pointers
1755 : //
1756 : // also we do not want an auto-retry in case the snapdbproxy is
1757 : // really not available
1758 : //
1759 0 : f_auto_retry_cassandra = false;
1760 0 : disconnect_cassandra();
1761 :
1762 : // whether the user wants to request a new CASSANDRASTATUS to be sent
1763 : //
1764 0 : SNAP_LOG_WARNING("get ready for a reconnect in 30 seconds");
1765 0 : int64_t const now(snap::snap_communicator::get_current_date());
1766 0 : int64_t const reconnect_date(now + 30LL * 1000000LL);
1767 0 : g_reconnect_timer->set_timeout_date(reconnect_date);
1768 0 : }
1769 :
1770 :
1771 0 : void snap_backend::process_reconnect()
1772 : {
1773 0 : SNAP_LOG_TRACE("sending the CASSANDRASTATUS message");
1774 :
1775 0 : snap::snap_communicator_message isdbready_message;
1776 0 : isdbready_message.set_command("CASSANDRASTATUS");
1777 0 : isdbready_message.set_service("snapdbproxy");
1778 0 : g_messenger->send_message(isdbready_message);
1779 0 : }
1780 :
1781 :
1782 : /** \brief Called whenever we receive the STOP command or equivalent.
1783 : *
1784 : * This function makes sure the snapbackend exits as quickly as
1785 : * possible.
1786 : *
1787 : * \li Marks the messenger as done.
1788 : * \li Disabled wake up and tick timers.
1789 : * \li UNREGISTER from snapcommunicator.
1790 : * \li STOP child if one is currently running.
1791 : * \li Remove timers and child death connections if no child is running.
1792 : *
1793 : * \note
1794 : * If the g_messenger is still in place, then just sending the
1795 : * UNREGISTER is enough to quit normally. The socket of the
1796 : * g_messenger will be closed by the snapcommunicator server
1797 : * and we will get a HUP signal. However, we get the HUP only
1798 : * because we first mark the messenger as done.
1799 : *
1800 : * \param[in] quitting Set to true if we received a QUITTING message.
1801 : */
1802 0 : void snap_backend::stop(bool quitting)
1803 : {
1804 0 : f_stop_received = true;
1805 :
1806 : // stop the timers immediately, although that will not prevent
1807 : // one more call to their callbacks which thus still have to
1808 : // check the f_stop_received flag
1809 : //
1810 0 : if(g_cassandra_timer != nullptr)
1811 : {
1812 0 : g_cassandra_timer->set_enable(false);
1813 0 : g_cassandra_timer->set_timeout_date(-1);
1814 : }
1815 0 : if(g_reconnect_timer != nullptr)
1816 : {
1817 0 : g_reconnect_timer->set_enable(false);
1818 0 : g_reconnect_timer->set_timeout_date(-1);
1819 : }
1820 0 : if(g_tick_timer != nullptr)
1821 : {
1822 0 : g_tick_timer->set_enable(false);
1823 0 : g_tick_timer->set_timeout_delay(-1);
1824 : }
1825 0 : if(g_wakeup_timer != nullptr)
1826 : {
1827 0 : g_wakeup_timer->set_enable(false);
1828 0 : g_wakeup_timer->set_timeout_date(-1);
1829 : }
1830 :
1831 0 : if(g_messenger != nullptr)
1832 : {
1833 0 : if(quitting || !g_messenger->is_connected())
1834 : {
1835 : // turn off that connection now, we cannot UNREGISTER since
1836 : // we are not connected to snapcommunicator
1837 : //
1838 0 : g_communicator->remove_connection(g_messenger);
1839 0 : g_messenger.reset();
1840 : }
1841 : else
1842 : {
1843 0 : g_messenger->mark_done();
1844 :
1845 : // unregister if we are still connected to the messenger
1846 : // and Snap! Communicator is not already quitting
1847 : //
1848 0 : QString action(f_action);
1849 0 : int const pos(action.indexOf(':'));
1850 0 : if(pos >= 0)
1851 : {
1852 0 : action = action.mid(pos + 2);
1853 : }
1854 :
1855 0 : snap::snap_communicator_message cmd;
1856 0 : cmd.set_command("UNREGISTER");
1857 0 : cmd.add_parameter("service", action);
1858 0 : g_messenger->send_message(cmd);
1859 : }
1860 : }
1861 :
1862 : // if we still have a child, ask the child to quit first
1863 : //
1864 0 : if(g_child_connection != nullptr)
1865 : {
1866 : // propagate the STOP to our current child process
1867 : //
1868 0 : snap::snap_communicator_message cmd;
1869 0 : cmd.set_command("STOP");
1870 0 : g_child_connection->send_message(cmd);
1871 : }
1872 : else
1873 : {
1874 : // we don't remove the messenger here because we either already
1875 : // have done so above or sent the UNREGISTER message; in the
1876 : // second case, it will auto-hangup shortly
1877 : //
1878 : //g_communicator->remove_connection(g_messenger);
1879 :
1880 0 : g_communicator->remove_connection(g_cassandra_timer);
1881 0 : g_communicator->remove_connection(g_reconnect_timer);
1882 0 : g_communicator->remove_connection(g_tick_timer);
1883 0 : g_communicator->remove_connection(g_wakeup_timer);
1884 0 : g_communicator->remove_connection(g_signal_child_death);
1885 : }
1886 :
1887 0 : g_communicator->remove_connection(g_interrupt);
1888 0 : }
1889 :
1890 :
1891 0 : void snap_backend::process_connection_failed()
1892 : {
1893 : // if this was not called with --action, increase a counter and
1894 : // quit after a few tries
1895 : //
1896 0 : if(!f_cron_action)
1897 : {
1898 0 : ++f_error_count;
1899 0 : if(f_error_count >= 3)
1900 : {
1901 : // too many attempts, just quit
1902 : //
1903 0 : disconnect();
1904 : }
1905 : }
1906 0 : }
1907 :
1908 :
1909 :
1910 : /** \brief Process a "child" message.
1911 : *
1912 : * Whenever we have a child running, we may send and receive messages
1913 : * between the parent and child. Because the parent backend and child
1914 : * processes both use the same g_child_connection object, both end up
1915 : * calling this function to handle their messages.
1916 : *
1917 : * We distinguish the parent and child by their PID.
1918 : *
1919 : * At this time, the parent accepts no messages.
1920 : *
1921 : * The child accepts the STOP, HELP, and UNKNOWN messages. The parent
1922 : * will send a STOP to the child whenever it iself receives a STOP.
1923 : * (i.e. it propagates the STOP message.)
1924 : *
1925 : * \note
1926 : * The f_parent_pid is setup in the parent snap_backend whenever the
1927 : * object is created. It will remain the same once in the child
1928 : * process.
1929 : *
1930 : * \param[in] message The message received by the parent or the child.
1931 : */
1932 0 : void snap_backend::process_child_message(snap::snap_communicator_message const & message)
1933 : {
1934 0 : SNAP_LOG_TRACE("received child message [")(message.to_message())("] for ")(f_action);
1935 :
1936 0 : if(getpid() == f_parent_pid)
1937 : {
1938 : // parent is receiving a message
1939 : //
1940 : // ?
1941 : }
1942 : else
1943 : {
1944 : // child is receiving a message
1945 : //
1946 0 : QString const command(message.get_command());
1947 :
1948 0 : if(command == "STOP")
1949 : {
1950 0 : f_stop_received = true;
1951 0 : return;
1952 : }
1953 :
1954 0 : if(command == "HELP")
1955 : {
1956 : // return COMMANDS
1957 : //
1958 0 : snap::snap_communicator_message reply;
1959 0 : reply.set_command("COMMANDS");
1960 0 : reply.add_parameter("list", "HELP,STOP,UNKNOWN");
1961 : // we are in the child so g_child_connection exists
1962 0 : g_child_connection->send_message(reply);
1963 0 : return;
1964 : }
1965 :
1966 0 : if(command == "UNKNOWN")
1967 : {
1968 : // when we send an unknown command we get pinged back with
1969 : // the UNKNOWN message
1970 : //
1971 0 : SNAP_LOG_ERROR("we sent an unknown command \"")(message.get_parameter("command"))("\" and probably did not get the expected result.");
1972 0 : return;
1973 : }
1974 :
1975 : {
1976 : // return UNKNOWN
1977 : //
1978 0 : snap::snap_communicator_message reply;
1979 0 : reply.set_command("UNKNOWN");
1980 0 : reply.add_parameter("command", command);
1981 : // we are the in the child so g_child_connection exists
1982 0 : g_child_connection->send_message(reply);
1983 0 : return;
1984 : }
1985 : }
1986 : }
1987 :
1988 :
1989 : /** \brief This function captures the child process death signal.
1990 : *
1991 : * Whenever the child process dies, we receive this signal immediately.
1992 : * The function processes the exit status with a waitpid() call, removes
1993 : * the child connection from the communicator, and depending on whether
1994 : * it is a backend action, we proceed as follow:
1995 : *
1996 : * \li backend action -- go to sleep for 5 minutes and start the child
1997 : * process again then
1998 : * \li normal action -- disconnect from the snap communicator and
1999 : * child process and return
2000 : *
2001 : * \param[in] pid The PID of the child process that just died.
2002 : */
2003 0 : void snap_backend::capture_zombies(pid_t pid)
2004 : {
2005 : // first capture the current zombie and save its status upon death
2006 : //
2007 0 : int status(0);
2008 0 : pid_t const p(waitpid(pid, &status, 0));
2009 0 : if(p == -1)
2010 : {
2011 0 : int const e(errno);
2012 0 : SNAP_LOG_ERROR("waitpid() returned with an error (errno: ")(e)(" -- ")(strerror(e))(").");
2013 : }
2014 : else
2015 : {
2016 : // save PID and verify it here?
2017 : //if(p != f_child_pid)
2018 : //{
2019 : //}
2020 :
2021 : //SNAP_LOG_WARNING("child process (pid: ")(pid)(") for backend \"")(f_action)("\" returned.");
2022 0 : if(WIFEXITED(status))
2023 : {
2024 0 : int const exit_code(WEXITSTATUS(status));
2025 0 : if(exit_code != 0)
2026 : {
2027 0 : SNAP_LOG_ERROR("child process (pid: ")(pid)(") for backend \"")(f_action)("\" returned with an error: ")(exit_code)(".");
2028 : }
2029 : }
2030 0 : else if(WIFSIGNALED(status))
2031 : {
2032 0 : int const sig(WTERMSIG(status));
2033 0 : SNAP_LOG_ERROR("child process for backend ")(f_action)(" returned abnormally because of signal \"")(strsignal(sig))("\" (")(sig)(").");
2034 : }
2035 : else
2036 : {
2037 0 : SNAP_LOG_ERROR("child process for backend ")(f_action)(" returned abnormally.");
2038 : }
2039 : }
2040 :
2041 : // now we can forget about the child connection
2042 : //
2043 : // TBD: it looks like we could reuse that connection so we
2044 : // may want to avoid destroying and recreating the
2045 : // child connections each time, although then we need
2046 : // a separate flag to know whether a child is currently
2047 : // running or not (maybe keep its PID?) and it does not
2048 : // look like the creation is slow at all...
2049 : //
2050 : // WARNING: the g_communicator makes a copy of the connections when
2051 : // it is processing a set of events; it will be removed on
2052 : // the next loop, but here we are NOT getting a delete of
2053 : // the connection so anything we want to do here to make
2054 : // sure it is gone, we have to call a function for the
2055 : // purpose! (i.e. we want the UNLOCK to be sent now)
2056 : //
2057 0 : if(g_child_connection != nullptr)
2058 : {
2059 0 : g_child_connection->unlock();
2060 : }
2061 0 : g_communicator->remove_connection(g_child_connection);
2062 0 : g_child_connection.reset();
2063 :
2064 : // if we already received a STOP or QUITTING message, then we also
2065 : // want to get rid of the timers and child death signals
2066 : //
2067 0 : if(f_stop_received)
2068 : {
2069 0 : g_communicator->remove_connection(g_cassandra_timer);
2070 0 : g_communicator->remove_connection(g_reconnect_timer);
2071 0 : g_communicator->remove_connection(g_tick_timer);
2072 0 : g_communicator->remove_connection(g_wakeup_timer);
2073 0 : g_communicator->remove_connection(g_signal_child_death);
2074 :
2075 : // this was the last straw, now we are quitting...
2076 : //
2077 0 : return;
2078 : }
2079 :
2080 : // we may have another website to work on right now
2081 : //
2082 0 : if(f_website.isEmpty() || f_pinged)
2083 : {
2084 0 : f_pinged = false;
2085 0 : if(process_timeout())
2086 : {
2087 0 : return;
2088 : }
2089 : }
2090 :
2091 0 : if(!f_cron_action || f_action == "list")
2092 : {
2093 : // this was a "run once and quit", so we want to remove all
2094 : // the connections from the communicator and quit ourselves
2095 : //
2096 0 : disconnect();
2097 : }
2098 : }
2099 :
2100 :
2101 : /** \brief Check whether the database is ready.
2102 : *
2103 : * This function verifies that the "sites" table exists, if not, then
2104 : * the backends cannot be started safely.
2105 : *
2106 : * Further, if a URI is specified, it verifies that the specified website
2107 : * has a "core::last_updated" field defined.
2108 : *
2109 : * \todo
2110 : * We want to support a mostly automated upgrade process. See SNAP-188.
2111 : *
2112 : * \param[in] uri The domain name of a website or an empty string.
2113 : *
2114 : * \return true if the website is considered to be ready.
2115 : */
2116 0 : bool snap_backend::is_ready(QString const & uri)
2117 : {
2118 : try
2119 : {
2120 0 : if(!f_cassandra)
2121 : {
2122 0 : if(f_auto_retry_cassandra
2123 0 : && getpid() == f_parent_pid)
2124 : {
2125 : // we received the CASSANDRAREADY message, but did not
2126 : // get a valid connection yet, try again (only in the
2127 : // parent though as the child has one chance and if it
2128 : // fails it just exists)
2129 : //
2130 0 : if(!connect_cassandra(false))
2131 : {
2132 0 : SNAP_LOG_WARNING("snapwebsites failed to connect to snapdbproxy (is_ready())");
2133 :
2134 : // note that the connect_cassandra() function should already
2135 : // do a proper cleanup, but just in case...
2136 : //
2137 0 : disconnect_cassandra();
2138 0 : return false;
2139 : }
2140 : }
2141 : else
2142 : {
2143 : // we are in the NOCASSANDRA to CASSANDRAREADY window
2144 0 : return false;
2145 : }
2146 : }
2147 :
2148 0 : if(!f_snaplock)
2149 : {
2150 : // we are waiting on the "snaplock" daemon to be registered as
2151 : // a service to the "snapcommunicator"
2152 : //
2153 0 : return false;
2154 : }
2155 :
2156 0 : if(f_sites_table == nullptr)
2157 : {
2158 0 : f_context->clearCache();
2159 :
2160 : // get the "sites" table
2161 : //
2162 : // we do the findTable() here otherwise we would have to try/catch
2163 : // which is slow, not really clean or useful here...
2164 : //
2165 0 : f_sites_table = f_context->findTable(get_name(name_t::SNAP_NAME_SITES));
2166 0 : if(f_sites_table == nullptr)
2167 : {
2168 : // sites table does not even exist...
2169 : //
2170 : // we have to reset the connection otherwise we would not get the
2171 : // new context
2172 : //
2173 0 : request_cassandra_status();
2174 0 : return false;
2175 : }
2176 :
2177 : // get the "backend" table
2178 : //
2179 : // we do the findTable() here otherwise we would have to try/catch
2180 : // which is slow, not really clean or useful here...
2181 : //
2182 0 : f_backend_table = f_context->findTable(get_name(name_t::SNAP_NAME_BACKEND));
2183 0 : if(f_backend_table == nullptr)
2184 : {
2185 : // backend table does not exist...
2186 : //
2187 : // we have to reset the connection otherwise we would not get the
2188 : // new context
2189 : //
2190 0 : request_cassandra_status();
2191 0 : return false;
2192 : }
2193 : }
2194 :
2195 0 : if(uri.isEmpty())
2196 : {
2197 : // the mere existance of the sites_table is enough here
2198 : //
2199 0 : return true;
2200 : }
2201 :
2202 : // so that specific website must be considered valid
2203 : // which at this time just means having the "core::last_updated"
2204 : // field in the "sites" table
2205 : //
2206 0 : if(f_sites_table->exists(uri))
2207 : {
2208 : // TODO: to fix SNAP-125 we also want a form of lock, i.e. a parameter
2209 : // (or just a lock? but our locks are exclusive... see SNAP-470)
2210 : // that tells us that the website is being updated now...
2211 : //
2212 : // and conversely we need to know that a backend is running
2213 : // against a given website so we do not start an update while
2214 : // that is going on!
2215 : //
2216 : // with SNAP-470 we can create support for a read-only or
2217 : // read/write type of semaphore which will resolve that
2218 : // problem once and for all
2219 : //
2220 0 : return f_sites_table->getRow(uri)->exists(get_name(name_t::SNAP_NAME_CORE_LAST_UPDATED))
2221 0 : && f_sites_table->getRow(uri)->exists(get_name(name_t::SNAP_NAME_CORE_PLUGIN_THRESHOLD));
2222 : }
2223 :
2224 0 : if(!f_cron_action)
2225 : {
2226 : // the regular CRON action did not make it, just quit
2227 : //
2228 0 : SNAP_LOG_ERROR("website URI \"")(uri)("\" does not reference an existing website.");
2229 0 : disconnect();
2230 : }
2231 : }
2232 0 : catch(std::exception const & e)
2233 : {
2234 : // a problem occurred while dealing with the Cassandra cluster
2235 : // through our snapdbproxy daemon
2236 : //
2237 0 : SNAP_LOG_ERROR("is_ready() received an exception: ")(e.what());
2238 :
2239 : // pause for 30 seconds, then we will try again
2240 : //
2241 0 : request_cassandra_status();
2242 : }
2243 :
2244 0 : return false;
2245 : }
2246 :
2247 :
2248 0 : void snap_backend::disconnect()
2249 : {
2250 : // remove the connections so we end up quitting
2251 : //
2252 : // TODO: disconnecting these early generates errors we should try to fix:
2253 : // (see also SNAP-305)
2254 : //
2255 : // 2016-01-20 10:14:03 [15201]:snap_communicator.cpp:2999:halk: error: an error occurred while writing to socket of "snap_tcp_client_permanent_message_connection_impl messenger" (errno: 9 -- Bad file descriptor).
2256 : // 2016-01-20 10:14:03 [15201]:snap_communicator.cpp:1126:halk: error: socket 11 of connection "snap_tcp_client_permanent_message_connection_impl messenger" was marked as erroneous by the kernel.
2257 : //
2258 :
2259 : // this is an equivalent to a STOP message
2260 : //
2261 0 : f_stop_received = true;
2262 :
2263 0 : if(!f_cron_action && g_messenger != nullptr && g_messenger->is_connected() && f_action != "list")
2264 : {
2265 0 : g_messenger->mark_done();
2266 :
2267 : // this was the CRON action
2268 : //
2269 0 : QString action(f_action);
2270 0 : int const pos(action.indexOf(':'));
2271 0 : if(pos >= 0)
2272 : {
2273 0 : action = action.mid(pos + 2);
2274 : }
2275 :
2276 0 : snap::snap_communicator_message cmd;
2277 0 : cmd.set_command("UNREGISTER");
2278 0 : cmd.add_parameter("service", action);
2279 0 : g_messenger->send_message(cmd);
2280 :
2281 : // g_messenger will very quickly receive a HUP now
2282 : }
2283 : else
2284 : {
2285 0 : g_communicator->remove_connection(g_messenger);
2286 : }
2287 :
2288 : // now disconnect so we can quit
2289 : //
2290 0 : g_communicator->remove_connection(g_interrupt);
2291 0 : g_communicator->remove_connection(g_cassandra_timer);
2292 0 : g_communicator->remove_connection(g_reconnect_timer);
2293 0 : g_communicator->remove_connection(g_tick_timer);
2294 0 : g_communicator->remove_connection(g_wakeup_timer);
2295 0 : g_communicator->remove_connection(g_signal_child_death);
2296 0 : }
2297 :
2298 :
2299 :
2300 : /** \brief Process a backend request on the specified URI.
2301 : *
2302 : * This function is called with each URI that needs to be processed by
2303 : * the backend processes. It creates a child process that will allow
2304 : * the Cassandra data to not be shared between all instances. Instead
2305 : * each instance reads data and then drops it as the process ends.
2306 : * Since the parent blocks until the child is done, the Cassandra library
2307 : * is still only used by a single process at a time thus we avoid
2308 : * potential conflicts reading/writing on the same network connection
2309 : * (since the child inherits the parents Cassandra connection.)
2310 : *
2311 : * \note
2312 : * Note that the child is created from Cassandra, the plugins, the
2313 : * f_uri and all the resulting keys... so we gain an environment
2314 : * very similar to what we get in the server with Apache.
2315 : *
2316 : * \note
2317 : * If that site has an internal redirect then no processing is
2318 : * performed because otherwise the destination would be processed
2319 : * twice in the end.
2320 : *
2321 : * \todo
2322 : * Add necessary code to break the child if (1) the child is very long
2323 : * and (2) never contact us (i.e. watchdog signals.)
2324 : *
2325 : * \param[in] uri The URI of the site to be checked.
2326 : */
2327 0 : bool snap_backend::process_backend_uri(QString const & uri)
2328 : {
2329 : // first we verify that this very website is indeed ready to accept
2330 : // backend processes, if not return immediately
2331 : //
2332 0 : if(g_child_connection
2333 0 : || !is_ready(uri))
2334 : {
2335 0 : return false;
2336 : }
2337 :
2338 : // create a child connection so our child and us can communicate
2339 : // (especially, we can send the child a STOP if we ourselves receive
2340 : // a STOP.)
2341 : //
2342 0 : g_child_connection.reset(new child_connection(this));
2343 :
2344 : // We also lock that website while this backend process is running.
2345 : // The lock depends on the URI and the action taken.
2346 : //
2347 0 : QString lock_uri;
2348 0 : if(f_global_lock)
2349 : {
2350 : // this action can only run one instance of itself on any
2351 : // one computer in your cluster so the lock does not depend
2352 : // on the URI
2353 : //
2354 0 : lock_uri = QString("global-backend-lock#%1").arg(f_action);
2355 : }
2356 : else
2357 : {
2358 : // this action can run against multiple websites simultaneous
2359 : //
2360 0 : lock_uri = QString("%1#%2").arg(uri).arg(f_action);
2361 : }
2362 0 : if(!g_child_connection->lock(lock_uri))
2363 : {
2364 0 : g_child_connection.reset();
2365 :
2366 : // the lock failed, we cannot run against this website at this time
2367 : // (this allows us to have multiple version of the same backend
2368 : // running on various backend computers.)
2369 : //
2370 0 : SNAP_LOG_INFO("Lock in order to process website \"")(uri)("\" with action \"")(f_action)("\" failed.");
2371 :
2372 0 : return false;
2373 : }
2374 0 : g_communicator->add_connection(g_child_connection);
2375 :
2376 : // each time we restart a child we obtain a new start date
2377 : // (which is important for CRON actions)
2378 : //
2379 0 : init_start_date();
2380 :
2381 : // create a child process so the data between sites does not get
2382 : // shared (also the Cassandra data would remain in memory increasing
2383 : // the foot print each time we run a new website,) but the worst
2384 : // are the plugins; we can request a plugin to be unloaded but
2385 : // frankly the system is not very well written to handle that case.
2386 0 : pid_t const p(fork_child());
2387 0 : if(p != 0)
2388 : {
2389 : // parent process
2390 0 : if(p == -1)
2391 : {
2392 0 : int const e(errno);
2393 :
2394 : // fork() failed
2395 : //
2396 0 : g_communicator->remove_connection(g_child_connection);
2397 0 : g_child_connection.reset();
2398 :
2399 : // TODO: now that we have a snap communicator with a timer
2400 : // we could try to sleep for a while if the error
2401 : // is ENOMEM or EAGAIN
2402 : //
2403 0 : SNAP_LOG_FATAL("snap_backend::process_backend_uri() could not create a child process (errno: ")(e)(" -- ")(strerror(e))(").");
2404 :
2405 : // we do not try again, we just abandon the whole process
2406 : // (i.e. we are in the parent so the backend is quitting 100%)
2407 : //
2408 0 : exit(1);
2409 0 : NOTREACHED();
2410 : }
2411 0 : return true;
2412 : }
2413 :
2414 : // make it safe in the child process
2415 : //
2416 : try
2417 : {
2418 0 : SNAP_LOG_INFO("==================================== backend process website \"")
2419 0 : (uri)
2420 0 : ("\" with ")
2421 0 : (f_cron_action ? "cron " : "")
2422 0 : ("action \"")
2423 0 : (f_action)
2424 0 : ("\" started. (ppid: ")
2425 0 : (getppid())
2426 0 : (")");
2427 :
2428 : // make sure that Snap! Communicator is clean in the child,
2429 : // it really cannot be listening on any of these because
2430 : // that would break the parent's streams
2431 : //
2432 0 : g_communicator->remove_connection(g_messenger);
2433 0 : g_messenger.reset();
2434 0 : g_communicator->remove_connection(g_interrupt);
2435 0 : g_interrupt.reset();
2436 0 : g_communicator->remove_connection(g_cassandra_timer);
2437 0 : g_cassandra_timer.reset();
2438 0 : g_communicator->remove_connection(g_reconnect_timer);
2439 0 : g_reconnect_timer.reset();
2440 0 : g_communicator->remove_connection(g_tick_timer);
2441 0 : g_tick_timer.reset();
2442 0 : g_communicator->remove_connection(g_wakeup_timer);
2443 0 : g_wakeup_timer.reset();
2444 0 : g_communicator->remove_connection(g_signal_child_death);
2445 0 : g_signal_child_death.reset();
2446 :
2447 0 : auto p_server( f_server.lock() );
2448 0 : if(!p_server)
2449 : {
2450 0 : throw snap_logic_exception("snap_backend::process_backend_uri(): server pointer is NULL");
2451 : }
2452 :
2453 : // set the URI; if user supplied it, then it can fail!
2454 : //
2455 0 : if(!f_uri.set_uri(uri))
2456 : {
2457 0 : SNAP_LOG_FATAL("snap_backend::process_backend_uri() called with invalid URI: \"")(uri)("\", URI ignored.");
2458 0 : exit(1);
2459 0 : NOTREACHED();
2460 : }
2461 :
2462 : // cassandra re-initialization
2463 : //
2464 : // this is already done in process_action() so we have to reset the
2465 : // pointer before we can call this function again otherwise it throws
2466 : //
2467 0 : snap_expr::expr::set_cassandra_context(nullptr);
2468 0 : f_sites_table.reset();
2469 0 : f_backend_table.reset();
2470 0 : f_cassandra.reset(); // here all the remaining libdbproxy objects should all get deleted
2471 0 : NOTUSED(connect_cassandra(true)); // since we pass 'true', the function either dies or returns true
2472 :
2473 0 : if(!is_ready(uri))
2474 : {
2475 0 : SNAP_LOG_FATAL("snap_backend::process_backend_uri(): once prepared in the child, URI \"")(uri)("\" is not ready anymore.");
2476 0 : exit(1);
2477 0 : NOTREACHED();
2478 : }
2479 :
2480 : // process the f_uri parameter
2481 : //
2482 0 : canonicalize_domain();
2483 0 : canonicalize_website();
2484 0 : site_redirect();
2485 0 : if(f_site_key != f_original_site_key)
2486 : {
2487 0 : SNAP_LOG_FATAL("snap_backend::process_backend_uri() called with incorrect URI: \"")(f_site_key)("\", expected \"")(f_original_site_key)("\".");
2488 0 : exit(1);
2489 0 : NOTREACHED();
2490 : }
2491 :
2492 0 : init_plugins(true);
2493 :
2494 0 : canonicalize_options();
2495 :
2496 0 : f_ready = true;
2497 :
2498 0 : server::backend_action_set actions;
2499 0 : if(f_cron_action)
2500 : {
2501 0 : p_server->register_backend_cron(actions);
2502 : #ifdef DEBUG
2503 : // since we are in control of the content plugin, this should
2504 : // never happen...
2505 : //
2506 0 : if( actions.has_action("content::snapbackend") )
2507 : {
2508 : // the plugin HAS to be content
2509 0 : throw snap_logic_exception(QString("snap_backend::process_backend_uri(): plugin \"%1\" makes use of a CRON action named \"content::snapbackend\" which is reserved as a special action by the system.")
2510 0 : .arg(actions.get_plugin_name("content::snapbackend")));
2511 : }
2512 : // XXX: we may want to test that none of the CRON actions are
2513 : // defined as regular actions
2514 : #endif
2515 : }
2516 : else
2517 : {
2518 0 : p_server->register_backend_action(actions);
2519 : }
2520 :
2521 0 : if( actions.has_action(f_action) )
2522 : {
2523 : // this is a valid action, execute the corresponding function!
2524 : //
2525 0 : actions.execute_action(f_action);
2526 : }
2527 0 : else if( f_action == "list" )
2528 : {
2529 0 : std::cout << (f_cron_action ? "CRON " : "") << "Actions available for " << uri << std::endl;
2530 0 : actions.display();
2531 0 : std::cout << std::endl;
2532 : }
2533 : else
2534 : {
2535 0 : if(f_cron_action)
2536 : {
2537 0 : int const pos(f_action.indexOf(':'));
2538 0 : QString const namespace_name(f_action.mid(0, pos));
2539 0 : if(plugins::exists(namespace_name))
2540 : {
2541 0 : SNAP_LOG_ERROR("snap_backend::process_backend_uri(): unknown CRON action \"")
2542 0 : (f_action)
2543 0 : ("\" even with plugin \"")
2544 0 : (namespace_name)
2545 0 : ("\" installed.");
2546 0 : exit(1);
2547 0 : NOTREACHED();
2548 : }
2549 : else
2550 : {
2551 : // we do not generate an error in case a plugin is not
2552 : // installed because with many websites installed on
2553 : // the same system, all may not have all the plugins
2554 : // installed... so this is just a debug message
2555 : //
2556 0 : SNAP_LOG_DEBUG("snap_backend::process_backend_uri(): unknown CRON action \"")
2557 0 : (f_action)
2558 0 : ("\" for \"")
2559 0 : (uri)
2560 0 : ("\" (although it could be that you need to install plugin \"")
2561 0 : (namespace_name)
2562 0 : ("\" if you wanted to run that backend against this website?)");
2563 : }
2564 : }
2565 : else
2566 : {
2567 0 : SNAP_LOG_ERROR("snap_backend::process_backend_uri(): unknown action \"")
2568 0 : (f_action)
2569 0 : ("\".");
2570 0 : exit(1);
2571 0 : NOTREACHED();
2572 : }
2573 : }
2574 :
2575 : // the child process is done successfully
2576 0 : exit(0);
2577 0 : NOTREACHED();
2578 : }
2579 0 : catch( snap_exception const & except )
2580 : {
2581 0 : SNAP_LOG_FATAL("snap_backend::process_backend_uri(): snap_exception caught: ")(except.what());
2582 : }
2583 0 : catch( std::exception const & std_except )
2584 : {
2585 : // the snap_logic_exception is not a snap_exception
2586 : // and other libraries may generate other exceptions
2587 : // (i.e. libtld, C++ cassandra driver...)
2588 : //
2589 0 : SNAP_LOG_FATAL("snap_backend::process_backend_uri(): std::exception caught: ")(std_except.what());
2590 : }
2591 0 : catch( ... )
2592 : {
2593 0 : SNAP_LOG_FATAL("snap_backend::process_backend_uri(): unknown exception caught!");
2594 : }
2595 :
2596 : // the child process is done
2597 : //
2598 0 : exit(1);
2599 0 : NOTREACHED();
2600 :
2601 : // compiler expects a return
2602 : //
2603 : return false;
2604 : }
2605 :
2606 :
2607 6 : } // namespace snap
2608 : // vim: ts=4 sw=4 et
|