root/daemons/controld/controld_fencing.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. controld_configure_fencing
  4. too_many_st_failures
  5. st_fail_count_reset
  6. st_fail_count_increment
  7. cib_fencing_updated
  8. send_stonith_update
  9. abort_for_stonith_failure
  10. add_stonith_cleanup
  11. remove_stonith_cleanup
  12. purge_stonith_cleanup
  13. execute_stonith_cleanup
  14. fail_incompletable_stonith
  15. tengine_stonith_connection_destroy
  16. handle_fence_notification
  17. controld_timer_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. controld_execute_fence_action
  23. controld_verify_stonith_watchdog_timeout
  24. te_cleanup_stonith_history_sync
  25. tengine_stonith_history_synced
  26. stonith_history_sync_set_trigger
  27. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/msg_xml.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/stonith-ng.h>
  15 #include <crm/fencing/internal.h>
  16 
  17 #include <pacemaker-controld.h>
  18 
   19 static void
   20 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
      // (Forward declaration: registered as a fencer notification callback in
      // controld_timer_fencer_connect())
   21 
   22 /*
   23  * stonith failure counting
   24  *
   25  * We don't want to get stuck in a permanent fencing loop. Keep track of the
   26  * number of fencing failures for each target node, and the most we'll restart a
   27  * transition for.
   28  */
   29 
      // Per-target failure record; stored as the value in stonith_failures
   30 struct st_fail_rec {
   31     int count;  // Fencing failures counted for the target since the last reset
   32 };
   33 
      // If true, pcmk__panic() instead of crm_exit() when notified of our own fencing
   34 static bool fence_reaction_panic = false;
      // A target whose failure count reaches this is treated as having too many failures
   35 static unsigned long int stonith_max_attempts = 10;
      // Target node name -> struct st_fail_rec; created by st_fail_count_increment()
   36 static GHashTable *stonith_failures = NULL;
  37 
  38 /*!
  39  * \internal
  40  * \brief Update max fencing attempts before giving up
  41  *
  42  * \param[in] value  New max fencing attempts
  43  */
  44 static void
  45 update_stonith_max_attempts(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
  46 {
  47     stonith_max_attempts = char2score(value);
  48     if (stonith_max_attempts < 1UL) {
  49         stonith_max_attempts = 10UL;
  50     }
  51 }
  52 
  53 /*!
  54  * \internal
  55  * \brief Configure reaction to notification of local node being fenced
  56  *
  57  * \param[in] reaction_s  Reaction type
  58  */
  59 static void
  60 set_fence_reaction(const char *reaction_s)
     /* [previous][next][first][last][top][bottom][index][help] */
  61 {
  62     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  63         fence_reaction_panic = true;
  64 
  65     } else {
  66         if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
  67             crm_warn("Invalid value '%s' for %s, using 'stop'",
  68                      reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
  69         }
  70         fence_reaction_panic = false;
  71     }
  72 }
  73 
  74 /*!
  75  * \internal
  76  * \brief Configure fencing options based on the CIB
  77  *
  78  * \param[in,out] options  Name/value pairs for configured options
  79  */
  80 void
  81 controld_configure_fencing(GHashTable *options)
     /* [previous][next][first][last][top][bottom][index][help] */
  82 {
  83     const char *value = NULL;
  84 
  85     value = g_hash_table_lookup(options, XML_CONFIG_ATTR_FENCE_REACTION);
  86     set_fence_reaction(value);
  87 
  88     value = g_hash_table_lookup(options, "stonith-max-attempts");
  89     update_stonith_max_attempts(value);
  90 }
  91 
  92 static gboolean
  93 too_many_st_failures(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
  94 {
  95     GHashTableIter iter;
  96     const char *key = NULL;
  97     struct st_fail_rec *value = NULL;
  98 
  99     if (stonith_failures == NULL) {
 100         return FALSE;
 101     }
 102 
 103     if (target == NULL) {
 104         g_hash_table_iter_init(&iter, stonith_failures);
 105         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 106                (gpointer *) &value)) {
 107 
 108             if (value->count >= stonith_max_attempts) {
 109                 target = (const char*)key;
 110                 goto too_many;
 111             }
 112         }
 113     } else {
 114         value = g_hash_table_lookup(stonith_failures, target);
 115         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
 116             goto too_many;
 117         }
 118     }
 119     return FALSE;
 120 
 121 too_many:
 122     crm_warn("Too many failures (%d) to fence %s, giving up",
 123              value->count, target);
 124     return TRUE;
 125 }
 126 
 127 /*!
 128  * \internal
 129  * \brief Reset a stonith fail count
 130  *
 131  * \param[in] target  Name of node to reset, or NULL for all
 132  */
 133 void
 134 st_fail_count_reset(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 135 {
 136     if (stonith_failures == NULL) {
 137         return;
 138     }
 139 
 140     if (target) {
 141         struct st_fail_rec *rec = NULL;
 142 
 143         rec = g_hash_table_lookup(stonith_failures, target);
 144         if (rec) {
 145             rec->count = 0;
 146         }
 147     } else {
 148         GHashTableIter iter;
 149         const char *key = NULL;
 150         struct st_fail_rec *rec = NULL;
 151 
 152         g_hash_table_iter_init(&iter, stonith_failures);
 153         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 154                                       (gpointer *) &rec)) {
 155             rec->count = 0;
 156         }
 157     }
 158 }
 159 
 160 static void
 161 st_fail_count_increment(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 162 {
 163     struct st_fail_rec *rec = NULL;
 164 
 165     if (stonith_failures == NULL) {
 166         stonith_failures = pcmk__strkey_table(free, free);
 167     }
 168 
 169     rec = g_hash_table_lookup(stonith_failures, target);
 170     if (rec) {
 171         rec->count++;
 172     } else {
 173         rec = malloc(sizeof(struct st_fail_rec));
 174         if(rec == NULL) {
 175             return;
 176         }
 177 
 178         rec->count = 1;
 179         g_hash_table_insert(stonith_failures, strdup(target), rec);
 180     }
 181 }
 182 
 183 /* end stonith fail count functions */
 184 
 185 
 186 static void
 187 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
     /* [previous][next][first][last][top][bottom][index][help] */
 188                     void *user_data)
 189 {
 190     if (rc < pcmk_ok) {
 191         crm_err("Fencing update %d for %s: failed - %s (%d)",
 192                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 193         crm_log_xml_warn(msg, "Failed update");
 194         abort_transition(INFINITY, pcmk__graph_shutdown, "CIB update failed",
 195                          NULL);
 196 
 197     } else {
 198         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 199     }
 200 }
 201 
 202 static void
 203 send_stonith_update(pcmk__graph_action_t *action, const char *target,
     /* [previous][next][first][last][top][bottom][index][help] */
 204                     const char *uuid)
 205 {
 206     int rc = pcmk_ok;
 207     crm_node_t *peer = NULL;
 208 
 209     /* We (usually) rely on the membership layer to do node_update_cluster,
 210      * and the peer status callback to do node_update_peer, because the node
 211      * might have already rejoined before we get the stonith result here.
 212      */
 213     int flags = node_update_join | node_update_expected;
 214 
 215     /* zero out the node-status & remove all LRM status info */
 216     xmlNode *node_state = NULL;
 217 
 218     CRM_CHECK(target != NULL, return);
 219     CRM_CHECK(uuid != NULL, return);
 220 
 221     /* Make sure the membership and join caches are accurate.
 222      * Try getting any existing node cache entry also by node uuid in case it
 223      * doesn't have an uname yet.
 224      */
 225     peer = pcmk__get_peer_full(0, target, uuid, CRM_GET_PEER_ANY);
 226 
 227     CRM_CHECK(peer != NULL, return);
 228 
 229     if (peer->state == NULL) {
 230         /* Usually, we rely on the membership layer to update the cluster state
 231          * in the CIB. However, if the node has never been seen, do it here, so
 232          * the node is not considered unclean.
 233          */
 234         flags |= node_update_cluster;
 235     }
 236 
 237     if (peer->uuid == NULL) {
 238         crm_info("Recording uuid '%s' for node '%s'", uuid, target);
 239         peer->uuid = strdup(uuid);
 240     }
 241 
 242     crmd_peer_down(peer, TRUE);
 243 
 244     /* Generate a node state update for the CIB */
 245     node_state = create_node_state_update(peer, flags, NULL, __func__);
 246 
 247     /* we have to mark whether or not remote nodes have already been fenced */
 248     if (peer->flags & crm_remote_node) {
 249         char *now_s = pcmk__ttoa(time(NULL));
 250 
 251         crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
 252         free(now_s);
 253     }
 254 
 255     /* Force our known ID */
 256     crm_xml_add(node_state, XML_ATTR_ID, uuid);
 257 
 258     rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn,
 259                                                  XML_CIB_TAG_STATUS, node_state,
 260                                                  cib_scope_local
 261                                                  |cib_can_create);
 262 
 263     /* Delay processing the trigger until the update completes */
 264     crm_debug("Sending fencing update %d for %s", rc, target);
 265     fsa_register_cib_callback(rc, strdup(target), cib_fencing_updated);
 266 
 267     // Make sure it sticks
 268     /* controld_globals.cib_conn->cmds->bump_epoch(controld_globals.cib_conn,
 269      *                                             cib_scope_local);
 270      */
 271 
 272     controld_delete_node_state(peer->uname, controld_section_all,
 273                                cib_scope_local);
 274     free_xml(node_state);
 275     return;
 276 }
 277 
 278 /*!
 279  * \internal
 280  * \brief Abort transition due to stonith failure
 281  *
 282  * \param[in] abort_action  Whether to restart or stop transition
 283  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 284  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 285  */
 286 static void
 287 abort_for_stonith_failure(enum pcmk__graph_next abort_action,
     /* [previous][next][first][last][top][bottom][index][help] */
 288                           const char *target, const xmlNode *reason)
 289 {
 290     /* If stonith repeatedly fails, we eventually give up on starting a new
 291      * transition for that reason.
 292      */
 293     if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) {
 294         abort_action = pcmk__graph_wait;
 295     }
 296     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
 297 }
 298 
 299 
  300 /*
  301  * stonith cleanup list
  302  *
  303  * If the DC is shot, proper notifications might not go out.
  304  * The stonith cleanup list allows the cluster to (re-)send
  305  * notifications once a new DC is elected.
  306  */
  307 
      // Each element's data is a strdup()'d node name that the list functions free()
  308 static GList *stonith_cleanup_list = NULL;
 309 
 310 /*!
 311  * \internal
 312  * \brief Add a node to the stonith cleanup list
 313  *
 314  * \param[in] target  Name of node to add
 315  */
 316 void
 317 add_stonith_cleanup(const char *target) {
     /* [previous][next][first][last][top][bottom][index][help] */
 318     stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
 319 }
 320 
 321 /*!
 322  * \internal
 323  * \brief Remove a node from the stonith cleanup list
 324  *
 325  * \param[in] Name of node to remove
 326  */
 327 void
 328 remove_stonith_cleanup(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 329 {
 330     GList *iter = stonith_cleanup_list;
 331 
 332     while (iter != NULL) {
 333         GList *tmp = iter;
 334         char *iter_name = tmp->data;
 335 
 336         iter = iter->next;
 337         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 338             crm_trace("Removing %s from the cleanup list", iter_name);
 339             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 340             free(iter_name);
 341         }
 342     }
 343 }
 344 
 345 /*!
 346  * \internal
 347  * \brief Purge all entries from the stonith cleanup list
 348  */
 349 void
 350 purge_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 351 {
 352     if (stonith_cleanup_list) {
 353         GList *iter = NULL;
 354 
 355         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 356             char *target = iter->data;
 357 
 358             crm_info("Purging %s from stonith cleanup list", target);
 359             free(target);
 360         }
 361         g_list_free(stonith_cleanup_list);
 362         stonith_cleanup_list = NULL;
 363     }
 364 }
 365 
 366 /*!
 367  * \internal
 368  * \brief Send stonith updates for all entries in cleanup list, then purge it
 369  */
 370 void
 371 execute_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 372 {
 373     GList *iter;
 374 
 375     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 376         char *target = iter->data;
 377         crm_node_t *target_node = crm_get_peer(0, target);
 378         const char *uuid = crm_peer_uuid(target_node);
 379 
 380         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 381         send_stonith_update(NULL, target, uuid);
 382         free(target);
 383     }
 384     g_list_free(stonith_cleanup_list);
 385     stonith_cleanup_list = NULL;
 386 }
 387 
 388 /* end stonith cleanup list functions */
 389 
 390 
  391 /* stonith API client
  392  *
  393  * Functions that need to interact directly with the fencer via its API
  394  */
  395 
      // Fencer API object; allocated in controld_timer_fencer_connect()
  396 static stonith_t *stonith_api = NULL;
      // Mainloop timer that re-runs controld_timer_fencer_connect() to retry connecting
  397 static mainloop_timer_t *controld_fencer_connect_timer = NULL;
      // "<crm_system_name>.<pid>" string, compared with a fence event's client origin
  398 static char *te_client_id = NULL;
 399 
 400 static gboolean
 401 fail_incompletable_stonith(pcmk__graph_t *graph)
     /* [previous][next][first][last][top][bottom][index][help] */
 402 {
 403     GList *lpc = NULL;
 404     const char *task = NULL;
 405     xmlNode *last_action = NULL;
 406 
 407     if (graph == NULL) {
 408         return FALSE;
 409     }
 410 
 411     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 412         GList *lpc2 = NULL;
 413         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data;
 414 
 415         if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
 416             continue;
 417         }
 418 
 419         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 420             pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data;
 421 
 422             if ((action->type != pcmk__cluster_graph_action)
 423                 || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 424                 continue;
 425             }
 426 
 427             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 428             if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
 429                 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 430                 last_action = action->xml;
 431                 pcmk__update_graph(graph, action);
 432                 crm_notice("Failing action %d (%s): fencer terminated",
 433                            action->id, ID(action->xml));
 434             }
 435         }
 436     }
 437 
 438     if (last_action != NULL) {
 439         crm_warn("Fencer failure resulted in unrunnable actions");
 440         abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action);
 441         return TRUE;
 442     }
 443 
 444     return FALSE;
 445 }
 446 
 447 static void
 448 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 449 {
 450     te_cleanup_stonith_history_sync(st, FALSE);
 451 
 452     if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
 453         crm_err("Lost fencer connection (will attempt to reconnect)");
 454         if (!mainloop_timer_running(controld_fencer_connect_timer)) {
 455             mainloop_timer_start(controld_fencer_connect_timer);
 456         }
 457     } else {
 458         crm_info("Disconnected from fencer");
 459     }
 460 
 461     if (stonith_api) {
 462         /* the client API won't properly reconnect notifications
 463          * if they are still in the table - so remove them
 464          */
 465         if (stonith_api->state != stonith_disconnected) {
 466             stonith_api->cmds->disconnect(st);
 467         }
 468         stonith_api->cmds->remove_notification(stonith_api, NULL);
 469     }
 470 
 471     if (AM_I_DC) {
 472         fail_incompletable_stonith(controld_globals.transition_graph);
 473         trigger_graph();
 474     }
 475 }
 476 
 477 /*!
 478  * \internal
 479  * \brief Handle an event notification from the fencing API
 480  *
 481  * \param[in] st     Fencing API connection (ignored)
 482  * \param[in] event  Fencing API event notification
 483  */
 484 static void
 485 handle_fence_notification(stonith_t *st, stonith_event_t *event)
     /* [previous][next][first][last][top][bottom][index][help] */
 486 {
 487     bool succeeded = true;
 488     const char *executioner = "the cluster";
 489     const char *client = "a client";
 490     const char *reason = NULL;
 491     int exec_status;
 492 
 493     if (te_client_id == NULL) {
 494         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
 495                                          (unsigned long) getpid());
 496     }
 497 
 498     if (event == NULL) {
 499         crm_err("Notify data not found");
 500         return;
 501     }
 502 
 503     if (event->executioner != NULL) {
 504         executioner = event->executioner;
 505     }
 506     if (event->client_origin != NULL) {
 507         client = event->client_origin;
 508     }
 509 
 510     exec_status = stonith__event_execution_status(event);
 511     if ((stonith__event_exit_status(event) != CRM_EX_OK)
 512         || (exec_status != PCMK_EXEC_DONE)) {
 513         succeeded = false;
 514         if (exec_status == PCMK_EXEC_DONE) {
 515             exec_status = PCMK_EXEC_ERROR;
 516         }
 517     }
 518     reason = stonith__event_exit_reason(event);
 519 
 520     crmd_alert_fencing_op(event);
 521 
 522     if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) {
 523         // Unfencing doesn't need special handling, just a log message
 524         if (succeeded) {
 525             crm_notice("%s was unfenced by %s at the request of %s@%s",
 526                        event->target, executioner, client, event->origin);
 527         } else {
 528             crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
 529                     event->target, executioner,
 530                     pcmk_exec_status_str(exec_status),
 531                     ((reason == NULL)? "" : ": "),
 532                     ((reason == NULL)? "" : reason),
 533                     stonith__event_exit_status(event));
 534         }
 535         return;
 536     }
 537 
 538     if (succeeded
 539         && pcmk__str_eq(event->target, controld_globals.our_nodename,
 540                         pcmk__str_casei)) {
 541         /* We were notified of our own fencing. Most likely, either fencing was
 542          * misconfigured, or fabric fencing that doesn't cut cluster
 543          * communication is in use.
 544          *
 545          * Either way, shutting down the local host is a good idea, to require
 546          * administrator intervention. Also, other nodes would otherwise likely
 547          * set our status to lost because of the fencing callback and discard
 548          * our subsequent election votes as "not part of our cluster".
 549          */
 550         crm_crit("We were allegedly just fenced by %s for %s!",
 551                  executioner, event->origin); // Dumps blackbox if enabled
 552         if (fence_reaction_panic) {
 553             pcmk__panic(__func__);
 554         } else {
 555             crm_exit(CRM_EX_FATAL);
 556         }
 557         return; // Should never get here
 558     }
 559 
 560     /* Update the count of fencing failures for this target, in case we become
 561      * DC later. The current DC has already updated its fail count in
 562      * tengine_stonith_callback().
 563      */
 564     if (!AM_I_DC) {
 565         if (succeeded) {
 566             st_fail_count_reset(event->target);
 567         } else {
 568             st_fail_count_increment(event->target);
 569         }
 570     }
 571 
 572     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
 573                "%s%s%s%s " CRM_XS " event=%s",
 574                event->target, (succeeded? "" : " not"),
 575                event->action, executioner, client, event->origin,
 576                (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
 577                ((reason == NULL)? "" : " ("),
 578                ((reason == NULL)? "" : reason),
 579                ((reason == NULL)? "" : ")"),
 580                event->id);
 581 
 582     if (succeeded) {
 583         crm_node_t *peer = pcmk__search_known_node_cache(0, event->target,
 584                                                          CRM_GET_PEER_ANY);
 585         const char *uuid = NULL;
 586 
 587         if (peer == NULL) {
 588             return;
 589         }
 590 
 591         uuid = crm_peer_uuid(peer);
 592 
 593         if (AM_I_DC) {
 594             /* The DC always sends updates */
 595             send_stonith_update(NULL, event->target, uuid);
 596 
 597             /* @TODO Ideally, at this point, we'd check whether the fenced node
 598              * hosted any guest nodes, and call remote_node_down() for them.
 599              * Unfortunately, the controller doesn't have a simple, reliable way
 600              * to map hosts to guests. It might be possible to track this in the
 601              * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
 602              * on the scheduler creating fence pseudo-events for the guests.
 603              */
 604 
 605             if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
 606                 /* Abort the current transition if it wasn't the cluster that
 607                  * initiated fencing.
 608                  */
 609                 crm_info("External fencing operation from %s fenced %s",
 610                          client, event->target);
 611                 abort_transition(INFINITY, pcmk__graph_restart,
 612                                  "External Fencing Operation", NULL);
 613             }
 614 
 615         } else if (pcmk__str_eq(controld_globals.dc_name, event->target,
 616                                 pcmk__str_null_matches|pcmk__str_casei)
 617                    && !pcmk_is_set(peer->flags, crm_remote_node)) {
 618             // Assume the target was our DC if we don't currently have one
 619 
 620             if (controld_globals.dc_name != NULL) {
 621                 crm_notice("Fencing target %s was our DC", event->target);
 622             } else {
 623                 crm_notice("Fencing target %s may have been our DC",
 624                            event->target);
 625             }
 626 
 627             /* Given the CIB resyncing that occurs around elections,
 628              * have one node update the CIB now and, if the new DC is different,
 629              * have them do so too after the election
 630              */
 631             if (pcmk__str_eq(event->executioner, controld_globals.our_nodename,
 632                              pcmk__str_casei)) {
 633                 send_stonith_update(NULL, event->target, uuid);
 634             }
 635             add_stonith_cleanup(event->target);
 636         }
 637 
 638         /* If the target is a remote node, and we host its connection,
 639          * immediately fail all monitors so it can be recovered quickly.
 640          * The connection won't necessarily drop when a remote node is fenced,
 641          * so the failure might not otherwise be detected until the next poke.
 642          */
 643         if (pcmk_is_set(peer->flags, crm_remote_node)) {
 644             remote_ra_fail(event->target);
 645         }
 646 
 647         crmd_peer_down(peer, TRUE);
 648      }
 649 }
 650 
 651 /*!
 652  * \brief Connect to fencer
 653  *
 654  * \param[in] user_data  If NULL, retry failures now, otherwise retry in mainloop timer
 655  *
 656  * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry
 657  * \note If user_data is NULL, this will wait 2s between attempts, for up to
 658  *       30 attempts, meaning the controller could be blocked as long as 58s.
 659  */
 660 gboolean
 661 controld_timer_fencer_connect(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 662 {
 663     int rc = pcmk_ok;
 664 
 665     if (stonith_api == NULL) {
 666         stonith_api = stonith_api_new();
 667         if (stonith_api == NULL) {
 668             crm_err("Could not connect to fencer: API memory allocation failed");
 669             return G_SOURCE_REMOVE;
 670         }
 671     }
 672 
 673     if (stonith_api->state != stonith_disconnected) {
 674         crm_trace("Already connected to fencer, no need to retry");
 675         return G_SOURCE_REMOVE;
 676     }
 677 
 678     if (user_data == NULL) {
 679         // Blocking (retry failures now until successful)
 680         rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
 681         if (rc != pcmk_ok) {
 682             crm_err("Could not connect to fencer in 30 attempts: %s "
 683                     CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 684         }
 685     } else {
 686         // Non-blocking (retry failures later in main loop)
 687         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
 688 
 689         if (controld_fencer_connect_timer == NULL) {
 690             controld_fencer_connect_timer =
 691                 mainloop_timer_add("controld_fencer_connect", 1000,
 692                                    TRUE, controld_timer_fencer_connect,
 693                                    GINT_TO_POINTER(TRUE));
 694         }
 695 
 696         if (rc != pcmk_ok) {
 697             if (pcmk_is_set(controld_globals.fsa_input_register,
 698                             R_ST_REQUIRED)) {
 699                 crm_notice("Fencer connection failed (will retry): %s "
 700                            CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 701 
 702                 if (!mainloop_timer_running(controld_fencer_connect_timer)) {
 703                     mainloop_timer_start(controld_fencer_connect_timer);
 704                 }
 705 
 706                 return G_SOURCE_CONTINUE;
 707             } else {
 708                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
 709                          CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 710             }
 711             return G_SOURCE_REMOVE;
 712         }
 713     }
 714 
 715     if (rc == pcmk_ok) {
 716         stonith_api->cmds->register_notification(stonith_api,
 717                                                  T_STONITH_NOTIFY_DISCONNECT,
 718                                                  tengine_stonith_connection_destroy);
 719         stonith_api->cmds->register_notification(stonith_api,
 720                                                  T_STONITH_NOTIFY_FENCE,
 721                                                  handle_fence_notification);
 722         stonith_api->cmds->register_notification(stonith_api,
 723                                                  T_STONITH_NOTIFY_HISTORY_SYNCED,
 724                                                  tengine_stonith_history_synced);
 725         te_trigger_stonith_history_sync(TRUE);
 726         crm_notice("Fencer successfully connected");
 727     }
 728 
 729     return G_SOURCE_REMOVE;
 730 }
 731 
 732 void
 733 controld_disconnect_fencer(bool destroy)
     /* [previous][next][first][last][top][bottom][index][help] */
 734 {
 735     if (stonith_api) {
 736         // Prevent fencer connection from coming up again
 737         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 738 
 739         if (stonith_api->state != stonith_disconnected) {
 740             stonith_api->cmds->disconnect(stonith_api);
 741         }
 742         stonith_api->cmds->remove_notification(stonith_api, NULL);
 743     }
 744     if (destroy) {
 745         if (stonith_api) {
 746             stonith_api->cmds->free(stonith_api);
 747             stonith_api = NULL;
 748         }
 749         if (controld_fencer_connect_timer) {
 750             mainloop_timer_del(controld_fencer_connect_timer);
 751             controld_fencer_connect_timer = NULL;
 752         }
 753         if (te_client_id) {
 754             free(te_client_id);
 755             te_client_id = NULL;
 756         }
 757     }
 758 }
 759 
 760 static gboolean
 761 do_stonith_history_sync(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 762 {
 763     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 764         stonith_history_t *history = NULL;
 765 
 766         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 767         stonith_api->cmds->history(stonith_api,
 768                                    st_opt_sync_call | st_opt_broadcast,
 769                                    NULL, &history, 5);
 770         stonith_history_free(history);
 771         return TRUE;
 772     } else {
 773         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 774         return FALSE;
 775     }
 776 }
 777 
 778 static void
 779 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 780 {
 781     char *uuid = NULL;
 782     int stonith_id = -1;
 783     int transition_id = -1;
 784     pcmk__graph_action_t *action = NULL;
 785     const char *target = NULL;
 786 
 787     if ((data == NULL) || (data->userdata == NULL)) {
 788         crm_err("Ignoring fence operation %d result: "
 789                 "No transition key given (bug?)",
 790                 ((data == NULL)? -1 : data->call_id));
 791         return;
 792     }
 793 
 794     if (!AM_I_DC) {
 795         const char *reason = stonith__exit_reason(data);
 796 
 797         if (reason == NULL) {
 798            reason = pcmk_exec_status_str(stonith__execution_status(data));
 799         }
 800         crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
 801                    data->call_id, stonith__exit_status(data), reason,
 802                    (const char *) data->userdata);
 803         return;
 804     }
 805 
 806     CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
 807                                     &stonith_id, NULL),
 808               goto bail);
 809 
 810     if (controld_globals.transition_graph->complete || (stonith_id < 0)
 811         || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none)
 812         || (controld_globals.transition_graph->id != transition_id)) {
 813         crm_info("Ignoring fence operation %d result: "
 814                  "Not from current transition " CRM_XS
 815                  " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
 816                  data->call_id,
 817                  pcmk__btoa(controld_globals.transition_graph->complete),
 818                  stonith_id, uuid, controld_globals.te_uuid, transition_id,
 819                  controld_globals.transition_graph->id);
 820         goto bail;
 821     }
 822 
 823     action = controld_get_action(stonith_id);
 824     if (action == NULL) {
 825         crm_err("Ignoring fence operation %d result: "
 826                 "Action %d not found in transition graph (bug?) "
 827                 CRM_XS " uuid=%s transition=%d",
 828                 data->call_id, stonith_id, uuid, transition_id);
 829         goto bail;
 830     }
 831 
 832     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 833     if (target == NULL) {
 834         crm_err("Ignoring fence operation %d result: No target given (bug?)",
 835                 data->call_id);
 836         goto bail;
 837     }
 838 
 839     stop_te_timer(action);
 840     if (stonith__exit_status(data) == CRM_EX_OK) {
 841         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 842         const char *op = crm_meta_value(action->params, "stonith_action");
 843 
 844         crm_info("Fence operation %d for %s succeeded", data->call_id, target);
 845         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
 846             te_action_confirmed(action, NULL);
 847             if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) {
 848                 const char *value = NULL;
 849                 char *now = pcmk__ttoa(time(NULL));
 850                 gboolean is_remote_node = FALSE;
 851 
 852                 /* This check is not 100% reliable, since this node is not
 853                  * guaranteed to have the remote node cached. However, it
 854                  * doesn't have to be reliable, since the attribute manager can
 855                  * learn a node's "remoteness" by other means sooner or later.
 856                  * This allows it to learn more quickly if this node does have
 857                  * the information.
 858                  */
 859                 if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
 860                     is_remote_node = TRUE;
 861                 }
 862 
 863                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 864                              is_remote_node);
 865                 free(now);
 866 
 867                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
 868                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 869                              is_remote_node);
 870 
 871                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
 872                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 873                              is_remote_node);
 874 
 875             } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
 876                 send_stonith_update(action, target, uuid);
 877                 pcmk__set_graph_action_flags(action,
 878                                              pcmk__graph_action_sent_update);
 879             }
 880         }
 881         st_fail_count_reset(target);
 882 
 883     } else {
 884         enum pcmk__graph_next abort_action = pcmk__graph_restart;
 885         int status = stonith__execution_status(data);
 886         const char *reason = stonith__exit_reason(data);
 887 
 888         if (reason == NULL) {
 889             if (status == PCMK_EXEC_DONE) {
 890                 reason = "Agent returned error";
 891             } else {
 892                 reason = pcmk_exec_status_str(status);
 893             }
 894         }
 895         pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 896 
 897         /* If no fence devices were available, there's no use in immediately
 898          * checking again, so don't start a new transition in that case.
 899          */
 900         if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
 901             crm_warn("Fence operation %d for %s failed: %s "
 902                      "(aborting transition and giving up for now)",
 903                      data->call_id, target, reason);
 904             abort_action = pcmk__graph_wait;
 905         } else {
 906             crm_notice("Fence operation %d for %s failed: %s "
 907                        "(aborting transition)", data->call_id, target, reason);
 908         }
 909 
 910         /* Increment the fail count now, so abort_for_stonith_failure() can
 911          * check it. Non-DC nodes will increment it in
 912          * handle_fence_notification().
 913          */
 914         st_fail_count_increment(target);
 915         abort_for_stonith_failure(abort_action, target, NULL);
 916     }
 917 
 918     pcmk__update_graph(controld_globals.transition_graph, action);
 919     trigger_graph();
 920 
 921   bail:
 922     free(data->userdata);
 923     free(uuid);
 924     return;
 925 }
 926 
 927 static int
 928 fence_with_delay(const char *target, const char *type, int delay)
     /* [previous][next][first][last][top][bottom][index][help] */
 929 {
 930     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 931     int timeout_sec = (int) (controld_globals.transition_graph->stonith_timeout
 932                              / 1000);
 933 
 934     if (crmd_join_phase_count(crm_join_confirmed) == 1) {
 935         stonith__set_call_options(options, target, st_opt_allow_suicide);
 936     }
 937     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 938                                                type, timeout_sec, 0, delay);
 939 }
 940 
 941 /*!
 942  * \internal
 943  * \brief Execute a fencing action from a transition graph
 944  *
 945  * \param[in] graph   Transition graph being executed (ignored)
 946  * \param[in] action  Fencing action to execute
 947  *
 948  * \return Standard Pacemaker return code
 949  */
 950 int
 951 controld_execute_fence_action(pcmk__graph_t *graph,
     /* [previous][next][first][last][top][bottom][index][help] */
 952                               pcmk__graph_action_t *action)
 953 {
 954     int rc = 0;
 955     const char *id = ID(action->xml);
 956     const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 957     const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 958     const char *type = crm_meta_value(action->params, "stonith_action");
 959     char *transition_key = NULL;
 960     const char *priority_delay = NULL;
 961     int delay_i = 0;
 962     gboolean invalid_action = FALSE;
 963     int stonith_timeout = (int) (controld_globals.transition_graph->stonith_timeout
 964                                  / 1000);
 965 
 966     CRM_CHECK(id != NULL, invalid_action = TRUE);
 967     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 968     CRM_CHECK(type != NULL, invalid_action = TRUE);
 969     CRM_CHECK(target != NULL, invalid_action = TRUE);
 970 
 971     if (invalid_action) {
 972         crm_log_xml_warn(action->xml, "BadAction");
 973         return EPROTO;
 974     }
 975 
 976     priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
 977 
 978     crm_notice("Requesting fencing (%s) targeting node %s "
 979                CRM_XS " action=%s timeout=%i%s%s",
 980                type, target, id, stonith_timeout,
 981                priority_delay ? " priority_delay=" : "",
 982                priority_delay ? priority_delay : "");
 983 
 984     /* Passing NULL means block until we can connect... */
 985     controld_timer_fencer_connect(NULL);
 986 
 987     pcmk__scan_min_int(priority_delay, &delay_i, 0);
 988     rc = fence_with_delay(target, type, delay_i);
 989     transition_key = pcmk__transition_key(controld_globals.transition_graph->id,
 990                                           action->id, 0,
 991                                           controld_globals.te_uuid),
 992     stonith_api->cmds->register_callback(stonith_api, rc,
 993                                          (stonith_timeout
 994                                           + (delay_i > 0 ? delay_i : 0)),
 995                                          st_opt_timeout_updates, transition_key,
 996                                          "tengine_stonith_callback",
 997                                          tengine_stonith_callback);
 998     return pcmk_rc_ok;
 999 }
1000 
1001 bool
1002 controld_verify_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
1003 {
1004     long st_timeout = value? crm_get_msec(value) : 0;
1005     const char *our_nodename = controld_globals.our_nodename;
1006     gboolean rv = TRUE;
1007 
1008     if (st_timeout == 0
1009         || (stonith_api && (stonith_api->state != stonith_disconnected) &&
1010             stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
1011                                                            our_nodename))) {
1012         rv = pcmk__valid_sbd_timeout(value);
1013     }
1014     return rv;
1015 }
1016 
1017 /* end stonith API client functions */
1018 
1019 
1020 /*
1021  * stonith history synchronization
1022  *
1023  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
1024  * joins or leaves, we need to synchronize the history across all nodes.
1025  */
1026 
// Mainloop trigger that runs do_stonith_history_sync(); created on first use
static crm_trigger_t *stonith_history_sync_trigger = NULL;

// Timer created with a period of 5000; on expiry it sets the trigger above
static mainloop_timer_t *stonith_history_sync_timer_short = NULL;

// Timer created with a period of 30000; on expiry it sets the trigger above
static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
1030 
1031 void
1032 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
     /* [previous][next][first][last][top][bottom][index][help] */
1033 {
1034     if (free_timers) {
1035         mainloop_timer_del(stonith_history_sync_timer_short);
1036         stonith_history_sync_timer_short = NULL;
1037         mainloop_timer_del(stonith_history_sync_timer_long);
1038         stonith_history_sync_timer_long = NULL;
1039     } else {
1040         mainloop_timer_stop(stonith_history_sync_timer_short);
1041         mainloop_timer_stop(stonith_history_sync_timer_long);
1042     }
1043 
1044     if (st) {
1045         st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
1046     }
1047 }
1048 
1049 static void
1050 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
1051 {
1052     te_cleanup_stonith_history_sync(st, FALSE);
1053     crm_debug("Fence-history synced - cancel all timers");
1054 }
1055 
1056 static gboolean
1057 stonith_history_sync_set_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1058 {
1059     mainloop_set_trigger(stonith_history_sync_trigger);
1060     return FALSE;
1061 }
1062 
1063 void
1064 te_trigger_stonith_history_sync(bool long_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1065 {
1066     /* trigger a sync in 5s to give more nodes the
1067      * chance to show up so that we don't create
1068      * unnecessary stonith-history-sync traffic
1069      *
1070      * the long timeout of 30s is there as a fallback
1071      * so that after a successful connection to fenced
1072      * we will wait for 30s for the DC to trigger a
1073      * history-sync
1074      * if this doesn't happen we trigger a sync locally
1075      * (e.g. fenced segfaults and is restarted by pacemakerd)
1076      */
1077 
1078     /* as we are finally checking the stonith-connection
1079      * in do_stonith_history_sync we should be fine
1080      * leaving stonith_history_sync_time & stonith_history_sync_trigger
1081      * around
1082      */
1083     if (stonith_history_sync_trigger == NULL) {
1084         stonith_history_sync_trigger =
1085             mainloop_add_trigger(G_PRIORITY_LOW,
1086                                  do_stonith_history_sync, NULL);
1087     }
1088 
1089     if (long_timeout) {
1090         if(stonith_history_sync_timer_long == NULL) {
1091             stonith_history_sync_timer_long =
1092                 mainloop_timer_add("history_sync_long", 30000,
1093                                    FALSE, stonith_history_sync_set_trigger,
1094                                    NULL);
1095         }
1096         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
1097         mainloop_timer_start(stonith_history_sync_timer_long);
1098     } else {
1099         if(stonith_history_sync_timer_short == NULL) {
1100             stonith_history_sync_timer_short =
1101                 mainloop_timer_add("history_sync_short", 5000,
1102                                    FALSE, stonith_history_sync_set_trigger,
1103                                    NULL);
1104         }
1105         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
1106         mainloop_timer_start(stonith_history_sync_timer_short);
1107     }
1108 
1109 }
1110 
1111 /* end stonith history synchronization functions */

/* [previous][next][first][last][top][bottom][index][help] */