root/daemons/fenced/fenced_remote.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sort_strings
  2. free_remote_query
  3. free_stonith_remote_op_list
  4. count_peer_device
  5. count_peer_devices
  6. find_peer_device
  7. grab_peer_device
  8. clear_remote_op_timers
  9. free_remote_op
  10. init_stonith_remote_op_hash_table
  11. op_requested_action
  12. op_phase_off
  13. op_phase_on
  14. undo_op_remap
  15. fencing_result2xml
  16. fenced_broadcast_op_result
  17. handle_local_reply_and_notify
  18. finalize_op_duplicates
  19. delegate_from_xml
  20. finalize_op
  21. remote_op_watchdog_done
  22. remote_op_timeout_one
  23. finalize_timed_out_op
  24. remote_op_timeout
  25. remote_op_query_timeout
  26. topology_is_empty
  27. add_required_device
  28. remove_required_device
  29. set_op_device_list
  30. topology_matches
  31. find_topology_for_host
  32. advance_topology_level
  33. merge_duplicates
  34. fencing_active_peers
  35. fenced_handle_manual_confirmation
  36. create_remote_stonith_op
  37. initiate_remote_stonith_op
  38. find_best_peer
  39. stonith_choose_peer
  40. get_device_timeout
  41. add_device_timeout
  42. get_peer_timeout
  43. get_op_total_timeout
  44. report_timeout_period
  45. advance_topology_device_in_level
  46. check_watchdog_fencing_and_wait
  47. request_peer_fencing
  48. sort_peers
  49. all_topology_devices_found
  50. parse_action_specific
  51. add_device_properties
  52. add_result
  53. process_remote_stonith_query
  54. fenced_process_fencing_reply
  55. stonith_check_fence_tolerance

   1 /*
   2  * Copyright 2009-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sys/param.h>
  13 #include <stdio.h>
  14 #include <sys/types.h>
  15 #include <sys/wait.h>
  16 #include <sys/stat.h>
  17 #include <unistd.h>
  18 #include <sys/utsname.h>
  19 
  20 #include <stdlib.h>
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <ctype.h>
  24 #include <regex.h>
  25 
  26 #include <crm/crm.h>
  27 #include <crm/msg_xml.h>
  28 #include <crm/common/ipc.h>
  29 #include <crm/common/ipc_internal.h>
  30 #include <crm/cluster/internal.h>
  31 
  32 #include <crm/stonith-ng.h>
  33 #include <crm/fencing/internal.h>
  34 #include <crm/common/xml.h>
  35 #include <crm/common/xml_internal.h>
  36 
  37 #include <crm/common/util.h>
  38 #include <pacemaker-fenced.h>
  39 
  40 #define TIMEOUT_MULTIPLY_FACTOR 1.2 /* NOTE(review): not used in this part of the file; presumably a margin multiplied into timeouts -- confirm at point of use */
  41 
  42 /* When one fencer queries its peers for devices able to handle a fencing
  43  * request, each peer will reply with a list of such devices available to it.
  44  * Each reply will be parsed into a peer_device_info_t, with each device's
  45  * information kept in a device_properties_t.
  46  */
  47 
  48 typedef struct device_properties_s {
     /* Properties of one fence device as reported by one peer; instances are
      * the values of a peer_device_info_t's "devices" table, keyed by device ID
      */

  49     /* Whether access to this device has been verified */
  50     gboolean verified;
  51 
  52     /* The array members below are indexed by the operation's "phase" */
  53 
  54     /* Whether this device has already been executed in each phase; such
     * devices are neither counted nor returned by the device lookups here */
  55     gboolean executed[st_phase_max];
  56     /* Whether this device is disallowed from executing in each phase */
  57     gboolean disallowed[st_phase_max];
  58     /* Action-specific timeout for each phase */
  59     int custom_action_timeout[st_phase_max];
  60     /* Action-specific maximum random delay for each phase */
  61     int delay_max[st_phase_max];
  62     /* Action-specific base delay for each phase */
  63     int delay_base[st_phase_max];
  64     /* Group of enum st_device_flags (not indexed by phase) */
  65     uint32_t device_support_flags;
  66 } device_properties_t;
  67 
  68 typedef struct {
     /* One peer's reply to a device query; released by free_remote_query() */

  69     /* Name of peer that sent this result (released with free()) */
  70     char *host;
  71     /* Only try peers for non-topology based operations once */
  72     gboolean tried;
  73     /* Number of entries in the devices table */
  74     int ndevices;
  75     /* Devices available to this host that are capable of fencing the target
     * (device ID -> device_properties_t) */
  76     GHashTable *devices;
  77 } peer_device_info_t;
  78 
  79 GHashTable *stonith_remote_op_list = NULL;
     /* ^ Table of remote fencing operations; destroyed and reset to NULL by
      *   free_stonith_remote_op_list()
      */
  80 
     /* Defined outside this file */
  81 extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
  82                                   int call_options);
  83 
     /* Forward declarations for static functions that are used before their
      * definitions appear in this file
      */
  84 static void request_peer_fencing(remote_fencing_op_t *op,
  85                                  peer_device_info_t *peer);
  86 static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
  87 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
  88 static int get_op_total_timeout(const remote_fencing_op_t *op,
  89                                 const peer_device_info_t *chosen_peer);
  90 
  91 static gint
  92 sort_strings(gconstpointer a, gconstpointer b)
     /* [previous][next][first][last][top][bottom][index][help] */
  93 {
  94     return strcmp(a, b);
  95 }
  96 
  97 static void
  98 free_remote_query(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
  99 {
 100     if (data != NULL) {
 101         peer_device_info_t *peer = data;
 102 
 103         g_hash_table_destroy(peer->devices);
 104         free(peer->host);
 105         free(peer);
 106     }
 107 }
 108 
 109 void
 110 free_stonith_remote_op_list(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 111 {
 112     if (stonith_remote_op_list != NULL) {
 113         g_hash_table_destroy(stonith_remote_op_list);
 114         stonith_remote_op_list = NULL;
 115     }
 116 }
 117 
 118 struct peer_count_data {
     /* Filter criteria and running total handed to count_peer_device() as its
      * user data while iterating over a peer's device table
      */

     /* Operation whose current phase selects the "executed" slot to check */
 119     const remote_fencing_op_t *op;
     /* If true, count only devices whose "verified" member is set */
 120     gboolean verified_only;
     /* If not st_device_supports_none, count only devices with these support
      * flags set */
 121     uint32_t support_action_only;
     /* Number of matching devices found so far */
 122     int count;
 123 };
 124 
 125 /*!
 126  * \internal
 127  * \brief Increment a counter if a device has not been executed yet
 128  *
 129  * \param[in]     key        Device ID (ignored)
 130  * \param[in]     value      Device properties
 131  * \param[in,out] user_data  Peer count data
 132  */
 133 static void
 134 count_peer_device(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 135 {
 136     device_properties_t *props = (device_properties_t*)value;
 137     struct peer_count_data *data = user_data;
 138 
 139     if (!props->executed[data->op->phase]
 140         && (!data->verified_only || props->verified)
 141         && ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) {
 142         ++(data->count);
 143     }
 144 }
 145 
 146 /*!
 147  * \internal
 148  * \brief Check the number of available devices in a peer's query results
 149  *
 150  * \param[in] op             Operation that results are for
 151  * \param[in] peer           Peer to count
 152  * \param[in] verified_only  Whether to count only verified devices
 153  * \param[in] support_action_only Whether to count only devices that support action
 154  *
 155  * \return Number of devices available to peer that were not already executed
 156  */
 157 static int
 158 count_peer_devices(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
 159                    const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
 160 {
 161     struct peer_count_data data;
 162 
 163     data.op = op;
 164     data.verified_only = verified_only;
 165     data.support_action_only = support_on_action_only;
 166     data.count = 0;
 167     if (peer) {
 168         g_hash_table_foreach(peer->devices, count_peer_device, &data);
 169     }
 170     return data.count;
 171 }
 172 
 173 /*!
 174  * \internal
 175  * \brief Search for a device in a query result
 176  *
 177  * \param[in] op      Operation that result is for
 178  * \param[in] peer    Query result for a peer
 179  * \param[in] device  Device ID to search for
 180  *
 181  * \return Device properties if found, NULL otherwise
 182  */
 183 static device_properties_t *
 184 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
     /* [previous][next][first][last][top][bottom][index][help] */
 185                  const char *device, uint32_t support_action_only)
 186 {
 187     device_properties_t *props = g_hash_table_lookup(peer->devices, device);
 188 
 189     if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) {
 190         return NULL;
 191     }
 192     return (props && !props->executed[op->phase]
 193            && !props->disallowed[op->phase])? props : NULL;
 194 }
 195 
 196 /*!
 197  * \internal
 198  * \brief Find a device in a peer's device list and mark it as executed
 199  *
 200  * \param[in]     op                     Operation that peer result is for
 201  * \param[in,out] peer                   Peer with results to search
 202  * \param[in]     device                 ID of device to mark as done
 203  * \param[in]     verified_devices_only  Only consider verified devices
 204  *
 205  * \return TRUE if device was found and marked, FALSE otherwise
 206  */
 207 static gboolean
 208 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
     /* [previous][next][first][last][top][bottom][index][help] */
 209                  const char *device, gboolean verified_devices_only)
 210 {
 211     device_properties_t *props = find_peer_device(op, peer, device,
 212                                                   fenced_support_flag(op->action));
 213 
 214     if ((props == NULL) || (verified_devices_only && !props->verified)) {
 215         return FALSE;
 216     }
 217 
 218     crm_trace("Removing %s from %s (%d remaining)",
 219               device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none));
 220     props->executed[op->phase] = TRUE;
 221     return TRUE;
 222 }
 223 
 224 static void
 225 clear_remote_op_timers(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 226 {
 227     if (op->query_timer) {
 228         g_source_remove(op->query_timer);
 229         op->query_timer = 0;
 230     }
 231     if (op->op_timer_total) {
 232         g_source_remove(op->op_timer_total);
 233         op->op_timer_total = 0;
 234     }
 235     if (op->op_timer_one) {
 236         g_source_remove(op->op_timer_one);
 237         op->op_timer_one = 0;
 238     }
 239 }
 240 
 241 static void
 242 free_remote_op(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 243 {
 244     remote_fencing_op_t *op = data;
 245 
 246     crm_log_xml_debug(op->request, "Destroying");
 247 
 248     clear_remote_op_timers(op);
 249 
 250     free(op->id);
 251     free(op->action);
 252     free(op->delegate);
 253     free(op->target);
 254     free(op->client_id);
 255     free(op->client_name);
 256     free(op->originator);
 257 
 258     if (op->query_results) {
 259         g_list_free_full(op->query_results, free_remote_query);
 260     }
 261     if (op->request) {
 262         free_xml(op->request);
 263         op->request = NULL;
 264     }
 265     if (op->devices_list) {
 266         g_list_free_full(op->devices_list, free);
 267         op->devices_list = NULL;
 268     }
 269     g_list_free_full(op->automatic_list, free);
 270     g_list_free(op->duplicates);
 271 
 272     pcmk__reset_result(&op->result);
 273     free(op);
 274 }
 275 
 276 void
 277 init_stonith_remote_op_hash_table(GHashTable **table)
     /* [previous][next][first][last][top][bottom][index][help] */
 278 {
 279     if (*table == NULL) {
 280         *table = pcmk__strkey_table(NULL, free_remote_op);
 281     }
 282 }
 283 
 284 /*!
 285  * \internal
 286  * \brief Return an operation's originally requested action (before any remap)
 287  *
 288  * \param[in] op  Operation to check
 289  *
 290  * \return Operation's original action
 291  */
 292 static const char *
 293 op_requested_action(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 294 {
 295     return ((op->phase > st_phase_requested)? PCMK_ACTION_REBOOT : op->action);
 296 }
 297 
 298 /*!
 299  * \internal
 300  * \brief Remap a "reboot" operation to the "off" phase
 301  *
 302  * \param[in,out] op      Operation to remap
 303  */
 304 static void
 305 op_phase_off(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 306 {
 307     crm_info("Remapping multiple-device reboot targeting %s to 'off' "
 308              CRM_XS " id=%.8s", op->target, op->id);
 309     op->phase = st_phase_off;
 310 
 311     /* Happily, "off" and "on" are shorter than "reboot", so we can reuse the
 312      * memory allocation at each phase.
 313      */
 314     strcpy(op->action, PCMK_ACTION_OFF);
 315 }
 316 
 317 /*!
 318  * \internal
 319  * \brief Advance a remapped reboot operation to the "on" phase
 320  *
 321  * \param[in,out] op  Operation to remap
 322  */
 323 static void
 324 op_phase_on(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 325 {
 326     GList *iter = NULL;
 327 
 328     crm_info("Remapped 'off' targeting %s complete, "
 329              "remapping to 'on' for %s " CRM_XS " id=%.8s",
 330              op->target, op->client_name, op->id);
 331     op->phase = st_phase_on;
 332     strcpy(op->action, PCMK_ACTION_ON);
 333 
 334     /* Skip devices with automatic unfencing, because the cluster will handle it
 335      * when the node rejoins.
 336      */
 337     for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
 338         GList *match = g_list_find_custom(op->devices_list, iter->data,
 339                                             sort_strings);
 340 
 341         if (match) {
 342             op->devices_list = g_list_remove(op->devices_list, match->data);
 343         }
 344     }
 345     g_list_free_full(op->automatic_list, free);
 346     op->automatic_list = NULL;
 347 
 348     /* Rewind device list pointer */
 349     op->devices = op->devices_list;
 350 }
 351 
 352 /*!
 353  * \internal
 354  * \brief Reset a remapped reboot operation
 355  *
 356  * \param[in,out] op  Operation to reset
 357  */
 358 static void
 359 undo_op_remap(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 360 {
 361     if (op->phase > 0) {
 362         crm_info("Undoing remap of reboot targeting %s for %s "
 363                  CRM_XS " id=%.8s", op->target, op->client_name, op->id);
 364         op->phase = st_phase_requested;
 365         strcpy(op->action, PCMK_ACTION_REBOOT);
 366     }
 367 }
 368 
 369 /*!
 370  * \internal
 371  * \brief Create notification data XML for a fencing operation result
 372  *
 373  * \param[in] op      Fencer operation that completed
 374  *
 375  * \return Newly created XML to add as notification data
 376  * \note The caller is responsible for freeing the result.
 377  */
 378 static xmlNode *
 379 fencing_result2xml(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 380 {
 381     xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
 382 
 383     crm_xml_add_int(notify_data, "state", op->state);
 384     crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
 385     crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
 386     crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
 387     crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
 388     crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
 389     crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
 390     crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
 391 
 392     return notify_data;
 393 }
 394 
 395 /*!
 396  * \internal
 397  * \brief Broadcast a fence result notification to all CPG peers
 398  *
 399  * \param[in] op         Fencer operation that completed
 400  * \param[in] op_merged  Whether this operation is a duplicate of another
 401  */
 402 void
 403 fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
     /* [previous][next][first][last][top][bottom][index][help] */
 404 {
 405     static int count = 0;
 406     xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
 407     xmlNode *notify_data = fencing_result2xml(op);
 408 
 409     count++;
 410     crm_trace("Broadcasting result to peers");
 411     crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
 412     crm_xml_add(bcast, F_SUBTYPE, "broadcast");
 413     crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
 414     crm_xml_add_int(bcast, "count", count);
 415 
 416     if (op_merged) {
 417         pcmk__xe_set_bool_attr(bcast, F_STONITH_MERGED, true);
 418     }
 419 
 420     stonith__xe_set_result(notify_data, &op->result);
 421 
 422     add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
 423     send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
 424     free_xml(notify_data);
 425     free_xml(bcast);
 426 
 427     return;
 428 }
 429 
 430 /*!
 431  * \internal
 432  * \brief Reply to a local request originator and notify all subscribed clients
 433  *
 434  * \param[in,out] op    Fencer operation that completed
 435  * \param[in,out] data  Top-level XML to add notification to
 436  */
 437 static void
 438 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 439 {
 440     xmlNode *notify_data = NULL;
 441     xmlNode *reply = NULL;
 442     pcmk__client_t *client = NULL;
 443 
 444     if (op->notify_sent == TRUE) {
 445         /* nothing to do */
 446         return;
 447     }
 448 
 449     /* Do notification with a clean data object */
 450     crm_xml_add_int(data, "state", op->state);
 451     crm_xml_add(data, F_STONITH_TARGET, op->target);
 452     crm_xml_add(data, F_STONITH_OPERATION, op->action);
 453 
 454     reply = fenced_construct_reply(op->request, data, &op->result);
 455     crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
 456 
 457     /* Send fencing OP reply to local client that initiated fencing */
 458     client = pcmk__find_client_by_id(op->client_id);
 459     if (client == NULL) {
 460         crm_trace("Skipping reply to %s: no longer a client", op->client_id);
 461     } else {
 462         do_local_reply(reply, client, op->call_options);
 463     }
 464 
 465     /* bcast to all local clients that the fencing operation happend */
 466     notify_data = fencing_result2xml(op);
 467     fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data);
 468     free_xml(notify_data);
 469     fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
 470 
 471     /* mark this op as having notify's already sent */
 472     op->notify_sent = TRUE;
 473     free_xml(reply);
 474 }
 475 
 476 /*!
 477  * \internal
 478  * \brief Finalize all duplicates of a given fencer operation
 479  *
 480  * \param[in,out] op    Fencer operation that completed
 481  * \param[in,out] data  Top-level XML to add notification to
 482  */
 483 static void
 484 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 485 {
 486     for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
 487         remote_fencing_op_t *other = iter->data;
 488 
 489         if (other->state == st_duplicate) {
 490             other->state = op->state;
 491             crm_debug("Performing duplicate notification for %s@%s: %s "
 492                       CRM_XS " id=%.8s",
 493                       other->client_name, other->originator,
 494                       pcmk_exec_status_str(op->result.execution_status),
 495                       other->id);
 496             pcmk__copy_result(&op->result, &other->result);
 497             finalize_op(other, data, true);
 498 
 499         } else {
 500             // Possible if (for example) it timed out already
 501             crm_err("Skipping duplicate notification for %s@%s "
 502                     CRM_XS " state=%s id=%.8s",
 503                     other->client_name, other->originator,
 504                     stonith_op_state_str(other->state), other->id);
 505         }
 506     }
 507 }
 508 
 509 static char *
 510 delegate_from_xml(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
 511 {
 512     xmlNode *match = get_xpath_object("//@" F_STONITH_DELEGATE, xml, LOG_NEVER);
 513 
 514     if (match == NULL) {
 515         return crm_element_value_copy(xml, F_ORIG);
 516     } else {
 517         return crm_element_value_copy(match, F_STONITH_DELEGATE);
 518     }
 519 }
 520 
 521 /*!
 522  * \internal
 523  * \brief Finalize a peer fencing operation
 524  *
 525  * Clean up after a fencing operation completes. This function has two code
 526  * paths: the executioner uses it to broadcast the result to CPG peers, and then
 527  * each peer (including the executioner) uses it to process that broadcast and
 528  * notify its IPC clients of the result.
 529  *
 530  * \param[in,out] op      Fencer operation that completed
 531  * \param[in,out] data    If not NULL, XML reply of last delegated operation
 532  * \param[in]     dup     Whether this operation is a duplicate of another
 533  *                        (in which case, do not broadcast the result)
 534  *
 535  *  \note The operation result should be set before calling this function.
 536  */
 537 static void
 538 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
     /* [previous][next][first][last][top][bottom][index][help] */
 539 {
 540     int level = LOG_ERR;
 541     const char *subt = NULL;
 542     xmlNode *local_data = NULL;
 543     gboolean op_merged = FALSE;
 544 
 545     CRM_CHECK((op != NULL), return);
 546 
 547     // This is a no-op if timers have already been cleared
 548     clear_remote_op_timers(op);
 549 
 550     if (op->notify_sent) {
 551         // Most likely, this is a timed-out action that eventually completed
 552         crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
 553                    "Result arrived too late " CRM_XS " id=%.8s",
 554                    op->action, (op->target? " targeting " : ""),
 555                    (op->target? op->target : ""),
 556                    (op->delegate? op->delegate : "unknown node"),
 557                    op->client_name, op->originator,
 558                    (op_merged? " (merged)" : ""),
 559                    op->id);
 560         return;
 561     }
 562 
 563     set_fencing_completed(op);
 564     undo_op_remap(op);
 565 
 566     if (data == NULL) {
 567         data = create_xml_node(NULL, "remote-op");
 568         local_data = data;
 569 
 570     } else if (op->delegate == NULL) {
 571         switch (op->result.execution_status) {
 572             case PCMK_EXEC_NO_FENCE_DEVICE:
 573                 break;
 574 
 575             case PCMK_EXEC_INVALID:
 576                 if (op->result.exit_status != CRM_EX_EXPIRED) {
 577                     op->delegate = delegate_from_xml(data);
 578                 }
 579                 break;
 580 
 581             default:
 582                 op->delegate = delegate_from_xml(data);
 583                 break;
 584         }
 585     }
 586 
 587     if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) {
 588         op_merged = true;
 589     }
 590 
 591     /* Tell everyone the operation is done, we will continue
 592      * with doing the local notifications once we receive
 593      * the broadcast back. */
 594     subt = crm_element_value(data, F_SUBTYPE);
 595     if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
 596         /* Defer notification until the bcast message arrives */
 597         fenced_broadcast_op_result(op, op_merged);
 598         free_xml(local_data);
 599         return;
 600     }
 601 
 602     if (pcmk__result_ok(&op->result) || dup
 603         || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
 604         level = LOG_NOTICE;
 605     }
 606     do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
 607                CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
 608                (op->target? op->target : ""),
 609                (op->delegate? op->delegate : "unknown node"),
 610                op->client_name, op->originator,
 611                (op_merged? " (merged)" : ""),
 612                crm_exit_str(op->result.exit_status),
 613                pcmk_exec_status_str(op->result.execution_status),
 614                ((op->result.exit_reason == NULL)? "" : ": "),
 615                ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
 616                op->id);
 617 
 618     handle_local_reply_and_notify(op, data);
 619 
 620     if (!dup) {
 621         finalize_op_duplicates(op, data);
 622     }
 623 
 624     /* Free non-essential parts of the record
 625      * Keep the record around so we can query the history
 626      */
 627     if (op->query_results) {
 628         g_list_free_full(op->query_results, free_remote_query);
 629         op->query_results = NULL;
 630     }
 631     if (op->request) {
 632         free_xml(op->request);
 633         op->request = NULL;
 634     }
 635 
 636     free_xml(local_data);
 637 }
 638 
 639 /*!
 640  * \internal
 641  * \brief Finalize a watchdog fencer op after the waiting time expires
 642  *
 643  * \param[in,out] userdata  Fencer operation that completed
 644  *
 645  * \return G_SOURCE_REMOVE (which tells glib not to restart timer)
 646  */
 647 static gboolean
 648 remote_op_watchdog_done(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 649 {
 650     remote_fencing_op_t *op = userdata;
 651 
 652     op->op_timer_one = 0;
 653 
 654     crm_notice("Self-fencing (%s) by %s for %s assumed complete "
 655                CRM_XS " id=%.8s",
 656                op->action, op->target, op->client_name, op->id);
 657     op->state = st_done;
 658     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
 659     finalize_op(op, NULL, false);
 660     return G_SOURCE_REMOVE;
 661 }
 662 
 663 static gboolean
 664 remote_op_timeout_one(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 665 {
 666     remote_fencing_op_t *op = userdata;
 667 
 668     op->op_timer_one = 0;
 669 
 670     crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
 671                " id=%.8s", op->action, op->target, op->client_name, op->id);
 672     pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
 673                      "Peer did not return fence result within timeout");
 674 
 675     // The requested delay has been applied for the first device
 676     if (op->client_delay > 0) {
 677         op->client_delay = 0;
 678         crm_trace("Try another device for '%s' action targeting %s "
 679                   "for client %s without delay " CRM_XS " id=%.8s",
 680                   op->action, op->target, op->client_name, op->id);
 681     }
 682 
 683     // Try another device, if appropriate
 684     request_peer_fencing(op, NULL);
 685     return G_SOURCE_REMOVE;
 686 }
 687 
 688 /*!
 689  * \internal
 690  * \brief Finalize a remote fencer operation that timed out
 691  *
 692  * \param[in,out] op      Fencer operation that timed out
 693  * \param[in]     reason  Readable description of what step timed out
 694  */
 695 static void
 696 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
     /* [previous][next][first][last][top][bottom][index][help] */
 697 {
 698     crm_debug("Action '%s' targeting %s for client %s timed out "
 699               CRM_XS " id=%.8s",
 700               op->action, op->target, op->client_name, op->id);
 701 
 702     if (op->phase == st_phase_on) {
 703         /* A remapped reboot operation timed out in the "on" phase, but the
 704          * "off" phase completed successfully, so quit trying any further
 705          * devices, and return success.
 706          */
 707         op->state = st_done;
 708         pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
 709     } else {
 710         op->state = st_failed;
 711         pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
 712     }
 713     finalize_op(op, NULL, false);
 714 }
 715 
 716 /*!
 717  * \internal
 718  * \brief Finalize a remote fencer operation that timed out
 719  *
 720  * \param[in,out] userdata  Fencer operation that timed out
 721  *
 722  * \return G_SOURCE_REMOVE (which tells glib not to restart timer)
 723  */
 724 static gboolean
 725 remote_op_timeout(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 726 {
 727     remote_fencing_op_t *op = userdata;
 728 
 729     op->op_timer_total = 0;
 730 
 731     if (op->state == st_done) {
 732         crm_debug("Action '%s' targeting %s for client %s already completed "
 733                   CRM_XS " id=%.8s",
 734                   op->action, op->target, op->client_name, op->id);
 735     } else {
 736         finalize_timed_out_op(userdata, "Fencing did not complete within a "
 737                                         "total timeout based on the "
 738                                         "configured timeout and retries for "
 739                                         "any devices attempted");
 740     }
 741     return G_SOURCE_REMOVE;
 742 }
 743 
 744 static gboolean
 745 remote_op_query_timeout(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 746 {
 747     remote_fencing_op_t *op = data;
 748 
 749     op->query_timer = 0;
 750 
 751     if (op->state == st_done) {
 752         crm_debug("Operation %.8s targeting %s already completed",
 753                   op->id, op->target);
 754     } else if (op->state == st_exec) {
 755         crm_debug("Operation %.8s targeting %s already in progress",
 756                   op->id, op->target);
 757     } else if (op->query_results) {
 758         // Query succeeded, so attempt the actual fencing
 759         crm_debug("Query %.8s targeting %s complete (state=%s)",
 760                   op->id, op->target, stonith_op_state_str(op->state));
 761         request_peer_fencing(op, NULL);
 762     } else {
 763         crm_debug("Query %.8s targeting %s timed out (state=%s)",
 764                   op->id, op->target, stonith_op_state_str(op->state));
 765         finalize_timed_out_op(op, "No capable peers replied to device query "
 766                                   "within timeout");
 767     }
 768 
 769     return G_SOURCE_REMOVE;
 770 }
 771 
 772 static gboolean
 773 topology_is_empty(stonith_topology_t *tp)
     /* [previous][next][first][last][top][bottom][index][help] */
 774 {
 775     int i;
 776 
 777     if (tp == NULL) {
 778         return TRUE;
 779     }
 780 
 781     for (i = 0; i < ST_LEVEL_MAX; i++) {
 782         if (tp->levels[i] != NULL) {
 783             return FALSE;
 784         }
 785     }
 786     return TRUE;
 787 }
 788 
 789 /*!
 790  * \internal
 791  * \brief Add a device to an operation's automatic unfencing list
 792  *
 793  * \param[in,out] op      Operation to modify
 794  * \param[in]     device  Device ID to add
 795  */
 796 static void
 797 add_required_device(remote_fencing_op_t *op, const char *device)
     /* [previous][next][first][last][top][bottom][index][help] */
 798 {
 799     GList *match  = g_list_find_custom(op->automatic_list, device,
 800                                          sort_strings);
 801 
 802     if (!match) {
 803         op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
 804     }
 805 }
 806 
 807 /*!
 808  * \internal
 809  * \brief Remove a device from the automatic unfencing list
 810  *
 811  * \param[in,out] op      Operation to modify
 812  * \param[in]     device  Device ID to remove
 813  */
 814 static void
 815 remove_required_device(remote_fencing_op_t *op, const char *device)
     /* [previous][next][first][last][top][bottom][index][help] */
 816 {
 817     GList *match = g_list_find_custom(op->automatic_list, device,
 818                                         sort_strings);
 819 
 820     if (match) {
 821         op->automatic_list = g_list_remove(op->automatic_list, match->data);
 822     }
 823 }
 824 
 825 /* deep copy the device list */
 826 static void
 827 set_op_device_list(remote_fencing_op_t * op, GList *devices)
     /* [previous][next][first][last][top][bottom][index][help] */
 828 {
 829     GList *lpc = NULL;
 830 
 831     if (op->devices_list) {
 832         g_list_free_full(op->devices_list, free);
 833         op->devices_list = NULL;
 834     }
 835     for (lpc = devices; lpc != NULL; lpc = lpc->next) {
 836         op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
 837     }
 838     op->devices = op->devices_list;
 839 }
 840 
 841 /*!
 842  * \internal
 843  * \brief Check whether a node matches a topology target
 844  *
 845  * \param[in] tp    Topology table entry to check
 846  * \param[in] node  Name of node to check
 847  *
 848  * \return TRUE if node matches topology target
 849  */
 850 static gboolean
 851 topology_matches(const stonith_topology_t *tp, const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 852 {
 853     regex_t r_patt;
 854 
 855     CRM_CHECK(node && tp && tp->target, return FALSE);
 856     switch (tp->kind) {
 857         case fenced_target_by_attribute:
 858             /* This level targets by attribute, so tp->target is a NAME=VALUE pair
 859              * of a permanent attribute applied to targeted nodes. The test below
 860              * relies on the locally cached copy of the CIB, so if fencing needs to
 861              * be done before the initial CIB is received or after a malformed CIB
 862              * is received, then the topology will be unable to be used.
 863              */
 864             if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
 865                 crm_notice("Matched %s with %s by attribute", node, tp->target);
 866                 return TRUE;
 867             }
 868             break;
 869 
 870         case fenced_target_by_pattern:
 871             /* This level targets node names matching a pattern, so tp->target
 872              * (and tp->target_pattern) is a regular expression.
 873              */
 874             if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
 875                 crm_info("Bad regex '%s' for fencing level", tp->target);
 876             } else {
 877                 int status = regexec(&r_patt, node, 0, NULL, 0);
 878 
 879                 regfree(&r_patt);
 880                 if (status == 0) {
 881                     crm_notice("Matched %s with %s by name", node, tp->target);
 882                     return TRUE;
 883                 }
 884             }
 885             break;
 886 
 887         case fenced_target_by_name:
 888             crm_trace("Testing %s against %s", node, tp->target);
 889             return pcmk__str_eq(tp->target, node, pcmk__str_casei);
 890 
 891         default:
 892             break;
 893     }
 894     crm_trace("No match for %s with %s", node, tp->target);
 895     return FALSE;
 896 }
 897 
 898 stonith_topology_t *
 899 find_topology_for_host(const char *host) 
     /* [previous][next][first][last][top][bottom][index][help] */
 900 {
 901     GHashTableIter tIter;
 902     stonith_topology_t *tp = g_hash_table_lookup(topology, host);
 903 
 904     if(tp != NULL) {
 905         crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
 906         return tp;
 907     }
 908 
 909     g_hash_table_iter_init(&tIter, topology);
 910     while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
 911         if (topology_matches(tp, host)) {
 912             crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
 913             return tp;
 914         }
 915     }
 916 
 917     crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
 918     return NULL;
 919 }
 920 
 921 /*!
 922  * \internal
 923  * \brief Set fencing operation's device list to target's next topology level
 924  *
 925  * \param[in,out] op        Remote fencing operation to modify
 926  * \param[in]     empty_ok  If true, an operation without a target (i.e.
 927  *                          queries) or a target without a topology will get a
 928  *                          pcmk_rc_ok return value instead of ENODEV
 929  *
 930  * \return Standard Pacemaker return value
 931  */
 932 static int
 933 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
     /* [previous][next][first][last][top][bottom][index][help] */
 934 {
 935     stonith_topology_t *tp = NULL;
 936 
 937     if (op->target) {
 938         tp = find_topology_for_host(op->target);
 939     }
 940     if (topology_is_empty(tp)) {
 941         return empty_ok? pcmk_rc_ok : ENODEV;
 942     }
 943 
 944     CRM_ASSERT(tp->levels != NULL);
 945 
 946     stonith__set_call_options(op->call_options, op->id, st_opt_topology);
 947 
 948     /* This is a new level, so undo any remapping left over from previous */
 949     undo_op_remap(op);
 950 
 951     do {
 952         op->level++;
 953 
 954     } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
 955 
 956     if (op->level < ST_LEVEL_MAX) {
 957         crm_trace("Attempting fencing level %d targeting %s (%d devices) "
 958                   "for client %s@%s (id=%.8s)",
 959                   op->level, op->target, g_list_length(tp->levels[op->level]),
 960                   op->client_name, op->originator, op->id);
 961         set_op_device_list(op, tp->levels[op->level]);
 962 
 963         // The requested delay has been applied for the first fencing level
 964         if ((op->level > 1) && (op->client_delay > 0)) {
 965             op->client_delay = 0;
 966         }
 967 
 968         if ((g_list_next(op->devices_list) != NULL)
 969             && pcmk__str_eq(op->action, PCMK_ACTION_REBOOT, pcmk__str_none)) {
 970             /* A reboot has been requested for a topology level with multiple
 971              * devices. Instead of rebooting the devices sequentially, we will
 972              * turn them all off, then turn them all on again. (Think about
 973              * switched power outlets for redundant power supplies.)
 974              */
 975             op_phase_off(op);
 976         }
 977         return pcmk_rc_ok;
 978     }
 979 
 980     crm_info("All %sfencing options targeting %s for client %s@%s failed "
 981              CRM_XS " id=%.8s",
 982              (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
 983              op->target, op->client_name, op->originator, op->id);
 984     return ENODEV;
 985 }
 986 
 987 /*!
 988  * \internal
 989  * \brief If fencing operation is a duplicate, merge it into the other one
 990  *
      * Each operation tracked in stonith_remote_op_list is compared with \p op.
      * Another operation absorbs \p op only if it has not progressed past
      * st_exec, has the same target (case-insensitively) and requested action,
      * comes from a different client name, is not a node fencing itself, has an
      * active originator, and has not outlived its total timeout. On a match,
      * \p op is appended to the other operation's duplicates list and the state
      * of \p op becomes st_duplicate.
      *
      * \note As a side effect, a candidate whose originator is no longer active
      *       is marked st_failed.
      *
 991  * \param[in,out] op  Fencing operation to check
 992  */
 993 static void
 994 merge_duplicates(remote_fencing_op_t *op)
 995 {
 996     GHashTableIter iter;
 997     remote_fencing_op_t *other = NULL;
 998 
 999     time_t now = time(NULL);
1000 
1001     g_hash_table_iter_init(&iter, stonith_remote_op_list);
1002     while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
1003         const char *other_action = op_requested_action(other);
1004 
1005         if (!strcmp(op->id, other->id)) {
1006             continue; // Don't compare against self
1007         }
1008         if (other->state > st_exec) {
1009             crm_trace("%.8s not duplicate of %.8s: not in progress",
1010                       op->id, other->id);
1011             continue;
1012         }
1013         if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
1014             crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
1015                       op->id, other->id, op->target, other->target);
1016             continue;
1017         }
1018         if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
1019             crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
1020                       op->id, other->id, op->action, other_action);
1021             continue;
1022         }
              // Requests from the same client name are never merged with each other
1023         if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
1024             crm_trace("%.8s not duplicate of %.8s: same client %s",
1025                       op->id, other->id, op->client_name);
1026             continue;
1027         }
              // Skip operations in which the originator is fencing itself
1028         if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
1029             crm_trace("%.8s not duplicate of %.8s: suicide for %s",
1030                       op->id, other->id, other->target);
1031             continue;
1032         }
              // An operation whose originator is gone is failed rather than joined
1033         if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
1034             crm_notice("Failing action '%s' targeting %s originating from "
1035                        "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
1036                        other->action, other->target, other->client_name,
1037                        other->originator, other->id);
1038             crm_trace("%.8s not duplicate of %.8s: originator dead",
1039                       op->id, other->id);
1040             other->state = st_failed;
1041             continue;
1042         }
              /* NOTE(review): time_t values are printed with %ld below; confirm
               * that time_t is long on every supported platform
               */
1043         if ((other->total_timeout > 0)
1044             && (now > (other->total_timeout + other->created))) {
1045             crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
1046                       op->id, other->id, now, other->created,
1047                       other->total_timeout);
1048             continue;
1049         }
1050 
1051         /* There is another in-flight request to fence the same host
1052          * Piggyback on that instead.  If it fails, so do we.
1053          */
1054         other->duplicates = g_list_append(other->duplicates, op);
              /* The other operation has no total timeout recorded yet, so estimate
               * one from this operation and store it in both
               */
1055         if (other->total_timeout == 0) {
1056             other->total_timeout = op->total_timeout =
1057                 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
1058             crm_trace("Best guess as to timeout used for %.8s: %d",
1059                       other->id, other->total_timeout);
1060         }
1061         crm_notice("Merging fencing action '%s' targeting %s originating from "
1062                    "client %s with identical request from %s@%s "
1063                    CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
1064                    op->action, op->target, op->client_name,
1065                    other->client_name, other->originator,
1066                    op->id, other->id, other->total_timeout);
1067         report_timeout_period(op, other->total_timeout);
1068         op->state = st_duplicate;
              /* NOTE(review): the loop keeps scanning after a merge, so op could
               * be appended to the duplicates list of more than one operation --
               * confirm this is intended
               */
1069     }
1070 }
1071 
1072 static uint32_t fencing_active_peers(void)
     /* [previous][next][first][last][top][bottom][index][help] */
1073 {
1074     uint32_t count = 0;
1075     crm_node_t *entry;
1076     GHashTableIter gIter;
1077 
1078     g_hash_table_iter_init(&gIter, crm_peer_cache);
1079     while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1080         if(fencing_peer_active(entry)) {
1081             count++;
1082         }
1083     }
1084     return count;
1085 }
1086 
1087 /*!
1088  * \internal
1089  * \brief Process a manual confirmation of a pending fence action
1090  *
1091  * \param[in]     client  IPC client that sent confirmation
1092  * \param[in,out] msg     Request XML with manual confirmation
1093  *
1094  * \return Standard Pacemaker return code
1095  */
1096 int
1097 fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
1098 {
1099     remote_fencing_op_t *op = NULL;
1100     xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
1101 
1102     CRM_CHECK(dev != NULL, return EPROTO);
1103 
1104     crm_notice("Received manual confirmation that %s has been fenced",
1105                pcmk__s(crm_element_value(dev, F_STONITH_TARGET),
1106                        "unknown target"));
1107     op = initiate_remote_stonith_op(client, msg, TRUE);
1108     if (op == NULL) {
1109         return EPROTO;
1110     }
1111     op->state = st_done;
1112     set_fencing_completed(op);
1113     op->delegate = strdup("a human");
1114 
1115     // For the fencer's purposes, the fencing operation is done
1116     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1117     finalize_op(op, msg, false);
1118 
1119     /* For the requester's purposes, the operation is still pending. The
1120      * actual result will be sent asynchronously via the operation's done_cb().
1121      */
1122     return EINPROGRESS;
1123 }
1124 
1125 /*!
1126  * \internal
1127  * \brief Create a new remote stonith operation
1128  *
      * The new operation is registered in stonith_remote_op_list under its ID,
      * filled in from the request, checked against in-flight operations with
      * merge_duplicates(), and the fence history is trimmed before returning.
      *
1129  * \param[in] client   ID of local stonith client that initiated the operation
1130  * \param[in] request  The request from the client that started the operation
1131  * \param[in] peer     TRUE if this operation is owned by another stonith peer
1132  *                     (an operation owned by one peer is stored on all peers,
1133  *                     but only the owner executes it; all nodes get the results
1134  *                     once the owner finishes execution)
      *
      * \return Operation already recorded under the request's remote operation ID
      *         (possible only when \p peer is TRUE), otherwise the newly created
      *         operation; NULL if a peer request has a target but no operation ID
1135  */
1136 void *
1137 create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
1138 {
1139     remote_fencing_op_t *op = NULL;
1140     xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
1141     int call_options = 0;
1142     const char *operation = NULL;
1143 
1144     init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1145 
1146     /* If this operation is owned by another node, check to make
1147      * sure we haven't already created this operation. */
1148     if (peer && dev) {
1149         const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1150 
1151         CRM_CHECK(op_id != NULL, return NULL);
1152 
1153         op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1154         if (op) {
1155             crm_debug("Reusing existing remote fencing op %.8s for %s",
1156                       op_id, ((client == NULL)? "unknown client" : client));
1157             return op;
1158         }
1159     }
1160 
          // Nothing recorded yet for this request, so build a new operation
1161     op = calloc(1, sizeof(remote_fencing_op_t));
1162     CRM_ASSERT(op != NULL);
1163 
1164     crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
1165     // Value -1 means disable any static/random fencing delays
1166     crm_element_value_int(request, F_STONITH_DELAY, &(op->client_delay));
1167 
          // A peer-owned operation keeps the owner's ID; otherwise make a new one
1168     if (peer && dev) {
1169         op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
1170     } else {
1171         op->id = crm_generate_uuid();
1172     }
1173 
          // The table key is the operation's own ID string
1174     g_hash_table_replace(stonith_remote_op_list, op->id, op);
1175 
1176     op->state = st_query;
          // One query reply is expected from each currently active peer
1177     op->replies_expected = fencing_active_peers();
1178     op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
1179     op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
1180     op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE); /* May not be set */
1181     op->created = time(NULL);
1182 
1183     if (op->originator == NULL) {
1184         /* Local or relayed request */
1185         op->originator = strdup(stonith_our_uname);
1186     }
1187 
          // A missing client ID is logged as an assertion failure but tolerated
1188     CRM_LOG_ASSERT(client != NULL);
1189     if (client) {
1190         op->client_id = strdup(client);
1191     }
1192 
1193 
1194     /* For a RELAY operation, set fenced on the client. */
1195     operation = crm_element_value(request, F_STONITH_OPERATION);
1196 
1197     if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1198         op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1199                                          (unsigned long) getpid());
1200     } else {
1201         op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
1202     }
1203 
1204     op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
1205     op->request = copy_xml(request);    /* TODO: Figure out how to avoid this */
1206     crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
1207     op->call_options = call_options;
1208 
1209     crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
1210 
1211     crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1212               "base timeout %d, %u %s expected)",
1213               (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1214               op->target, op->client_name, op->base_timeout,
1215               op->replies_expected,
1216               pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1217 
          /* With st_opt_cs_nodeid set, the target was given as a numeric node ID;
           * replace it with the node's name if the node is known
           */
1218     if (op->call_options & st_opt_cs_nodeid) {
1219         int nodeid;
1220         crm_node_t *node;
1221 
1222         pcmk__scan_min_int(op->target, &nodeid, 0);
1223         node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);
1224 
1225         /* Ensure the conversion only happens once */
1226         stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1227 
1228         if (node && node->uname) {
1229             free(op->target);
1230             op->target = strdup(node->uname);
1231 
1232         } else {
1233             crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1234         }
1235     }
1236 
1237     /* check to see if this is a duplicate operation of another in-flight operation */
1238     merge_duplicates(op);
1239 
1240     if (op->state != st_duplicate) {
1241         /* kick history readers */
1242         fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
1243     }
1244 
1245     /* safe to trim as long as that doesn't touch pending ops */
1246     stonith_fence_history_trim();
1247 
1248     return op;
1249 }
1250 
1251 /*!
1252  * \internal
1253  * \brief Create a peer fencing operation from a request, and initiate it
1254  *
      * Unless this is a manual confirmation, the operation is moved to its first
      * topology level (if the target has a topology), a device query is sent as
      * a cluster message, and a query timer is started.
      *
1255  * \param[in] client     IPC client that made request (NULL to get from request)
1256  * \param[in] request    Request XML
1257  * \param[in] manual_ack Whether this is a manual action confirmation
1258  *
1259  * \return Newly created operation on success, otherwise NULL
      *
      * \note For a manual confirmation, the operation is returned right after
      *       creation; no query is sent and no timer is started.
      * \note An operation that failed at topology setup is still returned, after
      *       having been passed to finalize_op().
1260  */
1261 remote_fencing_op_t *
1262 initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
1263                            gboolean manual_ack)
1264 {
1265     int query_timeout = 0;
1266     xmlNode *query = NULL;
1267     const char *client_id = NULL;
1268     remote_fencing_op_t *op = NULL;
1269     const char *relay_op_id = NULL;
1270     const char *operation = NULL;
1271 
1272     if (client) {
1273         client_id = client->id;
1274     } else {
1275         client_id = crm_element_value(request, F_STONITH_CLIENTID);
1276     }
1277 
1278     CRM_LOG_ASSERT(client_id != NULL);
          // peer=FALSE: the operation is created here rather than recorded for a peer
1279     op = create_remote_stonith_op(client_id, request, FALSE);
1280     op->owner = TRUE;
1281     if (manual_ack) {
1282         return op;
1283     }
1284 
          /* NOTE(review): when this check fails, NULL is returned but the operation
           * stays registered in stonith_remote_op_list (it was added by
           * create_remote_stonith_op()) -- confirm it is cleaned up elsewhere
           */
1285     CRM_CHECK(op->action, return NULL);
1286 
          // empty_ok=true: a target without any topology is not an error here
1287     if (advance_topology_level(op, true) != pcmk_rc_ok) {
1288         op->state = st_failed;
1289     }
1290 
1291     switch (op->state) {
1292         case st_failed:
1293             // advance_topology_level() exhausted levels
1294             pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1295                              "All topology levels failed");
1296             crm_warn("Could not request peer fencing (%s) targeting %s "
1297                      CRM_XS " id=%.8s", op->action, op->target, op->id);
1298             finalize_op(op, NULL, false);
1299             return op;
1300 
1301         case st_duplicate:
                  // Merged into another in-flight operation, so no query of its own
1302             crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1303                      CRM_XS " id=%.8s", op->action, op->target, op->id);
1304             return op;
1305 
1306         default:
1307             crm_notice("Requesting peer fencing (%s) targeting %s "
1308                        CRM_XS " id=%.8s state=%s base_timeout=%d",
1309                        op->action, op->target, op->id,
1310                        stonith_op_state_str(op->state), op->base_timeout);
1311     }
1312 
          // Build the device query that describes this operation
1313     query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1314                               NULL, op->call_options);
1315 
1316     crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
1317     crm_xml_add(query, F_STONITH_TARGET, op->target);
1318     crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
1319     crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
1320     crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
1321     crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
1322     crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
1323 
1324     /* In case of RELAY operation, RELAY information is added to the query to delete the original operation of RELAY. */
1325     operation = crm_element_value(request, F_STONITH_OPERATION);
1326     if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1327         relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
1328         if (relay_op_id) {
1329             crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
1330         }
1331     }
1332 
          // NOTE(review): a NULL node presumably means "send to all peers" -- confirm
1333     send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
1334     free_xml(query);
1335 
          /* The query timer runs for the base timeout scaled by
           * TIMEOUT_MULTIPLY_FACTOR; the factor of 1000 converts to the
           * milliseconds that g_timeout_add() takes
           */
1336     query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1337     op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1338 
1339     return op;
1340 }
1341 
     /*!
      * \brief Bit flags that restrict which peers find_best_peer() will consider
      *
      * The values are distinct bits, so flags may be combined with bitwise OR.
      */
1342 enum find_best_peer_options {
1343     /*! Skip checking the target peer for capable fencing devices */
1344     FIND_PEER_SKIP_TARGET = 0x0001,
1345     /*! Only check the target peer for capable fencing devices */
1346     FIND_PEER_TARGET_ONLY = 0x0002,
1347     /*! Skip peers and devices that are not verified */
1348     FIND_PEER_VERIFIED_ONLY = 0x0004,
1349 };
1350 
1351 static peer_device_info_t *
1352 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
     /* [previous][next][first][last][top][bottom][index][help] */
1353 {
1354     GList *iter = NULL;
1355     gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1356 
1357     if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1358         return NULL;
1359     }
1360 
1361     for (iter = op->query_results; iter != NULL; iter = iter->next) {
1362         peer_device_info_t *peer = iter->data;
1363 
1364         crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1365                   peer->host, op->target, peer->ndevices,
1366                   pcmk__plural_s(peer->ndevices), peer->tried, options);
1367         if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1368             continue;
1369         }
1370         if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1371             continue;
1372         }
1373 
1374         if (pcmk_is_set(op->call_options, st_opt_topology)) {
1375 
1376             if (grab_peer_device(op, peer, device, verified_devices_only)) {
1377                 return peer;
1378             }
1379 
1380         } else if (!peer->tried
1381                    && count_peer_devices(op, peer, verified_devices_only,
1382                                          fenced_support_flag(op->action))) {
1383             /* No topology: Use the current best peer */
1384             crm_trace("Simple fencing");
1385             return peer;
1386         }
1387     }
1388 
1389     return NULL;
1390 }
1391 
     /*!
      * \internal
      * \brief Choose a peer to execute a fencing operation's current device
      *
      * Candidates are tried in this order: a peer other than the target with
      * verified access, a peer other than the target without verified access,
      * and finally the target itself (never in the "on" phase). If none is
      * found and the operation uses topology, the next topology level is tried
      * in the same way (also never in the "on" phase).
      *
      * \param[in,out] op  Fencing operation to choose a peer for
      *
      * \return Chosen peer, or NULL if no peer is available or if it is worth
      *         waiting for more query replies before using unverified devices
      */
1392 static peer_device_info_t *
1393 stonith_choose_peer(remote_fencing_op_t * op)
1394 {
1395     const char *device = NULL;
1396     peer_device_info_t *peer = NULL;
1397     uint32_t active = fencing_active_peers();
1398 
1399     do {
              // If the operation has no current device, device keeps its prior value
1400         if (op->devices) {
1401             device = op->devices->data;
1402             crm_trace("Checking for someone to fence (%s) %s using %s",
1403                       op->action, op->target, device);
1404         } else {
1405             crm_trace("Checking for someone to fence (%s) %s",
1406                       op->action, op->target);
1407         }
1408 
1409         /* Best choice is a peer other than the target with verified access */
1410         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1411         if (peer) {
1412             crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1413             return peer;
1414         }
1415 
              /* While the query timer is still running and fewer replies have
               * arrived than the smaller of the expected and currently active
               * peer counts, a verified peer may yet reply
               */
1416         if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1417             crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1418             return NULL;
1419         }
1420 
1421         /* If no other peer has verified access, next best is unverified access */
1422         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1423         if (peer) {
1424             crm_trace("Found best unverified peer %s", peer->host);
1425             return peer;
1426         }
1427 
1428         /* If no other peer can do it, last option is self-fencing
1429          * (which is never allowed for the "on" phase of a remapped reboot)
1430          */
1431         if (op->phase != st_phase_on) {
1432             peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1433             if (peer) {
1434                 crm_trace("%s will fence itself", peer->host);
1435                 return peer;
1436             }
1437         }
1438 
1439         /* Try the next fencing level if there is one (unless we're in the "on"
1440          * phase of a remapped "reboot", because we ignore errors in that case)
1441          */
1442     } while ((op->phase != st_phase_on)
1443              && pcmk_is_set(op->call_options, st_opt_topology)
1444              && (advance_topology_level(op, false) == pcmk_rc_ok));
1445 
          /* No peer was found. A watchdog-fencing target that could not be reached
           * gets a quieter log message than the general case.
           */
1446     if ((stonith_watchdog_timeout_ms > 0)
1447         && pcmk__is_fencing_action(op->action)
1448         && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
1449         && node_does_watchdog_fencing(op->target)) {
1450         crm_info("Couldn't contact watchdog-fencing target-node (%s)",
1451                  op->target);
1452         /* check_watchdog_fencing_and_wait will log additional info */
1453     } else {
1454         crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1455                    op->action, op->target, (device? device : "any device"));
1456     }
1457     return NULL;
1458 }
1459 
1460 static int
1461 get_device_timeout(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
1462                    const peer_device_info_t *peer, const char *device,
1463                    bool with_delay)
1464 {
1465     device_properties_t *props;
1466     int delay = 0;
1467 
1468     if (!peer || !device) {
1469         return op->base_timeout;
1470     }
1471 
1472     props = g_hash_table_lookup(peer->devices, device);
1473     if (!props) {
1474         return op->base_timeout;
1475     }
1476 
1477     // op->client_delay < 0 means disable any static/random fencing delays
1478     if (with_delay && (op->client_delay >= 0)) {
1479         // delay_base is eventually limited by delay_max
1480         delay = (props->delay_max[op->phase] > 0 ?
1481                  props->delay_max[op->phase] : props->delay_base[op->phase]);
1482     }
1483 
1484     return (props->custom_action_timeout[op->phase]?
1485             props->custom_action_timeout[op->phase] : op->base_timeout)
1486            + delay;
1487 }
1488 
     /*!
      * \brief Accumulator handed to add_device_timeout() while iterating over a
      *        peer's device table
      */
1489 struct timeout_data {
          //! Operation whose current phase selects the per-device values
1490     const remote_fencing_op_t *op;
          //! Peer whose devices are being totalled
1491     const peer_device_info_t *peer;
          //! Running sum of the timeouts of the devices visited so far
1492     int total_timeout;
1493 };
1494 
1495 /*!
1496  * \internal
1497  * \brief Add timeout to a total if device has not been executed yet
1498  *
1499  * \param[in]     key        GHashTable key (device ID)
1500  * \param[in]     value      GHashTable value (device properties)
1501  * \param[in,out] user_data  Timeout data
1502  */
1503 static void
1504 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1505 {
1506     const char *device_id = key;
1507     device_properties_t *props = value;
1508     struct timeout_data *timeout = user_data;
1509 
1510     if (!props->executed[timeout->op->phase]
1511         && !props->disallowed[timeout->op->phase]) {
1512         timeout->total_timeout += get_device_timeout(timeout->op, timeout->peer,
1513                                                      device_id, true);
1514     }
1515 }
1516 
1517 static int
1518 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1519 {
1520     struct timeout_data timeout;
1521 
1522     timeout.op = op;
1523     timeout.peer = peer;
1524     timeout.total_timeout = 0;
1525 
1526     g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1527 
1528     return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1529 }
1530 
1531 static int
1532 get_op_total_timeout(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
1533                      const peer_device_info_t *chosen_peer)
1534 {
1535     int total_timeout = 0;
1536     stonith_topology_t *tp = find_topology_for_host(op->target);
1537 
1538     if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1539         int i;
1540         GList *device_list = NULL;
1541         GList *iter = NULL;
1542         GList *auto_list = NULL;
1543 
1544         if (pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)
1545             && (op->automatic_list != NULL)) {
1546             auto_list = g_list_copy(op->automatic_list);
1547         }
1548 
1549         /* Yep, this looks scary, nested loops all over the place.
1550          * Here is what is going on.
1551          * Loop1: Iterate through fencing levels.
1552          * Loop2: If a fencing level has devices, loop through each device
1553          * Loop3: For each device in a fencing level, see what peer owns it
1554          *        and what that peer has reported the timeout is for the device.
1555          */
1556         for (i = 0; i < ST_LEVEL_MAX; i++) {
1557             if (!tp->levels[i]) {
1558                 continue;
1559             }
1560             for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1561                 /* in case of watchdog-device we add the timeout to the budget
1562                    regardless of if we got a reply or not
1563                  */
1564                 if ((stonith_watchdog_timeout_ms > 0)
1565                     && pcmk__is_fencing_action(op->action)
1566                     && pcmk__str_eq(device_list->data, STONITH_WATCHDOG_ID,
1567                                     pcmk__str_none)
1568                     && node_does_watchdog_fencing(op->target)) {
1569                     total_timeout += stonith_watchdog_timeout_ms / 1000;
1570                     continue;
1571                 }
1572 
1573                 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1574                     const peer_device_info_t *peer = iter->data;
1575 
1576                     if (auto_list) {
1577                         GList *match = g_list_find_custom(auto_list, device_list->data,
1578                                         sort_strings);
1579                         if (match) {
1580                             auto_list = g_list_remove(auto_list, match->data);
1581                         }
1582                     }
1583 
1584                     if (find_peer_device(op, peer, device_list->data,
1585                                          fenced_support_flag(op->action))) {
1586                         total_timeout += get_device_timeout(op, peer,
1587                                                             device_list->data,
1588                                                             true);
1589                         break;
1590                     }
1591                 }               /* End Loop3: match device with peer that owns device, find device's timeout period */
1592             }                   /* End Loop2: iterate through devices at a specific level */
1593         }                       /*End Loop1: iterate through fencing levels */
1594 
1595         //Add only exists automatic_list device timeout
1596         if (auto_list) {
1597             for (iter = auto_list; iter != NULL; iter = iter->next) {
1598                 GList *iter2 = NULL;
1599 
1600                 for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) {
1601                     peer_device_info_t *peer = iter2->data;
1602                     if (find_peer_device(op, peer, iter->data, st_device_supports_on)) {
1603                         total_timeout += get_device_timeout(op, peer,
1604                                                             iter->data, true);
1605                         break;
1606                     }
1607                 }
1608             }
1609         }
1610 
1611         g_list_free(auto_list);
1612 
1613     } else if (chosen_peer) {
1614         total_timeout = get_peer_timeout(op, chosen_peer);
1615     } else {
1616         total_timeout = op->base_timeout;
1617     }
1618 
1619     /* Take any requested fencing delay into account to prevent it from eating
1620      * up the total timeout.
1621      */
1622     return ((total_timeout ? total_timeout : op->base_timeout)
1623             + ((op->client_delay > 0)? op->client_delay : 0));
1624 }
1625 
1626 static void
1627 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1628 {
1629     GList *iter = NULL;
1630     xmlNode *update = NULL;
1631     const char *client_node = NULL;
1632     const char *client_id = NULL;
1633     const char *call_id = NULL;
1634 
1635     if (op->call_options & st_opt_sync_call) {
1636         /* There is no reason to report the timeout for a synchronous call. It
1637          * is impossible to use the reported timeout to do anything when the client
1638          * is blocking for the response.  This update is only important for
1639          * async calls that require a callback to report the results in. */
1640         return;
1641     } else if (!op->request) {
1642         return;
1643     }
1644 
1645     crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1646     client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1647     call_id = crm_element_value(op->request, F_STONITH_CALLID);
1648     client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1649     if (!client_node || !call_id || !client_id) {
1650         return;
1651     }
1652 
1653     if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1654         // Client is connected to this node, so send update directly to them
1655         do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1656         return;
1657     }
1658 
1659     /* The client is connected to another node, relay this update to them */
1660     update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1661     crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1662     crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1663     crm_xml_add(update, F_STONITH_CALLID, call_id);
1664     crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1665 
1666     send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1667 
1668     free_xml(update);
1669 
1670     for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1671         remote_fencing_op_t *dup = iter->data;
1672 
1673         crm_trace("Reporting timeout for duplicate %.8s to client %s",
1674                   dup->id, dup->client_name);
1675         report_timeout_period(iter->data, op_timeout);
1676     }
1677 }
1678 
1679 /*!
1680  * \internal
1681  * \brief Advance an operation to the next device in its topology
1682  *
1683  * \param[in,out] op      Fencer operation to advance
1684  * \param[in]     device  ID of device that just completed
1685  * \param[in,out] msg     If not NULL, XML reply of last delegated operation
1686  */
1687 static void
1688 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
     /* [previous][next][first][last][top][bottom][index][help] */
1689                                  xmlNode *msg)
1690 {
1691     /* Advance to the next device at this topology level, if any */
1692     if (op->devices) {
1693         op->devices = op->devices->next;
1694     }
1695 
1696     /* Handle automatic unfencing if an "on" action was requested */
1697     if ((op->phase == st_phase_requested)
1698         && pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)) {
1699         /* If the device we just executed was required, it's not anymore */
1700         remove_required_device(op, device);
1701 
1702         /* If there are no more devices at this topology level, run through any
1703          * remaining devices with automatic unfencing
1704          */
1705         if (op->devices == NULL) {
1706             op->devices = op->automatic_list;
1707         }
1708     }
1709 
1710     if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1711         /* We're done with this level and with required devices, but we had
1712          * remapped "reboot" to "off", so start over with "on". If any devices
1713          * need to be turned back on, op->devices will be non-NULL after this.
1714          */
1715         op_phase_on(op);
1716     }
1717 
1718     // This function is only called if the previous device succeeded
1719     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1720 
1721     if (op->devices) {
1722         /* Necessary devices remain, so execute the next one */
1723         crm_trace("Next targeting %s on behalf of %s@%s",
1724                   op->target, op->client_name, op->originator);
1725 
1726         // The requested delay has been applied for the first device
1727         if (op->client_delay > 0) {
1728             op->client_delay = 0;
1729         }
1730 
1731         request_peer_fencing(op, NULL);
1732     } else {
1733         /* We're done with all devices and phases, so finalize operation */
1734         crm_trace("Marking complex fencing op targeting %s as complete",
1735                   op->target);
1736         op->state = st_done;
1737         finalize_op(op, msg, false);
1738     }
1739 }
1740 
1741 static gboolean
1742 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
1743 {
1744     if (node_does_watchdog_fencing(op->target)) {
1745 
1746         crm_notice("Waiting %lds for %s to self-fence (%s) for "
1747                    "client %s " CRM_XS " id=%.8s",
1748                    (stonith_watchdog_timeout_ms / 1000),
1749                    op->target, op->action, op->client_name, op->id);
1750 
1751         if (op->op_timer_one) {
1752             g_source_remove(op->op_timer_one);
1753         }
1754         op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
1755                                          remote_op_watchdog_done, op);
1756         return TRUE;
1757     } else {
1758         crm_debug("Skipping fallback to watchdog-fencing as %s is "
1759                  "not in host-list", op->target);
1760     }
1761     return FALSE;
1762 }
1763 
1764 /*!
1765  * \internal
1766  * \brief Ask a peer to execute a fencing operation
1767  *
1768  * \param[in,out] op      Fencing operation to be executed
1769  * \param[in,out] peer    If NULL or topology is in use, choose best peer to
1770  *                        execute the fencing, otherwise use this peer
1771  */
1772 static void
1773 request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1774 {
1775     const char *device = NULL;
1776     int timeout;
1777 
1778     CRM_CHECK(op != NULL, return);
1779 
1780     crm_trace("Action %.8s targeting %s for %s is %s",
1781               op->id, op->target, op->client_name,
1782               stonith_op_state_str(op->state));
1783 
1784     if ((op->phase == st_phase_on) && (op->devices != NULL)) {
1785         /* We are in the "on" phase of a remapped topology reboot. If this
1786          * device has pcmk_reboot_action="off", or doesn't support the "on"
1787          * action, skip it.
1788          *
1789          * We can't check device properties at this point because we haven't
1790          * chosen a peer for this stage yet. Instead, we check the local node's
1791          * knowledge about the device. If different versions of the fence agent
1792          * are installed on different nodes, there's a chance this could be
1793          * mistaken, but the worst that could happen is we don't try turning the
1794          * node back on when we should.
1795          */
1796         device = op->devices->data;
1797         if (pcmk__str_eq(fenced_device_reboot_action(device), PCMK_ACTION_OFF,
1798                          pcmk__str_none)) {
1799             crm_info("Not turning %s back on using %s because the device is "
1800                      "configured to stay off (pcmk_reboot_action='off')",
1801                      op->target, device);
1802             advance_topology_device_in_level(op, device, NULL);
1803             return;
1804         }
1805         if (!fenced_device_supports_on(device)) {
1806             crm_info("Not turning %s back on using %s because the agent "
1807                      "doesn't support 'on'", op->target, device);
1808             advance_topology_device_in_level(op, device, NULL);
1809             return;
1810         }
1811     }
1812 
1813     timeout = op->base_timeout;
1814     if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1815         peer = stonith_choose_peer(op);
1816     }
1817 
1818     if (!op->op_timer_total) {
1819         op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
1820         op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1821         report_timeout_period(op, op->total_timeout);
1822         crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
1823                  CRM_XS "id=%.8s",
1824                  op->total_timeout, op->target, op->client_name, op->id);
1825     }
1826 
1827     if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1828         /* Ignore the caller's peer preference if topology is in use, because
1829          * that peer might not have access to the required device. With
1830          * topology, stonith_choose_peer() removes the device from further
1831          * consideration, so the timeout must be calculated beforehand.
1832          *
1833          * @TODO Basing the total timeout on the caller's preferred peer (above)
1834          *       is less than ideal.
1835          */
1836         peer = stonith_choose_peer(op);
1837 
1838         device = op->devices->data;
1839         /* Fencing timeout sent to peer takes no delay into account.
1840          * The peer will add a dedicated timer for any delay upon
1841          * schedule_stonith_command().
1842          */
1843         timeout = get_device_timeout(op, peer, device, false);
1844     }
1845 
1846     if (peer) {
1847         int timeout_one = 0;
1848         xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1849 
1850         if (op->client_delay > 0) {
1851            /* Take requested fencing delay into account to prevent it from
1852             * eating up the timeout.
1853             */
1854             timeout_one = TIMEOUT_MULTIPLY_FACTOR * op->client_delay;
1855         }
1856 
1857         crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
1858         crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
1859         crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
1860         crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
1861         crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
1862         crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
1863         crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
1864         crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
1865         crm_xml_add_int(remote_op, F_STONITH_DELAY, op->client_delay);
1866 
1867         if (device) {
1868             timeout_one += TIMEOUT_MULTIPLY_FACTOR *
1869                            get_device_timeout(op, peer, device, true);
1870             crm_notice("Requesting that %s perform '%s' action targeting %s "
1871                        "using %s " CRM_XS " for client %s (%ds)",
1872                        peer->host, op->action, op->target, device,
1873                        op->client_name, timeout_one);
1874             crm_xml_add(remote_op, F_STONITH_DEVICE, device);
1875 
1876         } else {
1877             timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1878             crm_notice("Requesting that %s perform '%s' action targeting %s "
1879                        CRM_XS " for client %s (%ds, %lds)",
1880                        peer->host, op->action, op->target, op->client_name,
1881                        timeout_one, stonith_watchdog_timeout_ms);
1882         }
1883 
1884         op->state = st_exec;
1885         if (op->op_timer_one) {
1886             g_source_remove(op->op_timer_one);
1887             op->op_timer_one = 0;
1888         }
1889 
1890         if (!((stonith_watchdog_timeout_ms > 0)
1891               && (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
1892                   || (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
1893                       && pcmk__is_fencing_action(op->action)))
1894               && check_watchdog_fencing_and_wait(op))) {
1895 
1896             /* Some thoughts about self-fencing cases reaching this point:
1897                - Actually check in check_watchdog_fencing_and_wait
1898                  shouldn't fail if STONITH_WATCHDOG_ID is
1899                  chosen as fencing-device and it being present implies
1900                  watchdog-fencing is enabled anyway
1901                - If watchdog-fencing is disabled either in general or for
1902                  a specific target - detected in check_watchdog_fencing_and_wait -
1903                  for some other kind of self-fencing we can't expect
1904                  a success answer but timeout is fine if the node doesn't
1905                  come back in between
1906                - Delicate might be the case where we have watchdog-fencing
1907                  enabled for a node but the watchdog-fencing-device isn't
1908                  explicitly chosen for suicide. Local pe-execution in sbd
1909                  may detect the node as unclean and lead to timely suicide.
1910                  Otherwise the selection of stonith-watchdog-timeout at
1911                  least is questionable.
1912              */
1913 
1914             /* coming here we're not waiting for watchdog timeout -
1915                thus engage timer with timout evaluated before */
1916             op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1917         }
1918 
1919         send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
1920         peer->tried = TRUE;
1921         free_xml(remote_op);
1922         return;
1923 
1924     } else if (op->phase == st_phase_on) {
1925         /* A remapped "on" cannot be executed, but the node was already
1926          * turned off successfully, so ignore the error and continue.
1927          */
1928         crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
1929                  "after successful 'off'", device, op->target);
1930         advance_topology_device_in_level(op, device, NULL);
1931         return;
1932 
1933     } else if (op->owner == FALSE) {
1934         crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
1935                 op->action, op->target, op->client_name);
1936 
1937     } else if (op->query_timer == 0) {
1938         /* We've exhausted all available peers */
1939         crm_info("No remaining peers capable of fencing (%s) %s for client %s "
1940                  CRM_XS " state=%s", op->action, op->target, op->client_name,
1941                  stonith_op_state_str(op->state));
1942         CRM_CHECK(op->state < st_done, return);
1943         finalize_timed_out_op(op, "All nodes failed, or are unable, to "
1944                                   "fence target");
1945 
1946     } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
1947         /* if the operation never left the query state,
1948          * but we have all the expected replies, then no devices
1949          * are available to execute the fencing operation. */
1950 
1951         if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device,
1952            STONITH_WATCHDOG_ID, pcmk__str_null_matches)) {
1953             if (check_watchdog_fencing_and_wait(op)) {
1954                 return;
1955             }
1956         }
1957 
1958         if (op->state == st_query) {
1959             crm_info("No peers (out of %d) have devices capable of fencing "
1960                      "(%s) %s for client %s " CRM_XS " state=%s",
1961                      op->replies, op->action, op->target, op->client_name,
1962                      stonith_op_state_str(op->state));
1963 
1964             pcmk__reset_result(&op->result);
1965             pcmk__set_result(&op->result, CRM_EX_ERROR,
1966                              PCMK_EXEC_NO_FENCE_DEVICE, NULL);
1967         } else {
1968             if (pcmk_is_set(op->call_options, st_opt_topology)) {
1969                 pcmk__reset_result(&op->result);
1970                 pcmk__set_result(&op->result, CRM_EX_ERROR,
1971                                  PCMK_EXEC_NO_FENCE_DEVICE, NULL);
1972             }
1973             /* ... else use existing result from previous failed attempt
1974              * (topology is not in use, and no devices remain to be attempted).
1975              * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would
1976              * prevent finalize_op() from setting the correct delegate if
1977              * needed.
1978              */
1979 
1980             crm_info("No peers (out of %d) are capable of fencing (%s) %s "
1981                      "for client %s " CRM_XS " state=%s",
1982                      op->replies, op->action, op->target, op->client_name,
1983                      stonith_op_state_str(op->state));
1984         }
1985 
1986         op->state = st_failed;
1987         finalize_op(op, NULL, false);
1988 
1989     } else {
1990         crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
1991                  "for client %s " CRM_XS " id=%.8s",
1992                  op->action, op->target, (device? " using " : ""),
1993                  (device? device : ""), op->client_name, op->id);
1994     }
1995 }
1996 
1997 /*!
1998  * \internal
1999  * \brief Comparison function for sorting query results
2000  *
2001  * \param[in] a  GList item to compare
2002  * \param[in] b  GList item to compare
2003  *
2004  * \return Per the glib documentation, "a negative integer if the first value
2005  *         comes before the second, 0 if they are equal, or a positive integer
2006  *         if the first value comes after the second."
2007  */
2008 static gint
2009 sort_peers(gconstpointer a, gconstpointer b)
     /* [previous][next][first][last][top][bottom][index][help] */
2010 {
2011     const peer_device_info_t *peer_a = a;
2012     const peer_device_info_t *peer_b = b;
2013 
2014     return (peer_b->ndevices - peer_a->ndevices);
2015 }
2016 
2017 /*!
2018  * \internal
2019  * \brief Determine if all the devices in the topology are found or not
2020  *
2021  * \param[in] op  Fencing operation with topology to check
2022  */
2023 static gboolean
2024 all_topology_devices_found(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
2025 {
2026     GList *device = NULL;
2027     GList *iter = NULL;
2028     device_properties_t *match = NULL;
2029     stonith_topology_t *tp = NULL;
2030     gboolean skip_target = FALSE;
2031     int i;
2032 
2033     tp = find_topology_for_host(op->target);
2034     if (!tp) {
2035         return FALSE;
2036     }
2037     if (pcmk__is_fencing_action(op->action)) {
2038         /* Don't count the devices on the target node if we are killing
2039          * the target node. */
2040         skip_target = TRUE;
2041     }
2042 
2043     for (i = 0; i < ST_LEVEL_MAX; i++) {
2044         for (device = tp->levels[i]; device; device = device->next) {
2045             match = NULL;
2046             for (iter = op->query_results; iter && !match; iter = iter->next) {
2047                 peer_device_info_t *peer = iter->data;
2048 
2049                 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
2050                     continue;
2051                 }
2052                 match = find_peer_device(op, peer, device->data, st_device_supports_none);
2053             }
2054             if (!match) {
2055                 return FALSE;
2056             }
2057         }
2058     }
2059 
2060     return TRUE;
2061 }
2062 
2063 /*!
2064  * \internal
2065  * \brief Parse action-specific device properties from XML
2066  *
2067  * \param[in]     xml     XML element containing the properties
2068  * \param[in]     peer    Name of peer that sent XML (for logs)
2069  * \param[in]     device  Device ID (for logs)
2070  * \param[in]     action  Action the properties relate to (for logs)
2071  * \param[in,out] op      Fencing operation that properties are being parsed for
2072  * \param[in]     phase   Phase the properties relate to
2073  * \param[in,out] props   Device properties to update
2074  */
2075 static void
2076 parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
     /* [previous][next][first][last][top][bottom][index][help] */
2077                       const char *action, remote_fencing_op_t *op,
2078                       enum st_remap_phase phase, device_properties_t *props)
2079 {
2080     props->custom_action_timeout[phase] = 0;
2081     crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
2082                           &props->custom_action_timeout[phase]);
2083     if (props->custom_action_timeout[phase]) {
2084         crm_trace("Peer %s with device %s returned %s action timeout %d",
2085                   peer, device, action, props->custom_action_timeout[phase]);
2086     }
2087 
2088     props->delay_max[phase] = 0;
2089     crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
2090     if (props->delay_max[phase]) {
2091         crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
2092                   peer, device, props->delay_max[phase], action);
2093     }
2094 
2095     props->delay_base[phase] = 0;
2096     crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
2097     if (props->delay_base[phase]) {
2098         crm_trace("Peer %s with device %s returned base delay %d for %s",
2099                   peer, device, props->delay_base[phase], action);
2100     }
2101 
2102     /* Handle devices with automatic unfencing */
2103     if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
2104         int required = 0;
2105 
2106         crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
2107         if (required) {
2108             crm_trace("Peer %s requires device %s to execute for action %s",
2109                       peer, device, action);
2110             add_required_device(op, device);
2111         }
2112     }
2113 
2114     /* If a reboot is remapped to off+on, it's possible that a node is allowed
2115      * to perform one action but not another.
2116      */
2117     if (pcmk__xe_attr_is_true(xml, F_STONITH_ACTION_DISALLOWED)) {
2118         props->disallowed[phase] = TRUE;
2119         crm_trace("Peer %s is disallowed from executing %s for device %s",
2120                   peer, action, device);
2121     }
2122 }
2123 
2124 /*!
2125  * \internal
2126  * \brief Parse one device's properties from peer's XML query reply
2127  *
2128  * \param[in]     xml       XML node containing device properties
2129  * \param[in,out] op        Operation that query and reply relate to
2130  * \param[in,out] peer      Peer's device information
2131  * \param[in]     device    ID of device being parsed
2132  */
2133 static void
2134 add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
2135                       peer_device_info_t *peer, const char *device)
2136 {
2137     xmlNode *child;
2138     int verified = 0;
2139     device_properties_t *props = calloc(1, sizeof(device_properties_t));
2140     int flags = st_device_supports_on; /* Old nodes that don't set the flag assume they support the on action */
2141 
2142     /* Add a new entry to this peer's devices list */
2143     CRM_ASSERT(props != NULL);
2144     g_hash_table_insert(peer->devices, strdup(device), props);
2145 
2146     /* Peers with verified (monitored) access will be preferred */
2147     crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
2148     if (verified) {
2149         crm_trace("Peer %s has confirmed a verified device %s",
2150                   peer->host, device);
2151         props->verified = TRUE;
2152     }
2153 
2154     crm_element_value_int(xml, F_STONITH_DEVICE_SUPPORT_FLAGS, &flags);
2155     props->device_support_flags = flags;
2156 
2157     /* Parse action-specific device properties */
2158     parse_action_specific(xml, peer->host, device, op_requested_action(op),
2159                           op, st_phase_requested, props);
2160     for (child = pcmk__xml_first_child(xml); child != NULL;
2161          child = pcmk__xml_next(child)) {
2162         /* Replies for "reboot" operations will include the action-specific
2163          * values for "off" and "on" in child elements, just in case the reboot
2164          * winds up getting remapped.
2165          */
2166         if (pcmk__str_eq(ID(child), PCMK_ACTION_OFF, pcmk__str_none)) {
2167             parse_action_specific(child, peer->host, device, PCMK_ACTION_OFF,
2168                                   op, st_phase_off, props);
2169         } else if (pcmk__str_eq(ID(child), PCMK_ACTION_ON, pcmk__str_none)) {
2170             parse_action_specific(child, peer->host, device, PCMK_ACTION_ON,
2171                                   op, st_phase_on, props);
2172         }
2173     }
2174 }
2175 
2176 /*!
2177  * \internal
2178  * \brief Parse a peer's XML query reply and add it to operation's results
2179  *
2180  * \param[in,out] op        Operation that query and reply relate to
2181  * \param[in]     host      Name of peer that sent this reply
2182  * \param[in]     ndevices  Number of devices expected in reply
2183  * \param[in]     xml       XML node containing device list
2184  *
2185  * \return Newly allocated result structure with parsed reply
2186  */
2187 static peer_device_info_t *
2188 add_result(remote_fencing_op_t *op, const char *host, int ndevices,
     /* [previous][next][first][last][top][bottom][index][help] */
2189            const xmlNode *xml)
2190 {
2191     peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t));
2192     xmlNode *child;
2193 
2194     // cppcheck seems not to understand the abort logic in CRM_CHECK
2195     // cppcheck-suppress memleak
2196     CRM_CHECK(peer != NULL, return NULL);
2197     peer->host = strdup(host);
2198     peer->devices = pcmk__strkey_table(free, free);
2199 
2200     /* Each child element describes one capable device available to the peer */
2201     for (child = pcmk__xml_first_child(xml); child != NULL;
2202          child = pcmk__xml_next(child)) {
2203         const char *device = ID(child);
2204 
2205         if (device) {
2206             add_device_properties(child, op, peer, device);
2207         }
2208     }
2209 
2210     peer->ndevices = g_hash_table_size(peer->devices);
2211     CRM_CHECK(ndevices == peer->ndevices,
2212               crm_err("Query claimed to have %d device%s but %d found",
2213                       ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2214 
2215     op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2216     return peer;
2217 }
2218 
2219 /*!
2220  * \internal
2221  * \brief Handle a peer's reply to our fencing query
2222  *
2223  * Parse a query result from XML and store it in the remote operation
2224  * table, and when enough replies have been received, issue a fencing request.
2225  *
2226  * \param[in] msg  XML reply received
2227  *
2228  * \return pcmk_ok on success, -errno on error
2229  *
2230  * \note See initiate_remote_stonith_op() for how the XML query was initially
2231  *       formed, and stonith_query() for how the peer formed its XML reply.
2232  */
2233 int
2234 process_remote_stonith_query(xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
2235 {
2236     int ndevices = 0;
2237     gboolean host_is_target = FALSE;
2238     gboolean have_all_replies = FALSE;
2239     const char *id = NULL;
2240     const char *host = NULL;
2241     remote_fencing_op_t *op = NULL;
2242     peer_device_info_t *peer = NULL;
2243     uint32_t replies_expected;
2244     xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2245 
2246     CRM_CHECK(dev != NULL, return -EPROTO);
2247 
2248     id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2249     CRM_CHECK(id != NULL, return -EPROTO);
2250 
2251     dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
2252     CRM_CHECK(dev != NULL, return -EPROTO);
2253     crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
2254 
2255     op = g_hash_table_lookup(stonith_remote_op_list, id);
2256     if (op == NULL) {
2257         crm_debug("Received query reply for unknown or expired operation %s",
2258                   id);
2259         return -EOPNOTSUPP;
2260     }
2261 
2262     replies_expected = fencing_active_peers();
2263     if (op->replies_expected < replies_expected) {
2264         replies_expected = op->replies_expected;
2265     }
2266     if ((++op->replies >= replies_expected) && (op->state == st_query)) {
2267         have_all_replies = TRUE;
2268     }
2269     host = crm_element_value(msg, F_ORIG);
2270     host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
2271 
2272     crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
2273              op->replies, replies_expected, host,
2274              op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
2275     if (ndevices > 0) {
2276         peer = add_result(op, host, ndevices, dev);
2277     }
2278 
2279     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2280 
2281     if (pcmk_is_set(op->call_options, st_opt_topology)) {
2282         /* If we start the fencing before all the topology results are in,
2283          * it is possible fencing levels will be skipped because of the missing
2284          * query results. */
2285         if (op->state == st_query && all_topology_devices_found(op)) {
2286             /* All the query results are in for the topology, start the fencing ops. */
2287             crm_trace("All topology devices found");
2288             request_peer_fencing(op, peer);
2289 
2290         } else if (have_all_replies) {
2291             crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
2292                      replies_expected, op->replies);
2293             request_peer_fencing(op, NULL);
2294         }
2295 
2296     } else if (op->state == st_query) {
2297         int nverified = count_peer_devices(op, peer, TRUE,
2298                                            fenced_support_flag(op->action));
2299 
2300         /* We have a result for a non-topology fencing op that looks promising,
2301          * go ahead and start fencing before query timeout */
2302         if ((peer != NULL) && !host_is_target && nverified) {
2303             /* we have a verified device living on a peer that is not the target */
2304             crm_trace("Found %d verified device%s",
2305                       nverified, pcmk__plural_s(nverified));
2306             request_peer_fencing(op, peer);
2307 
2308         } else if (have_all_replies) {
2309             crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
2310                      replies_expected, op->replies);
2311             request_peer_fencing(op, NULL);
2312 
2313         } else {
2314             crm_trace("Waiting for more peer results before launching fencing operation");
2315         }
2316 
2317     } else if ((peer != NULL) && (op->state == st_done)) {
2318         crm_info("Discarding query result from %s (%d device%s): "
2319                  "Operation is %s", peer->host,
2320                  peer->ndevices, pcmk__plural_s(peer->ndevices),
2321                  stonith_op_state_str(op->state));
2322     }
2323 
2324     return pcmk_ok;
2325 }
2326 
2327 /*!
2328  * \internal
2329  * \brief Handle a peer's reply to a fencing request
2330  *
2331  * Parse a fencing reply from XML, and either finalize the operation
2332  * or attempt another device as appropriate.
2333  *
2334  * \param[in] msg  XML reply received
2335  */
2336 void
2337 fenced_process_fencing_reply(xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
2338 {
2339     const char *id = NULL;
2340     const char *device = NULL;
2341     remote_fencing_op_t *op = NULL;
2342     xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2343     pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2344 
2345     CRM_CHECK(dev != NULL, return);
2346 
2347     id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2348     CRM_CHECK(id != NULL, return);
2349 
2350     dev = stonith__find_xe_with_result(msg);
2351     CRM_CHECK(dev != NULL, return);
2352 
2353     stonith__xe_get_result(dev, &result);
2354 
2355     device = crm_element_value(dev, F_STONITH_DEVICE);
2356 
2357     if (stonith_remote_op_list) {
2358         op = g_hash_table_lookup(stonith_remote_op_list, id);
2359     }
2360 
2361     if ((op == NULL) && pcmk__result_ok(&result)) {
2362         /* Record successful fencing operations */
2363         const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
2364 
2365         op = create_remote_stonith_op(client_id, dev, TRUE);
2366     }
2367 
2368     if (op == NULL) {
2369         /* Could be for an event that began before we started */
2370         /* TODO: Record the op for later querying */
2371         crm_info("Received peer result of unknown or expired operation %s", id);
2372         pcmk__reset_result(&result);
2373         return;
2374     }
2375 
2376     pcmk__reset_result(&op->result);
2377     op->result = result; // The operation takes ownership of the result
2378 
2379     if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2380         crm_err("Received outdated reply for device %s (instead of %s) to "
2381                 "fence (%s) %s. Operation already timed out at peer level.",
2382                 device, (const char *) op->devices->data, op->action, op->target);
2383         return;
2384     }
2385 
2386     if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
2387         if (pcmk__result_ok(&op->result)) {
2388             op->state = st_done;
2389         } else {
2390             op->state = st_failed;
2391         }
2392         finalize_op(op, msg, false);
2393         return;
2394 
2395     } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2396         /* If this isn't a remote level broadcast, and we are not the
2397          * originator of the operation, we should not be receiving this msg. */
2398         crm_err("Received non-broadcast fencing result for operation %.8s "
2399                 "we do not own (device %s targeting %s)",
2400                 op->id, device, op->target);
2401         return;
2402     }
2403 
2404     if (pcmk_is_set(op->call_options, st_opt_topology)) {
2405         const char *device = NULL;
2406         const char *reason = op->result.exit_reason;
2407 
2408         /* We own the op, and it is complete. broadcast the result to all nodes
2409          * and notify our local clients. */
2410         if (op->state == st_done) {
2411             finalize_op(op, msg, false);
2412             return;
2413         }
2414 
2415         device = crm_element_value(msg, F_STONITH_DEVICE);
2416 
2417         if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2418             /* A remapped "on" failed, but the node was already turned off
2419              * successfully, so ignore the error and continue.
2420              */
2421             crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2422                      "after successful 'off'",
2423                      device, pcmk_exec_status_str(op->result.execution_status),
2424                      (reason == NULL)? "" : ": ",
2425                      (reason == NULL)? "" : reason,
2426                      op->target);
2427             pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2428         } else {
2429             crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
2430                        "%s%s%s%s",
2431                        op->action, op->target,
2432                        ((device == NULL)? "" : " using "),
2433                        ((device == NULL)? "" : device),
2434                        op->client_name,
2435                        op->originator,
2436                        pcmk_exec_status_str(op->result.execution_status),
2437                        (reason == NULL)? "" : " (",
2438                        (reason == NULL)? "" : reason,
2439                        (reason == NULL)? "" : ")");
2440         }
2441 
2442         if (pcmk__result_ok(&op->result)) {
2443             /* An operation completed successfully. Try another device if
2444              * necessary, otherwise mark the operation as done. */
2445             advance_topology_device_in_level(op, device, msg);
2446             return;
2447         } else {
2448             /* This device failed, time to try another topology level. If no other
2449              * levels are available, mark this operation as failed and report results. */
2450             if (advance_topology_level(op, false) != pcmk_rc_ok) {
2451                 op->state = st_failed;
2452                 finalize_op(op, msg, false);
2453                 return;
2454             }
2455         }
2456 
2457     } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2458         op->state = st_done;
2459         finalize_op(op, msg, false);
2460         return;
2461 
2462     } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2463                && (op->devices == NULL)) {
2464         /* If the operation timed out don't bother retrying other peers. */
2465         op->state = st_failed;
2466         finalize_op(op, msg, false);
2467         return;
2468 
2469     } else {
2470         /* fall-through and attempt other fencing action using another peer */
2471     }
2472 
2473     /* Retry on failure */
2474     crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2475               op->target, op->originator, op->client_name,
2476               pcmk_exec_status_str(op->result.execution_status));
2477     request_peer_fencing(op, NULL);
2478 }
2479 
2480 gboolean
2481 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
2482 {
2483     GHashTableIter iter;
2484     time_t now = time(NULL);
2485     remote_fencing_op_t *rop = NULL;
2486 
2487     if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2488         action == NULL) {
2489         return FALSE;
2490     }
2491 
2492     g_hash_table_iter_init(&iter, stonith_remote_op_list);
2493     while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2494         if (strcmp(rop->target, target) != 0) {
2495             continue;
2496         } else if (rop->state != st_done) {
2497             continue;
2498         /* We don't have to worry about remapped reboots here
2499          * because if state is done, any remapping has been undone
2500          */
2501         } else if (strcmp(rop->action, action) != 0) {
2502             continue;
2503         } else if ((rop->completed + tolerance) < now) {
2504             continue;
2505         }
2506 
2507         crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2508                    target, action, tolerance, rop->delegate, rop->originator);
2509         return TRUE;
2510     }
2511     return FALSE;
2512 }

/* [previous][next][first][last][top][bottom][index][help] */