root/daemons/fenced/fenced_remote.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sort_strings
  2. free_remote_query
  3. free_stonith_remote_op_list
  4. count_peer_device
  5. count_peer_devices
  6. find_peer_device
  7. grab_peer_device
  8. clear_remote_op_timers
  9. free_remote_op
  10. init_stonith_remote_op_hash_table
  11. op_requested_action
  12. op_phase_off
  13. op_phase_on
  14. undo_op_remap
  15. fencing_result2xml
  16. fenced_broadcast_op_result
  17. handle_local_reply_and_notify
  18. finalize_op_duplicates
  19. delegate_from_xml
  20. finalize_op
  21. remote_op_watchdog_done
  22. remote_op_timeout_one
  23. finalize_timed_out_op
  24. remote_op_timeout
  25. remote_op_query_timeout
  26. topology_is_empty
  27. add_required_device
  28. remove_required_device
  29. set_op_device_list
  30. topology_matches
  31. find_topology_for_host
  32. advance_topology_level
  33. merge_duplicates
  34. fencing_active_peers
  35. fenced_handle_manual_confirmation
  36. create_remote_stonith_op
  37. initiate_remote_stonith_op
  38. find_best_peer
  39. stonith_choose_peer
  40. get_device_timeout
  41. add_device_timeout
  42. get_peer_timeout
  43. get_op_total_timeout
  44. report_timeout_period
  45. advance_topology_device_in_level
  46. check_watchdog_fencing_and_wait
  47. request_peer_fencing
  48. sort_peers
  49. all_topology_devices_found
  50. parse_action_specific
  51. add_device_properties
  52. add_result
  53. process_remote_stonith_query
  54. fenced_process_fencing_reply
  55. stonith_check_fence_tolerance

   1 /*
   2  * Copyright 2009-2022 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sys/param.h>
  13 #include <stdio.h>
  14 #include <sys/types.h>
  15 #include <sys/wait.h>
  16 #include <sys/stat.h>
  17 #include <unistd.h>
  18 #include <sys/utsname.h>
  19 
  20 #include <stdlib.h>
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <ctype.h>
  24 #include <regex.h>
  25 
  26 #include <crm/crm.h>
  27 #include <crm/msg_xml.h>
  28 #include <crm/common/ipc.h>
  29 #include <crm/common/ipc_internal.h>
  30 #include <crm/cluster/internal.h>
  31 
  32 #include <crm/stonith-ng.h>
  33 #include <crm/fencing/internal.h>
  34 #include <crm/common/xml.h>
  35 #include <crm/common/xml_internal.h>
  36 
  37 #include <crm/common/util.h>
  38 #include <pacemaker-fenced.h>
  39 
  40 #define TIMEOUT_MULTIPLY_FACTOR 1.2
  41 
  42 /* When one fencer queries its peers for devices able to handle a fencing
  43  * request, each peer will reply with a list of such devices available to it.
  44  * Each reply will be parsed into a peer_device_info_t, with each device's
  45  * information kept in a device_properties_t.
  46  */
  47 
  48 typedef struct device_properties_s {
  49     /* Whether access to this device has been verified */
  50     gboolean verified;
  51 
  52     /* The remaining members are indexed by the operation's "phase" */
  53 
  54     /* Whether this device has been executed in each phase */
  55     gboolean executed[st_phase_max];
  56     /* Whether this device is disallowed from executing in each phase */
  57     gboolean disallowed[st_phase_max];
  58     /* Action-specific timeout for each phase */
  59     int custom_action_timeout[st_phase_max];
  60     /* Action-specific maximum random delay for each phase */
  61     int delay_max[st_phase_max];
  62     /* Action-specific base delay for each phase */
  63     int delay_base[st_phase_max];
  64     /* Group of enum st_device_flags */
  65     uint32_t device_support_flags;
  66 } device_properties_t;
  67 
  68 typedef struct {
  69     /* Name of peer that sent this result */
  70     char *host;
  71     /* Only try peers for non-topology based operations once */
  72     gboolean tried;
  73     /* Number of entries in the devices table */
  74     int ndevices;
  75     /* Devices available to this host that are capable of fencing the target */
  76     GHashTable *devices;
  77 } peer_device_info_t;
  78 
  79 GHashTable *stonith_remote_op_list = NULL;
  80 
  81 extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
  82                                   int call_options);
  83 
  84 static void request_peer_fencing(remote_fencing_op_t *op,
  85                                  peer_device_info_t *peer);
  86 static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
  87 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
  88 static int get_op_total_timeout(const remote_fencing_op_t *op,
  89                                 const peer_device_info_t *chosen_peer);
  90 
  91 static gint
  92 sort_strings(gconstpointer a, gconstpointer b)
     /* [previous][next][first][last][top][bottom][index][help] */
  93 {
  94     return strcmp(a, b);
  95 }
  96 
  97 static void
  98 free_remote_query(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
  99 {
 100     if (data != NULL) {
 101         peer_device_info_t *peer = data;
 102 
 103         g_hash_table_destroy(peer->devices);
 104         free(peer->host);
 105         free(peer);
 106     }
 107 }
 108 
 109 void
 110 free_stonith_remote_op_list(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 111 {
 112     if (stonith_remote_op_list != NULL) {
 113         g_hash_table_destroy(stonith_remote_op_list);
 114         stonith_remote_op_list = NULL;
 115     }
 116 }
 117 
 118 struct peer_count_data {
 119     const remote_fencing_op_t *op;
 120     gboolean verified_only;
 121     uint32_t support_action_only;
 122     int count;
 123 };
 124 
 125 /*!
 126  * \internal
 127  * \brief Increment a counter if a device has not been executed yet
 128  *
 129  * \param[in]     key        Device ID (ignored)
 130  * \param[in]     value      Device properties
 131  * \param[in,out] user_data  Peer count data
 132  */
 133 static void
 134 count_peer_device(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 135 {
 136     device_properties_t *props = (device_properties_t*)value;
 137     struct peer_count_data *data = user_data;
 138 
 139     if (!props->executed[data->op->phase]
 140         && (!data->verified_only || props->verified)
 141         && ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) {
 142         ++(data->count);
 143     }
 144 }
 145 
 146 /*!
 147  * \internal
 148  * \brief Check the number of available devices in a peer's query results
 149  *
 150  * \param[in] op             Operation that results are for
 151  * \param[in] peer           Peer to count
 152  * \param[in] verified_only  Whether to count only verified devices
 153  * \param[in] support_action_only Whether to count only devices that support action
 154  *
 155  * \return Number of devices available to peer that were not already executed
 156  */
 157 static int
 158 count_peer_devices(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
 159                    const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
 160 {
 161     struct peer_count_data data;
 162 
 163     data.op = op;
 164     data.verified_only = verified_only;
 165     data.support_action_only = support_on_action_only;
 166     data.count = 0;
 167     if (peer) {
 168         g_hash_table_foreach(peer->devices, count_peer_device, &data);
 169     }
 170     return data.count;
 171 }
 172 
 173 /*!
 174  * \internal
 175  * \brief Search for a device in a query result
 176  *
 177  * \param[in] op      Operation that result is for
 178  * \param[in] peer    Query result for a peer
 179  * \param[in] device  Device ID to search for
 180  *
 181  * \return Device properties if found, NULL otherwise
 182  */
 183 static device_properties_t *
 184 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
     /* [previous][next][first][last][top][bottom][index][help] */
 185                  const char *device, uint32_t support_action_only)
 186 {
 187     device_properties_t *props = g_hash_table_lookup(peer->devices, device);
 188 
 189     if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) {
 190         return NULL;
 191     }
 192     return (props && !props->executed[op->phase]
 193            && !props->disallowed[op->phase])? props : NULL;
 194 }
 195 
 196 /*!
 197  * \internal
 198  * \brief Find a device in a peer's device list and mark it as executed
 199  *
 200  * \param[in]     op                     Operation that peer result is for
 201  * \param[in,out] peer                   Peer with results to search
 202  * \param[in]     device                 ID of device to mark as done
 203  * \param[in]     verified_devices_only  Only consider verified devices
 204  *
 205  * \return TRUE if device was found and marked, FALSE otherwise
 206  */
 207 static gboolean
 208 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
     /* [previous][next][first][last][top][bottom][index][help] */
 209                  const char *device, gboolean verified_devices_only)
 210 {
 211     device_properties_t *props = find_peer_device(op, peer, device,
 212                                                   fenced_support_flag(op->action));
 213 
 214     if ((props == NULL) || (verified_devices_only && !props->verified)) {
 215         return FALSE;
 216     }
 217 
 218     crm_trace("Removing %s from %s (%d remaining)",
 219               device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none));
 220     props->executed[op->phase] = TRUE;
 221     return TRUE;
 222 }
 223 
 224 static void
 225 clear_remote_op_timers(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 226 {
 227     if (op->query_timer) {
 228         g_source_remove(op->query_timer);
 229         op->query_timer = 0;
 230     }
 231     if (op->op_timer_total) {
 232         g_source_remove(op->op_timer_total);
 233         op->op_timer_total = 0;
 234     }
 235     if (op->op_timer_one) {
 236         g_source_remove(op->op_timer_one);
 237         op->op_timer_one = 0;
 238     }
 239 }
 240 
 241 static void
 242 free_remote_op(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 243 {
 244     remote_fencing_op_t *op = data;
 245 
 246     crm_log_xml_debug(op->request, "Destroying");
 247 
 248     clear_remote_op_timers(op);
 249 
 250     free(op->id);
 251     free(op->action);
 252     free(op->delegate);
 253     free(op->target);
 254     free(op->client_id);
 255     free(op->client_name);
 256     free(op->originator);
 257 
 258     if (op->query_results) {
 259         g_list_free_full(op->query_results, free_remote_query);
 260     }
 261     if (op->request) {
 262         free_xml(op->request);
 263         op->request = NULL;
 264     }
 265     if (op->devices_list) {
 266         g_list_free_full(op->devices_list, free);
 267         op->devices_list = NULL;
 268     }
 269     g_list_free_full(op->automatic_list, free);
 270     g_list_free(op->duplicates);
 271 
 272     pcmk__reset_result(&op->result);
 273     free(op);
 274 }
 275 
 276 void
 277 init_stonith_remote_op_hash_table(GHashTable **table)
     /* [previous][next][first][last][top][bottom][index][help] */
 278 {
 279     if (*table == NULL) {
 280         *table = pcmk__strkey_table(NULL, free_remote_op);
 281     }
 282 }
 283 
 284 /*!
 285  * \internal
 286  * \brief Return an operation's originally requested action (before any remap)
 287  *
 288  * \param[in] op  Operation to check
 289  *
 290  * \return Operation's original action
 291  */
 292 static const char *
 293 op_requested_action(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 294 {
 295     return ((op->phase > st_phase_requested)? "reboot" : op->action);
 296 }
 297 
 298 /*!
 299  * \internal
 300  * \brief Remap a "reboot" operation to the "off" phase
 301  *
 302  * \param[in,out] op      Operation to remap
 303  */
 304 static void
 305 op_phase_off(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 306 {
 307     crm_info("Remapping multiple-device reboot targeting %s to 'off' "
 308              CRM_XS " id=%.8s", op->target, op->id);
 309     op->phase = st_phase_off;
 310 
 311     /* Happily, "off" and "on" are shorter than "reboot", so we can reuse the
 312      * memory allocation at each phase.
 313      */
 314     strcpy(op->action, "off");
 315 }
 316 
 317 /*!
 318  * \internal
 319  * \brief Advance a remapped reboot operation to the "on" phase
 320  *
 321  * \param[in,out] op  Operation to remap
 322  */
 323 static void
 324 op_phase_on(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 325 {
 326     GList *iter = NULL;
 327 
 328     crm_info("Remapped 'off' targeting %s complete, "
 329              "remapping to 'on' for %s " CRM_XS " id=%.8s",
 330              op->target, op->client_name, op->id);
 331     op->phase = st_phase_on;
 332     strcpy(op->action, "on");
 333 
 334     /* Skip devices with automatic unfencing, because the cluster will handle it
 335      * when the node rejoins.
 336      */
 337     for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
 338         GList *match = g_list_find_custom(op->devices_list, iter->data,
 339                                             sort_strings);
 340 
 341         if (match) {
 342             op->devices_list = g_list_remove(op->devices_list, match->data);
 343         }
 344     }
 345     g_list_free_full(op->automatic_list, free);
 346     op->automatic_list = NULL;
 347 
 348     /* Rewind device list pointer */
 349     op->devices = op->devices_list;
 350 }
 351 
 352 /*!
 353  * \internal
 354  * \brief Reset a remapped reboot operation
 355  *
 356  * \param[in,out] op  Operation to reset
 357  */
 358 static void
 359 undo_op_remap(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 360 {
 361     if (op->phase > 0) {
 362         crm_info("Undoing remap of reboot targeting %s for %s "
 363                  CRM_XS " id=%.8s", op->target, op->client_name, op->id);
 364         op->phase = st_phase_requested;
 365         strcpy(op->action, "reboot");
 366     }
 367 }
 368 
 369 /*!
 370  * \internal
 371  * \brief Create notification data XML for a fencing operation result
 372  *
 373  * \param[in] op      Fencer operation that completed
 374  *
 375  * \return Newly created XML to add as notification data
 376  * \note The caller is responsible for freeing the result.
 377  */
 378 static xmlNode *
 379 fencing_result2xml(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 380 {
 381     xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
 382 
 383     crm_xml_add_int(notify_data, "state", op->state);
 384     crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
 385     crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
 386     crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
 387     crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
 388     crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
 389     crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
 390     crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
 391 
 392     return notify_data;
 393 }
 394 
 395 /*!
 396  * \internal
 397  * \brief Broadcast a fence result notification to all CPG peers
 398  *
 399  * \param[in] op         Fencer operation that completed
 400  * \param[in] op_merged  Whether this operation is a duplicate of another
 401  */
 402 void
 403 fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
     /* [previous][next][first][last][top][bottom][index][help] */
 404 {
 405     static int count = 0;
 406     xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
 407     xmlNode *notify_data = fencing_result2xml(op);
 408 
 409     count++;
 410     crm_trace("Broadcasting result to peers");
 411     crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
 412     crm_xml_add(bcast, F_SUBTYPE, "broadcast");
 413     crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
 414     crm_xml_add_int(bcast, "count", count);
 415 
 416     if (op_merged) {
 417         pcmk__xe_set_bool_attr(bcast, F_STONITH_MERGED, true);
 418     }
 419 
 420     stonith__xe_set_result(notify_data, &op->result);
 421 
 422     add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
 423     send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
 424     free_xml(notify_data);
 425     free_xml(bcast);
 426 
 427     return;
 428 }
 429 
 430 /*!
 431  * \internal
 432  * \brief Reply to a local request originator and notify all subscribed clients
 433  *
 434  * \param[in,out] op    Fencer operation that completed
 435  * \param[in,out] data  Top-level XML to add notification to
 436  */
 437 static void
 438 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 439 {
 440     xmlNode *notify_data = NULL;
 441     xmlNode *reply = NULL;
 442     pcmk__client_t *client = NULL;
 443 
 444     if (op->notify_sent == TRUE) {
 445         /* nothing to do */
 446         return;
 447     }
 448 
 449     /* Do notification with a clean data object */
 450     crm_xml_add_int(data, "state", op->state);
 451     crm_xml_add(data, F_STONITH_TARGET, op->target);
 452     crm_xml_add(data, F_STONITH_OPERATION, op->action);
 453 
 454     reply = fenced_construct_reply(op->request, data, &op->result);
 455     crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
 456 
 457     /* Send fencing OP reply to local client that initiated fencing */
 458     client = pcmk__find_client_by_id(op->client_id);
 459     if (client == NULL) {
 460         crm_trace("Skipping reply to %s: no longer a client", op->client_id);
 461     } else {
 462         do_local_reply(reply, client, op->call_options);
 463     }
 464 
 465     /* bcast to all local clients that the fencing operation happend */
 466     notify_data = fencing_result2xml(op);
 467     fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data);
 468     free_xml(notify_data);
 469     fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
 470 
 471     /* mark this op as having notify's already sent */
 472     op->notify_sent = TRUE;
 473     free_xml(reply);
 474 }
 475 
 476 /*!
 477  * \internal
 478  * \brief Finalize all duplicates of a given fencer operation
 479  *
 480  * \param[in,out] op    Fencer operation that completed
 481  * \param[in,out] data  Top-level XML to add notification to
 482  */
 483 static void
 484 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 485 {
 486     for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
 487         remote_fencing_op_t *other = iter->data;
 488 
 489         if (other->state == st_duplicate) {
 490             other->state = op->state;
 491             crm_debug("Performing duplicate notification for %s@%s: %s "
 492                       CRM_XS " id=%.8s",
 493                       other->client_name, other->originator,
 494                       pcmk_exec_status_str(op->result.execution_status),
 495                       other->id);
 496             pcmk__copy_result(&op->result, &other->result);
 497             finalize_op(other, data, true);
 498 
 499         } else {
 500             // Possible if (for example) it timed out already
 501             crm_err("Skipping duplicate notification for %s@%s "
 502                     CRM_XS " state=%s id=%.8s",
 503                     other->client_name, other->originator,
 504                     stonith_op_state_str(other->state), other->id);
 505         }
 506     }
 507 }
 508 
 509 static char *
 510 delegate_from_xml(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
 511 {
 512     xmlNode *match = get_xpath_object("//@" F_STONITH_DELEGATE, xml, LOG_NEVER);
 513 
 514     if (match == NULL) {
 515         return crm_element_value_copy(xml, F_ORIG);
 516     } else {
 517         return crm_element_value_copy(match, F_STONITH_DELEGATE);
 518     }
 519 }
 520 
 521 /*!
 522  * \internal
 523  * \brief Finalize a peer fencing operation
 524  *
 525  * Clean up after a fencing operation completes. This function has two code
 526  * paths: the executioner uses it to broadcast the result to CPG peers, and then
 527  * each peer (including the executioner) uses it to process that broadcast and
 528  * notify its IPC clients of the result.
 529  *
 530  * \param[in,out] op      Fencer operation that completed
 531  * \param[in,out] data    If not NULL, XML reply of last delegated operation
 532  * \param[in]     dup     Whether this operation is a duplicate of another
 533  *                        (in which case, do not broadcast the result)
 534  *
 535  *  \note The operation result should be set before calling this function.
 536  */
 537 static void
 538 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
     /* [previous][next][first][last][top][bottom][index][help] */
 539 {
 540     int level = LOG_ERR;
 541     const char *subt = NULL;
 542     xmlNode *local_data = NULL;
 543     gboolean op_merged = FALSE;
 544 
 545     CRM_CHECK((op != NULL), return);
 546 
 547     if (op->notify_sent) {
 548         // Most likely, this is a timed-out action that eventually completed
 549         crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
 550                    "Result arrived too late " CRM_XS " id=%.8s",
 551                    op->action, (op->target? " targeting " : ""),
 552                    (op->target? op->target : ""),
 553                    (op->delegate? op->delegate : "unknown node"),
 554                    op->client_name, op->originator,
 555                    (op_merged? " (merged)" : ""),
 556                    op->id);
 557         return;
 558     }
 559 
 560     set_fencing_completed(op);
 561     clear_remote_op_timers(op);
 562     undo_op_remap(op);
 563 
 564     if (data == NULL) {
 565         data = create_xml_node(NULL, "remote-op");
 566         local_data = data;
 567 
 568     } else if (op->delegate == NULL) {
 569         switch (op->result.execution_status) {
 570             case PCMK_EXEC_NO_FENCE_DEVICE:
 571                 break;
 572 
 573             case PCMK_EXEC_INVALID:
 574                 if (op->result.exit_status != CRM_EX_EXPIRED) {
 575                     op->delegate = delegate_from_xml(data);
 576                 }
 577                 break;
 578 
 579             default:
 580                 op->delegate = delegate_from_xml(data);
 581                 break;
 582         }
 583     }
 584 
 585     if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) {
 586         op_merged = true;
 587     }
 588 
 589     /* Tell everyone the operation is done, we will continue
 590      * with doing the local notifications once we receive
 591      * the broadcast back. */
 592     subt = crm_element_value(data, F_SUBTYPE);
 593     if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
 594         /* Defer notification until the bcast message arrives */
 595         fenced_broadcast_op_result(op, op_merged);
 596         free_xml(local_data);
 597         return;
 598     }
 599 
 600     if (pcmk__result_ok(&op->result) || dup
 601         || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
 602         level = LOG_NOTICE;
 603     }
 604     do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
 605                CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
 606                (op->target? op->target : ""),
 607                (op->delegate? op->delegate : "unknown node"),
 608                op->client_name, op->originator,
 609                (op_merged? " (merged)" : ""),
 610                crm_exit_str(op->result.exit_status),
 611                pcmk_exec_status_str(op->result.execution_status),
 612                ((op->result.exit_reason == NULL)? "" : ": "),
 613                ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
 614                op->id);
 615 
 616     handle_local_reply_and_notify(op, data);
 617 
 618     if (!dup) {
 619         finalize_op_duplicates(op, data);
 620     }
 621 
 622     /* Free non-essential parts of the record
 623      * Keep the record around so we can query the history
 624      */
 625     if (op->query_results) {
 626         g_list_free_full(op->query_results, free_remote_query);
 627         op->query_results = NULL;
 628     }
 629     if (op->request) {
 630         free_xml(op->request);
 631         op->request = NULL;
 632     }
 633 
 634     free_xml(local_data);
 635 }
 636 
 637 /*!
 638  * \internal
 639  * \brief Finalize a watchdog fencer op after the waiting time expires
 640  *
 641  * \param[in,out] userdata  Fencer operation that completed
 642  *
 643  * \return G_SOURCE_REMOVE (which tells glib not to restart timer)
 644  */
 645 static gboolean
 646 remote_op_watchdog_done(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 647 {
 648     remote_fencing_op_t *op = userdata;
 649 
 650     op->op_timer_one = 0;
 651 
 652     crm_notice("Self-fencing (%s) by %s for %s assumed complete "
 653                CRM_XS " id=%.8s",
 654                op->action, op->target, op->client_name, op->id);
 655     op->state = st_done;
 656     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
 657     finalize_op(op, NULL, false);
 658     return G_SOURCE_REMOVE;
 659 }
 660 
 661 static gboolean
 662 remote_op_timeout_one(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 663 {
 664     remote_fencing_op_t *op = userdata;
 665 
 666     op->op_timer_one = 0;
 667 
 668     crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
 669                " id=%.8s", op->action, op->target, op->client_name, op->id);
 670     pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
 671                      "Peer did not return fence result within timeout");
 672 
 673     // Try another device, if appropriate
 674     request_peer_fencing(op, NULL);
 675     return FALSE;
 676 }
 677 
 678 /*!
 679  * \internal
 680  * \brief Finalize a remote fencer operation that timed out
 681  *
 682  * \param[in,out] op      Fencer operation that timed out
 683  * \param[in]     reason  Readable description of what step timed out
 684  */
 685 static void
 686 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
     /* [previous][next][first][last][top][bottom][index][help] */
 687 {
 688     op->op_timer_total = 0;
 689 
 690     crm_debug("Action '%s' targeting %s for client %s timed out "
 691               CRM_XS " id=%.8s",
 692               op->action, op->target, op->client_name, op->id);
 693 
 694     if (op->phase == st_phase_on) {
 695         /* A remapped reboot operation timed out in the "on" phase, but the
 696          * "off" phase completed successfully, so quit trying any further
 697          * devices, and return success.
 698          */
 699         op->state = st_done;
 700         pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
 701     } else {
 702         op->state = st_failed;
 703         pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
 704     }
 705     finalize_op(op, NULL, false);
 706 }
 707 
 708 /*!
 709  * \internal
 710  * \brief Finalize a remote fencer operation that timed out
 711  *
 712  * \param[in,out] userdata  Fencer operation that timed out
 713  *
 714  * \return G_SOURCE_REMOVE (which tells glib not to restart timer)
 715  */
 716 static gboolean
 717 remote_op_timeout(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 718 {
 719     remote_fencing_op_t *op = userdata;
 720 
 721     if (op->state == st_done) {
 722         crm_debug("Action '%s' targeting %s for client %s already completed "
 723                   CRM_XS " id=%.8s",
 724                   op->action, op->target, op->client_name, op->id);
 725     } else {
 726         finalize_timed_out_op(userdata, "Fencing did not complete within a "
 727                                         "total timeout based on the "
 728                                         "configured timeout and retries for "
 729                                         "any devices attempted");
 730     }
 731     return G_SOURCE_REMOVE;
 732 }
 733 
 734 static gboolean
 735 remote_op_query_timeout(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 736 {
 737     remote_fencing_op_t *op = data;
 738 
 739     op->query_timer = 0;
 740     if (op->state == st_done) {
 741         crm_debug("Operation %.8s targeting %s already completed",
 742                   op->id, op->target);
 743     } else if (op->state == st_exec) {
 744         crm_debug("Operation %.8s targeting %s already in progress",
 745                   op->id, op->target);
 746     } else if (op->query_results) {
 747         // Query succeeded, so attempt the actual fencing
 748         crm_debug("Query %.8s targeting %s complete (state=%s)",
 749                   op->id, op->target, stonith_op_state_str(op->state));
 750         request_peer_fencing(op, NULL);
 751     } else {
 752         crm_debug("Query %.8s targeting %s timed out (state=%s)",
 753                   op->id, op->target, stonith_op_state_str(op->state));
 754         if (op->op_timer_total) {
 755             g_source_remove(op->op_timer_total);
 756             op->op_timer_total = 0;
 757         }
 758         finalize_timed_out_op(op, "No capable peers replied to device query "
 759                                   "within timeout");
 760     }
 761 
 762     return FALSE;
 763 }
 764 
 765 static gboolean
 766 topology_is_empty(stonith_topology_t *tp)
     /* [previous][next][first][last][top][bottom][index][help] */
 767 {
 768     int i;
 769 
 770     if (tp == NULL) {
 771         return TRUE;
 772     }
 773 
 774     for (i = 0; i < ST_LEVEL_MAX; i++) {
 775         if (tp->levels[i] != NULL) {
 776             return FALSE;
 777         }
 778     }
 779     return TRUE;
 780 }
 781 
 782 /*!
 783  * \internal
 784  * \brief Add a device to an operation's automatic unfencing list
 785  *
 786  * \param[in,out] op      Operation to modify
 787  * \param[in]     device  Device ID to add
 788  */
 789 static void
 790 add_required_device(remote_fencing_op_t *op, const char *device)
     /* [previous][next][first][last][top][bottom][index][help] */
 791 {
 792     GList *match  = g_list_find_custom(op->automatic_list, device,
 793                                          sort_strings);
 794 
 795     if (!match) {
 796         op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
 797     }
 798 }
 799 
 800 /*!
 801  * \internal
 802  * \brief Remove a device from the automatic unfencing list
 803  *
 804  * \param[in,out] op      Operation to modify
 805  * \param[in]     device  Device ID to remove
 806  */
 807 static void
 808 remove_required_device(remote_fencing_op_t *op, const char *device)
     /* [previous][next][first][last][top][bottom][index][help] */
 809 {
 810     GList *match = g_list_find_custom(op->automatic_list, device,
 811                                         sort_strings);
 812 
 813     if (match) {
 814         op->automatic_list = g_list_remove(op->automatic_list, match->data);
 815     }
 816 }
 817 
 818 /* deep copy the device list */
 819 static void
 820 set_op_device_list(remote_fencing_op_t * op, GList *devices)
     /* [previous][next][first][last][top][bottom][index][help] */
 821 {
 822     GList *lpc = NULL;
 823 
 824     if (op->devices_list) {
 825         g_list_free_full(op->devices_list, free);
 826         op->devices_list = NULL;
 827     }
 828     for (lpc = devices; lpc != NULL; lpc = lpc->next) {
 829         op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
 830     }
 831     op->devices = op->devices_list;
 832 }
 833 
 834 /*!
 835  * \internal
 836  * \brief Check whether a node matches a topology target
 837  *
 838  * \param[in] tp    Topology table entry to check
 839  * \param[in] node  Name of node to check
 840  *
 841  * \return TRUE if node matches topology target
 842  */
 843 static gboolean
 844 topology_matches(const stonith_topology_t *tp, const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 845 {
 846     regex_t r_patt;
 847 
 848     CRM_CHECK(node && tp && tp->target, return FALSE);
 849     switch (tp->kind) {
 850         case fenced_target_by_attribute:
 851             /* This level targets by attribute, so tp->target is a NAME=VALUE pair
 852              * of a permanent attribute applied to targeted nodes. The test below
 853              * relies on the locally cached copy of the CIB, so if fencing needs to
 854              * be done before the initial CIB is received or after a malformed CIB
 855              * is received, then the topology will be unable to be used.
 856              */
 857             if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
 858                 crm_notice("Matched %s with %s by attribute", node, tp->target);
 859                 return TRUE;
 860             }
 861             break;
 862 
 863         case fenced_target_by_pattern:
 864             /* This level targets node names matching a pattern, so tp->target
 865              * (and tp->target_pattern) is a regular expression.
 866              */
 867             if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
 868                 crm_info("Bad regex '%s' for fencing level", tp->target);
 869             } else {
 870                 int status = regexec(&r_patt, node, 0, NULL, 0);
 871 
 872                 regfree(&r_patt);
 873                 if (status == 0) {
 874                     crm_notice("Matched %s with %s by name", node, tp->target);
 875                     return TRUE;
 876                 }
 877             }
 878             break;
 879 
 880         case fenced_target_by_name:
 881             crm_trace("Testing %s against %s", node, tp->target);
 882             return pcmk__str_eq(tp->target, node, pcmk__str_casei);
 883 
 884         default:
 885             break;
 886     }
 887     crm_trace("No match for %s with %s", node, tp->target);
 888     return FALSE;
 889 }
 890 
 891 stonith_topology_t *
 892 find_topology_for_host(const char *host) 
     /* [previous][next][first][last][top][bottom][index][help] */
 893 {
 894     GHashTableIter tIter;
 895     stonith_topology_t *tp = g_hash_table_lookup(topology, host);
 896 
 897     if(tp != NULL) {
 898         crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
 899         return tp;
 900     }
 901 
 902     g_hash_table_iter_init(&tIter, topology);
 903     while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
 904         if (topology_matches(tp, host)) {
 905             crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
 906             return tp;
 907         }
 908     }
 909 
 910     crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
 911     return NULL;
 912 }
 913 
 914 /*!
 915  * \internal
 916  * \brief Set fencing operation's device list to target's next topology level
 917  *
 918  * \param[in,out] op        Remote fencing operation to modify
 919  * \param[in]     empty_ok  If true, an operation without a target (i.e.
 920  *                          queries) or a target without a topology will get a
 921  *                          pcmk_rc_ok return value instead of ENODEV
 922  *
 923  * \return Standard Pacemaker return value
 924  */
 925 static int
 926 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
     /* [previous][next][first][last][top][bottom][index][help] */
 927 {
 928     stonith_topology_t *tp = NULL;
 929 
 930     if (op->target) {
 931         tp = find_topology_for_host(op->target);
 932     }
 933     if (topology_is_empty(tp)) {
 934         return empty_ok? pcmk_rc_ok : ENODEV;
 935     }
 936 
 937     CRM_ASSERT(tp->levels != NULL);
 938 
 939     stonith__set_call_options(op->call_options, op->id, st_opt_topology);
 940 
 941     /* This is a new level, so undo any remapping left over from previous */
 942     undo_op_remap(op);
 943 
 944     do {
 945         op->level++;
 946 
 947     } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
 948 
 949     if (op->level < ST_LEVEL_MAX) {
 950         crm_trace("Attempting fencing level %d targeting %s (%d devices) "
 951                   "for client %s@%s (id=%.8s)",
 952                   op->level, op->target, g_list_length(tp->levels[op->level]),
 953                   op->client_name, op->originator, op->id);
 954         set_op_device_list(op, tp->levels[op->level]);
 955 
 956         // The requested delay has been applied for the first fencing level
 957         if (op->level > 1 && op->delay > 0) {
 958             op->delay = 0;
 959         }
 960 
 961         if ((g_list_next(op->devices_list) != NULL)
 962             && pcmk__str_eq(op->action, "reboot", pcmk__str_none)) {
 963             /* A reboot has been requested for a topology level with multiple
 964              * devices. Instead of rebooting the devices sequentially, we will
 965              * turn them all off, then turn them all on again. (Think about
 966              * switched power outlets for redundant power supplies.)
 967              */
 968             op_phase_off(op);
 969         }
 970         return pcmk_rc_ok;
 971     }
 972 
 973     crm_info("All fencing options targeting %s for client %s@%s failed "
 974              CRM_XS " id=%.8s",
 975              op->target, op->client_name, op->originator, op->id);
 976     return ENODEV;
 977 }
 978 
 979 /*!
 980  * \internal
 981  * \brief If fencing operation is a duplicate, merge it into the other one
 982  *
 983  * \param[in,out] op  Fencing operation to check
 984  */
 985 static void
 986 merge_duplicates(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 987 {
 988     GHashTableIter iter;
 989     remote_fencing_op_t *other = NULL;
 990 
 991     time_t now = time(NULL);
 992 
 993     g_hash_table_iter_init(&iter, stonith_remote_op_list);
 994     while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
 995         const char *other_action = op_requested_action(other);
 996 
 997         if (!strcmp(op->id, other->id)) {
 998             continue; // Don't compare against self
 999         }
1000         if (other->state > st_exec) {
1001             crm_trace("%.8s not duplicate of %.8s: not in progress",
1002                       op->id, other->id);
1003             continue;
1004         }
1005         if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
1006             crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
1007                       op->id, other->id, op->target, other->target);
1008             continue;
1009         }
1010         if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
1011             crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
1012                       op->id, other->id, op->action, other_action);
1013             continue;
1014         }
1015         if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
1016             crm_trace("%.8s not duplicate of %.8s: same client %s",
1017                       op->id, other->id, op->client_name);
1018             continue;
1019         }
1020         if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
1021             crm_trace("%.8s not duplicate of %.8s: suicide for %s",
1022                       op->id, other->id, other->target);
1023             continue;
1024         }
1025         if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
1026             crm_notice("Failing action '%s' targeting %s originating from "
1027                        "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
1028                        other->action, other->target, other->client_name,
1029                        other->originator, other->id);
1030             crm_trace("%.8s not duplicate of %.8s: originator dead",
1031                       op->id, other->id);
1032             other->state = st_failed;
1033             continue;
1034         }
1035         if ((other->total_timeout > 0)
1036             && (now > (other->total_timeout + other->created))) {
1037             crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
1038                       op->id, other->id, now, other->created,
1039                       other->total_timeout);
1040             continue;
1041         }
1042 
1043         /* There is another in-flight request to fence the same host
1044          * Piggyback on that instead.  If it fails, so do we.
1045          */
1046         other->duplicates = g_list_append(other->duplicates, op);
1047         if (other->total_timeout == 0) {
1048             other->total_timeout = op->total_timeout =
1049                 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
1050             crm_trace("Best guess as to timeout used for %.8s: %d",
1051                       other->id, other->total_timeout);
1052         }
1053         crm_notice("Merging fencing action '%s' targeting %s originating from "
1054                    "client %s with identical request from %s@%s "
1055                    CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
1056                    op->action, op->target, op->client_name,
1057                    other->client_name, other->originator,
1058                    op->id, other->id, other->total_timeout);
1059         report_timeout_period(op, other->total_timeout);
1060         op->state = st_duplicate;
1061     }
1062 }
1063 
1064 static uint32_t fencing_active_peers(void)
     /* [previous][next][first][last][top][bottom][index][help] */
1065 {
1066     uint32_t count = 0;
1067     crm_node_t *entry;
1068     GHashTableIter gIter;
1069 
1070     g_hash_table_iter_init(&gIter, crm_peer_cache);
1071     while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1072         if(fencing_peer_active(entry)) {
1073             count++;
1074         }
1075     }
1076     return count;
1077 }
1078 
1079 /*!
1080  * \internal
1081  * \brief Process a manual confirmation of a pending fence action
1082  *
1083  * \param[in]     client  IPC client that sent confirmation
1084  * \param[in,out] msg     Request XML with manual confirmation
1085  *
1086  * \return Standard Pacemaker return code
1087  */
1088 int
1089 fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
1090 {
1091     remote_fencing_op_t *op = NULL;
1092     xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
1093 
1094     CRM_CHECK(dev != NULL, return EPROTO);
1095 
1096     crm_notice("Received manual confirmation that %s has been fenced",
1097                pcmk__s(crm_element_value(dev, F_STONITH_TARGET),
1098                        "unknown target"));
1099     op = initiate_remote_stonith_op(client, msg, TRUE);
1100     if (op == NULL) {
1101         return EPROTO;
1102     }
1103     op->state = st_done;
1104     set_fencing_completed(op);
1105     op->delegate = strdup("a human");
1106 
1107     // For the fencer's purposes, the fencing operation is done
1108     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1109     finalize_op(op, msg, false);
1110 
1111     /* For the requester's purposes, the operation is still pending. The
1112      * actual result will be sent asynchronously via the operation's done_cb().
1113      */
1114     return EINPROGRESS;
1115 }
1116 
1117 /*!
1118  * \internal
1119  * \brief Create a new remote stonith operation
1120  *
1121  * \param[in] client   ID of local stonith client that initiated the operation
1122  * \param[in] request  The request from the client that started the operation
1123  * \param[in] peer     TRUE if this operation is owned by another stonith peer
1124  *                     (an operation owned by one peer is stored on all peers,
1125  *                     but only the owner executes it; all nodes get the results
1126  *                     once the owner finishes execution)
1127  */
1128 void *
1129 create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1130 {
1131     remote_fencing_op_t *op = NULL;
1132     xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
1133     int call_options = 0;
1134     const char *operation = NULL;
1135 
1136     init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1137 
1138     /* If this operation is owned by another node, check to make
1139      * sure we haven't already created this operation. */
1140     if (peer && dev) {
1141         const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1142 
1143         CRM_CHECK(op_id != NULL, return NULL);
1144 
1145         op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1146         if (op) {
1147             crm_debug("Reusing existing remote fencing op %.8s for %s",
1148                       op_id, ((client == NULL)? "unknown client" : client));
1149             return op;
1150         }
1151     }
1152 
1153     op = calloc(1, sizeof(remote_fencing_op_t));
1154     CRM_ASSERT(op != NULL);
1155 
1156     crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
1157     // Value -1 means disable any static/random fencing delays
1158     crm_element_value_int(request, F_STONITH_DELAY, &(op->delay));
1159 
1160     if (peer && dev) {
1161         op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
1162     } else {
1163         op->id = crm_generate_uuid();
1164     }
1165 
1166     g_hash_table_replace(stonith_remote_op_list, op->id, op);
1167 
1168     op->state = st_query;
1169     op->replies_expected = fencing_active_peers();
1170     op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
1171     op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
1172     op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE); /* May not be set */
1173     op->created = time(NULL);
1174 
1175     if (op->originator == NULL) {
1176         /* Local or relayed request */
1177         op->originator = strdup(stonith_our_uname);
1178     }
1179 
1180     CRM_LOG_ASSERT(client != NULL);
1181     if (client) {
1182         op->client_id = strdup(client);
1183     }
1184 
1185 
1186     /* For a RELAY operation, set fenced on the client. */
1187     operation = crm_element_value(request, F_STONITH_OPERATION);
1188 
1189     if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1190         op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1191                                          (unsigned long) getpid());
1192     } else {
1193         op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
1194     }
1195 
1196     op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
1197     op->request = copy_xml(request);    /* TODO: Figure out how to avoid this */
1198     crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
1199     op->call_options = call_options;
1200 
1201     crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
1202 
1203     crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1204               "base timeout %d, %u %s expected)",
1205               (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1206               op->target, op->client_name, op->base_timeout,
1207               op->replies_expected,
1208               pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1209 
1210     if (op->call_options & st_opt_cs_nodeid) {
1211         int nodeid;
1212         crm_node_t *node;
1213 
1214         pcmk__scan_min_int(op->target, &nodeid, 0);
1215         node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);
1216 
1217         /* Ensure the conversion only happens once */
1218         stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1219 
1220         if (node && node->uname) {
1221             free(op->target);
1222             op->target = strdup(node->uname);
1223 
1224         } else {
1225             crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1226         }
1227     }
1228 
1229     /* check to see if this is a duplicate operation of another in-flight operation */
1230     merge_duplicates(op);
1231 
1232     if (op->state != st_duplicate) {
1233         /* kick history readers */
1234         fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
1235     }
1236 
1237     /* safe to trim as long as that doesn't touch pending ops */
1238     stonith_fence_history_trim();
1239 
1240     return op;
1241 }
1242 
1243 /*!
1244  * \internal
1245  * \brief Create a peer fencing operation from a request, and initiate it
1246  *
1247  * \param[in] client     IPC client that made request (NULL to get from request)
1248  * \param[in] request    Request XML
1249  * \param[in] manual_ack Whether this is a manual action confirmation
1250  *
1251  * \return Newly created operation on success, otherwise NULL
1252  */
1253 remote_fencing_op_t *
1254 initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
     /* [previous][next][first][last][top][bottom][index][help] */
1255                            gboolean manual_ack)
1256 {
1257     int query_timeout = 0;
1258     xmlNode *query = NULL;
1259     const char *client_id = NULL;
1260     remote_fencing_op_t *op = NULL;
1261     const char *relay_op_id = NULL;
1262     const char *operation = NULL;
1263 
1264     if (client) {
1265         client_id = client->id;
1266     } else {
1267         client_id = crm_element_value(request, F_STONITH_CLIENTID);
1268     }
1269 
1270     CRM_LOG_ASSERT(client_id != NULL);
1271     op = create_remote_stonith_op(client_id, request, FALSE);
1272     op->owner = TRUE;
1273     if (manual_ack) {
1274         return op;
1275     }
1276 
1277     CRM_CHECK(op->action, return NULL);
1278 
1279     if (advance_topology_level(op, true) != pcmk_rc_ok) {
1280         op->state = st_failed;
1281     }
1282 
1283     switch (op->state) {
1284         case st_failed:
1285             // advance_topology_level() exhausted levels
1286             pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1287                              "All topology levels failed");
1288             crm_warn("Could not request peer fencing (%s) targeting %s "
1289                      CRM_XS " id=%.8s", op->action, op->target, op->id);
1290             finalize_op(op, NULL, false);
1291             return op;
1292 
1293         case st_duplicate:
1294             crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1295                      CRM_XS " id=%.8s", op->action, op->target, op->id);
1296             return op;
1297 
1298         default:
1299             crm_notice("Requesting peer fencing (%s) targeting %s "
1300                        CRM_XS " id=%.8s state=%s base_timeout=%d",
1301                        op->action, op->target, op->id,
1302                        stonith_op_state_str(op->state), op->base_timeout);
1303     }
1304 
1305     query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1306                               NULL, op->call_options);
1307 
1308     crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
1309     crm_xml_add(query, F_STONITH_TARGET, op->target);
1310     crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
1311     crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
1312     crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
1313     crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
1314     crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
1315 
1316     /* In case of RELAY operation, RELAY information is added to the query to delete the original operation of RELAY. */
1317     operation = crm_element_value(request, F_STONITH_OPERATION);
1318     if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1319         relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
1320         if (relay_op_id) {
1321             crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
1322         }
1323     }
1324 
1325     send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
1326     free_xml(query);
1327 
1328     query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1329     op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1330 
1331     return op;
1332 }
1333 
/*! Bit flags restricting which peers find_best_peer() will consider */
enum find_best_peer_options {
    /*! Skip checking the target peer for capable fencing devices */
    FIND_PEER_SKIP_TARGET   = (1 << 0),
    /*! Only check the target peer for capable fencing devices */
    FIND_PEER_TARGET_ONLY   = (1 << 1),
    /*! Skip peers and devices that are not verified */
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1342 
1343 static peer_device_info_t *
1344 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
     /* [previous][next][first][last][top][bottom][index][help] */
1345 {
1346     GList *iter = NULL;
1347     gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1348 
1349     if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1350         return NULL;
1351     }
1352 
1353     for (iter = op->query_results; iter != NULL; iter = iter->next) {
1354         peer_device_info_t *peer = iter->data;
1355 
1356         crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1357                   peer->host, op->target, peer->ndevices,
1358                   pcmk__plural_s(peer->ndevices), peer->tried, options);
1359         if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1360             continue;
1361         }
1362         if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1363             continue;
1364         }
1365 
1366         if (pcmk_is_set(op->call_options, st_opt_topology)) {
1367 
1368             if (grab_peer_device(op, peer, device, verified_devices_only)) {
1369                 return peer;
1370             }
1371 
1372         } else if (!peer->tried
1373                    && count_peer_devices(op, peer, verified_devices_only,
1374                                          fenced_support_flag(op->action))) {
1375             /* No topology: Use the current best peer */
1376             crm_trace("Simple fencing");
1377             return peer;
1378         }
1379     }
1380 
1381     return NULL;
1382 }
1383 
1384 static peer_device_info_t *
1385 stonith_choose_peer(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
1386 {
1387     const char *device = NULL;
1388     peer_device_info_t *peer = NULL;
1389     uint32_t active = fencing_active_peers();
1390 
1391     do {
1392         if (op->devices) {
1393             device = op->devices->data;
1394             crm_trace("Checking for someone to fence (%s) %s using %s",
1395                       op->action, op->target, device);
1396         } else {
1397             crm_trace("Checking for someone to fence (%s) %s",
1398                       op->action, op->target);
1399         }
1400 
1401         /* Best choice is a peer other than the target with verified access */
1402         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1403         if (peer) {
1404             crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1405             return peer;
1406         }
1407 
1408         if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1409             crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1410             return NULL;
1411         }
1412 
1413         /* If no other peer has verified access, next best is unverified access */
1414         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1415         if (peer) {
1416             crm_trace("Found best unverified peer %s", peer->host);
1417             return peer;
1418         }
1419 
1420         /* If no other peer can do it, last option is self-fencing
1421          * (which is never allowed for the "on" phase of a remapped reboot)
1422          */
1423         if (op->phase != st_phase_on) {
1424             peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1425             if (peer) {
1426                 crm_trace("%s will fence itself", peer->host);
1427                 return peer;
1428             }
1429         }
1430 
1431         /* Try the next fencing level if there is one (unless we're in the "on"
1432          * phase of a remapped "reboot", because we ignore errors in that case)
1433          */
1434     } while ((op->phase != st_phase_on)
1435              && pcmk_is_set(op->call_options, st_opt_topology)
1436              && (advance_topology_level(op, false) == pcmk_rc_ok));
1437 
1438     crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1439                op->action, op->target, (device? device : "any device"));
1440     return NULL;
1441 }
1442 
1443 static int
1444 get_device_timeout(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
1445                    const peer_device_info_t *peer, const char *device)
1446 {
1447     device_properties_t *props;
1448 
1449     if (!peer || !device) {
1450         return op->base_timeout;
1451     }
1452 
1453     props = g_hash_table_lookup(peer->devices, device);
1454     if (!props) {
1455         return op->base_timeout;
1456     }
1457 
1458     return (props->custom_action_timeout[op->phase]?
1459            props->custom_action_timeout[op->phase] : op->base_timeout)
1460            + props->delay_max[op->phase];
1461 }
1462 
/* User data handed to add_device_timeout() while get_peer_timeout() iterates
 * over a peer's device table
 */
struct timeout_data {
    const remote_fencing_op_t *op;      // Operation the timeout is computed for
    const peer_device_info_t *peer;     // Peer whose devices are being visited
    int total_timeout;                  // Running sum of device timeouts
};
1468 
1469 /*!
1470  * \internal
1471  * \brief Add timeout to a total if device has not been executed yet
1472  *
1473  * \param[in]     key        GHashTable key (device ID)
1474  * \param[in]     value      GHashTable value (device properties)
1475  * \param[in,out] user_data  Timeout data
1476  */
1477 static void
1478 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1479 {
1480     const char *device_id = key;
1481     device_properties_t *props = value;
1482     struct timeout_data *timeout = user_data;
1483 
1484     if (!props->executed[timeout->op->phase]
1485         && !props->disallowed[timeout->op->phase]) {
1486         timeout->total_timeout += get_device_timeout(timeout->op,
1487                                                      timeout->peer, device_id);
1488     }
1489 }
1490 
1491 static int
1492 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1493 {
1494     struct timeout_data timeout;
1495 
1496     timeout.op = op;
1497     timeout.peer = peer;
1498     timeout.total_timeout = 0;
1499 
1500     g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1501 
1502     return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1503 }
1504 
1505 static int
1506 get_op_total_timeout(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
1507                      const peer_device_info_t *chosen_peer)
1508 {
1509     int total_timeout = 0;
1510     stonith_topology_t *tp = find_topology_for_host(op->target);
1511 
1512     if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1513         int i;
1514         GList *device_list = NULL;
1515         GList *iter = NULL;
1516         GList *auto_list = NULL;
1517 
1518         if (pcmk__str_eq(op->action, "on", pcmk__str_none)
1519             && (op->automatic_list != NULL)) {
1520             auto_list = g_list_copy(op->automatic_list);
1521         }
1522 
1523         /* Yep, this looks scary, nested loops all over the place.
1524          * Here is what is going on.
1525          * Loop1: Iterate through fencing levels.
1526          * Loop2: If a fencing level has devices, loop through each device
1527          * Loop3: For each device in a fencing level, see what peer owns it
1528          *        and what that peer has reported the timeout is for the device.
1529          */
1530         for (i = 0; i < ST_LEVEL_MAX; i++) {
1531             if (!tp->levels[i]) {
1532                 continue;
1533             }
1534             for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1535                 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1536                     const peer_device_info_t *peer = iter->data;
1537 
1538                     if (auto_list) {
1539                         GList *match = g_list_find_custom(auto_list, device_list->data,
1540                                         sort_strings);
1541                         if (match) {
1542                             auto_list = g_list_remove(auto_list, match->data);
1543                         }
1544                     }
1545 
1546                     if (find_peer_device(op, peer, device_list->data,
1547                                          fenced_support_flag(op->action))) {
1548                         total_timeout += get_device_timeout(op, peer,
1549                                                             device_list->data);
1550                         break;
1551                     }
1552                 }               /* End Loop3: match device with peer that owns device, find device's timeout period */
1553             }                   /* End Loop2: iterate through devices at a specific level */
1554         }                       /*End Loop1: iterate through fencing levels */
1555 
1556         //Add only exists automatic_list device timeout
1557         if (auto_list) {
1558             for (iter = auto_list; iter != NULL; iter = iter->next) {
1559                 GList *iter2 = NULL;
1560 
1561                 for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) {
1562                     peer_device_info_t *peer = iter2->data;
1563                     if (find_peer_device(op, peer, iter->data, st_device_supports_on)) {
1564                         total_timeout += get_device_timeout(op, peer, iter->data);
1565                         break;
1566                     }
1567                 }
1568             }
1569         }
1570 
1571         g_list_free(auto_list);
1572 
1573     } else if (chosen_peer) {
1574         total_timeout = get_peer_timeout(op, chosen_peer);
1575     } else {
1576         total_timeout = op->base_timeout;
1577     }
1578 
1579     return total_timeout ? total_timeout : op->base_timeout;
1580 }
1581 
1582 static void
1583 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1584 {
1585     GList *iter = NULL;
1586     xmlNode *update = NULL;
1587     const char *client_node = NULL;
1588     const char *client_id = NULL;
1589     const char *call_id = NULL;
1590 
1591     if (op->call_options & st_opt_sync_call) {
1592         /* There is no reason to report the timeout for a synchronous call. It
1593          * is impossible to use the reported timeout to do anything when the client
1594          * is blocking for the response.  This update is only important for
1595          * async calls that require a callback to report the results in. */
1596         return;
1597     } else if (!op->request) {
1598         return;
1599     }
1600 
1601     crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1602     client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1603     call_id = crm_element_value(op->request, F_STONITH_CALLID);
1604     client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1605     if (!client_node || !call_id || !client_id) {
1606         return;
1607     }
1608 
1609     if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1610         // Client is connected to this node, so send update directly to them
1611         do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1612         return;
1613     }
1614 
1615     /* The client is connected to another node, relay this update to them */
1616     update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1617     crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1618     crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1619     crm_xml_add(update, F_STONITH_CALLID, call_id);
1620     crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1621 
1622     send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1623 
1624     free_xml(update);
1625 
1626     for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1627         remote_fencing_op_t *dup = iter->data;
1628 
1629         crm_trace("Reporting timeout for duplicate %.8s to client %s",
1630                   dup->id, dup->client_name);
1631         report_timeout_period(iter->data, op_timeout);
1632     }
1633 }
1634 
1635 /*!
1636  * \internal
1637  * \brief Advance an operation to the next device in its topology
1638  *
1639  * \param[in,out] op      Fencer operation to advance
1640  * \param[in]     device  ID of device that just completed
1641  * \param[in,out] msg     If not NULL, XML reply of last delegated operation
1642  */
1643 static void
1644 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
     /* [previous][next][first][last][top][bottom][index][help] */
1645                                  xmlNode *msg)
1646 {
1647     /* Advance to the next device at this topology level, if any */
1648     if (op->devices) {
1649         op->devices = op->devices->next;
1650     }
1651 
1652     /* Handle automatic unfencing if an "on" action was requested */
1653     if ((op->phase == st_phase_requested)
1654         && pcmk__str_eq(op->action, "on", pcmk__str_none)) {
1655         /* If the device we just executed was required, it's not anymore */
1656         remove_required_device(op, device);
1657 
1658         /* If there are no more devices at this topology level, run through any
1659          * remaining devices with automatic unfencing
1660          */
1661         if (op->devices == NULL) {
1662             op->devices = op->automatic_list;
1663         }
1664     }
1665 
1666     if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1667         /* We're done with this level and with required devices, but we had
1668          * remapped "reboot" to "off", so start over with "on". If any devices
1669          * need to be turned back on, op->devices will be non-NULL after this.
1670          */
1671         op_phase_on(op);
1672     }
1673 
1674     // This function is only called if the previous device succeeded
1675     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1676 
1677     if (op->devices) {
1678         /* Necessary devices remain, so execute the next one */
1679         crm_trace("Next targeting %s on behalf of %s@%s",
1680                   op->target, op->client_name, op->originator);
1681 
1682         // The requested delay has been applied for the first device
1683         if (op->delay > 0) {
1684             op->delay = 0;
1685         }
1686 
1687         request_peer_fencing(op, NULL);
1688     } else {
1689         /* We're done with all devices and phases, so finalize operation */
1690         crm_trace("Marking complex fencing op targeting %s as complete",
1691                   op->target);
1692         op->state = st_done;
1693         finalize_op(op, msg, false);
1694     }
1695 }
1696 
1697 static gboolean
1698 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
1699 {
1700     if (node_does_watchdog_fencing(op->target)) {
1701 
1702         crm_notice("Waiting %lds for %s to self-fence (%s) for "
1703                    "client %s " CRM_XS " id=%.8s",
1704                    (stonith_watchdog_timeout_ms / 1000),
1705                    op->target, op->action, op->client_name, op->id);
1706         op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
1707                                          remote_op_watchdog_done, op);
1708         return TRUE;
1709     } else {
1710         crm_debug("Skipping fallback to watchdog-fencing as %s is "
1711                  "not in host-list", op->target);
1712     }
1713     return FALSE;
1714 }
1715 
1716 /*!
1717  * \internal
1718  * \brief Ask a peer to execute a fencing operation
1719  *
1720  * \param[in,out] op      Fencing operation to be executed
1721  * \param[in,out] peer    If NULL or topology is in use, choose best peer to
1722  *                        execute the fencing, otherwise use this peer
1723  */
1724 static void
1725 request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1726 {
1727     const char *device = NULL;
1728     int timeout;
1729 
1730     CRM_CHECK(op != NULL, return);
1731 
1732     crm_trace("Action %.8s targeting %s for %s is %s",
1733               op->id, op->target, op->client_name,
1734               stonith_op_state_str(op->state));
1735 
1736     if ((op->phase == st_phase_on) && (op->devices != NULL)) {
1737         /* We are in the "on" phase of a remapped topology reboot. If this
1738          * device has pcmk_reboot_action="off", or doesn't support the "on"
1739          * action, skip it.
1740          *
1741          * We can't check device properties at this point because we haven't
1742          * chosen a peer for this stage yet. Instead, we check the local node's
1743          * knowledge about the device. If different versions of the fence agent
1744          * are installed on different nodes, there's a chance this could be
1745          * mistaken, but the worst that could happen is we don't try turning the
1746          * node back on when we should.
1747          */
1748         device = op->devices->data;
1749         if (pcmk__str_eq(fenced_device_reboot_action(device), "off",
1750                          pcmk__str_none)) {
1751             crm_info("Not turning %s back on using %s because the device is "
1752                      "configured to stay off (pcmk_reboot_action='off')",
1753                      op->target, device);
1754             advance_topology_device_in_level(op, device, NULL);
1755             return;
1756         }
1757         if (!fenced_device_supports_on(device)) {
1758             crm_info("Not turning %s back on using %s because the agent "
1759                      "doesn't support 'on'", op->target, device);
1760             advance_topology_device_in_level(op, device, NULL);
1761             return;
1762         }
1763     }
1764 
1765     timeout = op->base_timeout;
1766     if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1767         peer = stonith_choose_peer(op);
1768     }
1769 
1770     if (!op->op_timer_total) {
1771         op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
1772         op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1773         report_timeout_period(op, op->total_timeout);
1774         crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
1775                  CRM_XS "id=%.8s",
1776                  op->total_timeout, op->target, op->client_name, op->id);
1777     }
1778 
1779     if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1780         /* Ignore the caller's peer preference if topology is in use, because
1781          * that peer might not have access to the required device. With
1782          * topology, stonith_choose_peer() removes the device from further
1783          * consideration, so the timeout must be calculated beforehand.
1784          *
1785          * @TODO Basing the total timeout on the caller's preferred peer (above)
1786          *       is less than ideal.
1787          */
1788         peer = stonith_choose_peer(op);
1789 
1790         device = op->devices->data;
1791         timeout = get_device_timeout(op, peer, device);
1792     }
1793 
1794     if (peer) {
1795         int timeout_one = 0;
1796         xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1797 
1798         crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
1799         crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
1800         crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
1801         crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
1802         crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
1803         crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
1804         crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
1805         crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
1806         crm_xml_add_int(remote_op, F_STONITH_DELAY, op->delay);
1807 
1808         if (device) {
1809             timeout_one = TIMEOUT_MULTIPLY_FACTOR *
1810                           get_device_timeout(op, peer, device);
1811             crm_notice("Requesting that %s perform '%s' action targeting %s "
1812                        "using %s " CRM_XS " for client %s (%ds)",
1813                        peer->host, op->action, op->target, device,
1814                        op->client_name, timeout_one);
1815             crm_xml_add(remote_op, F_STONITH_DEVICE, device);
1816 
1817         } else {
1818             timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1819             crm_notice("Requesting that %s perform '%s' action targeting %s "
1820                        CRM_XS " for client %s (%ds, %lds)",
1821                        peer->host, op->action, op->target, op->client_name,
1822                        timeout_one, stonith_watchdog_timeout_ms);
1823         }
1824 
1825         op->state = st_exec;
1826         if (op->op_timer_one) {
1827             g_source_remove(op->op_timer_one);
1828         }
1829 
1830         if (!((stonith_watchdog_timeout_ms > 0)
1831               && (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
1832                   || (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
1833                       && !pcmk__str_eq(op->action, "on", pcmk__str_none)))
1834               && check_watchdog_fencing_and_wait(op))) {
1835 
1836             /* Some thoughts about self-fencing cases reaching this point:
1837                - Actually check in check_watchdog_fencing_and_wait
1838                  shouldn't fail if STONITH_WATCHDOG_ID is
1839                  chosen as fencing-device and it being present implies
1840                  watchdog-fencing is enabled anyway
1841                - If watchdog-fencing is disabled either in general or for
1842                  a specific target - detected in check_watchdog_fencing_and_wait -
1843                  for some other kind of self-fencing we can't expect
1844                  a success answer but timeout is fine if the node doesn't
1845                  come back in between
1846                - Delicate might be the case where we have watchdog-fencing
1847                  enabled for a node but the watchdog-fencing-device isn't
1848                  explicitly chosen for suicide. Local pe-execution in sbd
1849                  may detect the node as unclean and lead to timely suicide.
1850                  Otherwise the selection of stonith-watchdog-timeout at
1851                  least is questionable.
1852              */
1853             op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1854         }
1855 
1856         send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
1857         peer->tried = TRUE;
1858         free_xml(remote_op);
1859         return;
1860 
1861     } else if (op->phase == st_phase_on) {
1862         /* A remapped "on" cannot be executed, but the node was already
1863          * turned off successfully, so ignore the error and continue.
1864          */
1865         crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
1866                  "after successful 'off'", device, op->target);
1867         advance_topology_device_in_level(op, device, NULL);
1868         return;
1869 
1870     } else if (op->owner == FALSE) {
1871         crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
1872                 op->action, op->target, op->client_name);
1873 
1874     } else if (op->query_timer == 0) {
1875         /* We've exhausted all available peers */
1876         crm_info("No remaining peers capable of fencing (%s) %s for client %s "
1877                  CRM_XS " state=%s", op->action, op->target, op->client_name,
1878                  stonith_op_state_str(op->state));
1879         CRM_CHECK(op->state < st_done, return);
1880         finalize_timed_out_op(op, "All nodes failed, or are unable, to "
1881                                   "fence target");
1882 
1883     } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
1884         /* if the operation never left the query state,
1885          * but we have all the expected replies, then no devices
1886          * are available to execute the fencing operation. */
1887 
1888         if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device,
1889            STONITH_WATCHDOG_ID, pcmk__str_null_matches)) {
1890             if (check_watchdog_fencing_and_wait(op)) {
1891                 return;
1892             }
1893         }
1894 
1895         if (op->state == st_query) {
1896             crm_info("No peers (out of %d) have devices capable of fencing "
1897                      "(%s) %s for client %s " CRM_XS " state=%s",
1898                      op->replies, op->action, op->target, op->client_name,
1899                      stonith_op_state_str(op->state));
1900 
1901             pcmk__reset_result(&op->result);
1902             pcmk__set_result(&op->result, CRM_EX_ERROR,
1903                              PCMK_EXEC_NO_FENCE_DEVICE, NULL);
1904         } else {
1905             if (pcmk_is_set(op->call_options, st_opt_topology)) {
1906                 pcmk__reset_result(&op->result);
1907                 pcmk__set_result(&op->result, CRM_EX_ERROR,
1908                                  PCMK_EXEC_NO_FENCE_DEVICE, NULL);
1909             }
1910             /* ... else use existing result from previous failed attempt
1911              * (topology is not in use, and no devices remain to be attempted).
1912              * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would
1913              * prevent finalize_op() from setting the correct delegate if
1914              * needed.
1915              */
1916 
1917             crm_info("No peers (out of %d) are capable of fencing (%s) %s "
1918                      "for client %s " CRM_XS " state=%s",
1919                      op->replies, op->action, op->target, op->client_name,
1920                      stonith_op_state_str(op->state));
1921         }
1922 
1923         op->state = st_failed;
1924         finalize_op(op, NULL, false);
1925 
1926     } else {
1927         crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
1928                  "for client %s " CRM_XS " id=%.8s",
1929                  op->action, op->target, (device? " using " : ""),
1930                  (device? device : ""), op->client_name, op->id);
1931     }
1932 }
1933 
1934 /*!
1935  * \internal
1936  * \brief Comparison function for sorting query results
1937  *
1938  * \param[in] a  GList item to compare
1939  * \param[in] b  GList item to compare
1940  *
1941  * \return Per the glib documentation, "a negative integer if the first value
1942  *         comes before the second, 0 if they are equal, or a positive integer
1943  *         if the first value comes after the second."
1944  */
1945 static gint
1946 sort_peers(gconstpointer a, gconstpointer b)
     /* [previous][next][first][last][top][bottom][index][help] */
1947 {
1948     const peer_device_info_t *peer_a = a;
1949     const peer_device_info_t *peer_b = b;
1950 
1951     return (peer_b->ndevices - peer_a->ndevices);
1952 }
1953 
1954 /*!
1955  * \internal
1956  * \brief Determine if all the devices in the topology are found or not
1957  *
1958  * \param[in] op  Fencing operation with topology to check
1959  */
1960 static gboolean
1961 all_topology_devices_found(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
1962 {
1963     GList *device = NULL;
1964     GList *iter = NULL;
1965     device_properties_t *match = NULL;
1966     stonith_topology_t *tp = NULL;
1967     gboolean skip_target = FALSE;
1968     int i;
1969 
1970     tp = find_topology_for_host(op->target);
1971     if (!tp) {
1972         return FALSE;
1973     }
1974     if (pcmk__is_fencing_action(op->action)) {
1975         /* Don't count the devices on the target node if we are killing
1976          * the target node. */
1977         skip_target = TRUE;
1978     }
1979 
1980     for (i = 0; i < ST_LEVEL_MAX; i++) {
1981         for (device = tp->levels[i]; device; device = device->next) {
1982             match = NULL;
1983             for (iter = op->query_results; iter && !match; iter = iter->next) {
1984                 peer_device_info_t *peer = iter->data;
1985 
1986                 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1987                     continue;
1988                 }
1989                 match = find_peer_device(op, peer, device->data, st_device_supports_none);
1990             }
1991             if (!match) {
1992                 return FALSE;
1993             }
1994         }
1995     }
1996 
1997     return TRUE;
1998 }
1999 
2000 /*!
2001  * \internal
2002  * \brief Parse action-specific device properties from XML
2003  *
2004  * \param[in]     xml     XML element containing the properties
2005  * \param[in]     peer    Name of peer that sent XML (for logs)
2006  * \param[in]     device  Device ID (for logs)
2007  * \param[in]     action  Action the properties relate to (for logs)
2008  * \param[in,out] op      Fencing operation that properties are being parsed for
2009  * \param[in]     phase   Phase the properties relate to
2010  * \param[in,out] props   Device properties to update
2011  */
2012 static void
2013 parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
     /* [previous][next][first][last][top][bottom][index][help] */
2014                       const char *action, remote_fencing_op_t *op,
2015                       enum st_remap_phase phase, device_properties_t *props)
2016 {
2017     props->custom_action_timeout[phase] = 0;
2018     crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
2019                           &props->custom_action_timeout[phase]);
2020     if (props->custom_action_timeout[phase]) {
2021         crm_trace("Peer %s with device %s returned %s action timeout %d",
2022                   peer, device, action, props->custom_action_timeout[phase]);
2023     }
2024 
2025     props->delay_max[phase] = 0;
2026     crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
2027     if (props->delay_max[phase]) {
2028         crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
2029                   peer, device, props->delay_max[phase], action);
2030     }
2031 
2032     props->delay_base[phase] = 0;
2033     crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
2034     if (props->delay_base[phase]) {
2035         crm_trace("Peer %s with device %s returned base delay %d for %s",
2036                   peer, device, props->delay_base[phase], action);
2037     }
2038 
2039     /* Handle devices with automatic unfencing */
2040     if (pcmk__str_eq(action, "on", pcmk__str_none)) {
2041         int required = 0;
2042 
2043         crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
2044         if (required) {
2045             crm_trace("Peer %s requires device %s to execute for action %s",
2046                       peer, device, action);
2047             add_required_device(op, device);
2048         }
2049     }
2050 
2051     /* If a reboot is remapped to off+on, it's possible that a node is allowed
2052      * to perform one action but not another.
2053      */
2054     if (pcmk__xe_attr_is_true(xml, F_STONITH_ACTION_DISALLOWED)) {
2055         props->disallowed[phase] = TRUE;
2056         crm_trace("Peer %s is disallowed from executing %s for device %s",
2057                   peer, action, device);
2058     }
2059 }
2060 
2061 /*!
2062  * \internal
2063  * \brief Parse one device's properties from peer's XML query reply
2064  *
2065  * \param[in]     xml       XML node containing device properties
2066  * \param[in,out] op        Operation that query and reply relate to
2067  * \param[in,out] peer      Peer's device information
2068  * \param[in]     device    ID of device being parsed
2069  */
2070 static void
2071 add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
2072                       peer_device_info_t *peer, const char *device)
2073 {
2074     xmlNode *child;
2075     int verified = 0;
2076     device_properties_t *props = calloc(1, sizeof(device_properties_t));
2077     int flags = st_device_supports_on; /* Old nodes that don't set the flag assume they support the on action */
2078 
2079     /* Add a new entry to this peer's devices list */
2080     CRM_ASSERT(props != NULL);
2081     g_hash_table_insert(peer->devices, strdup(device), props);
2082 
2083     /* Peers with verified (monitored) access will be preferred */
2084     crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
2085     if (verified) {
2086         crm_trace("Peer %s has confirmed a verified device %s",
2087                   peer->host, device);
2088         props->verified = TRUE;
2089     }
2090 
2091     crm_element_value_int(xml, F_STONITH_DEVICE_SUPPORT_FLAGS, &flags);
2092     props->device_support_flags = flags;
2093 
2094     /* Parse action-specific device properties */
2095     parse_action_specific(xml, peer->host, device, op_requested_action(op),
2096                           op, st_phase_requested, props);
2097     for (child = pcmk__xml_first_child(xml); child != NULL;
2098          child = pcmk__xml_next(child)) {
2099         /* Replies for "reboot" operations will include the action-specific
2100          * values for "off" and "on" in child elements, just in case the reboot
2101          * winds up getting remapped.
2102          */
2103         if (pcmk__str_eq(ID(child), "off", pcmk__str_none)) {
2104             parse_action_specific(child, peer->host, device, "off",
2105                                   op, st_phase_off, props);
2106         } else if (pcmk__str_eq(ID(child), "on", pcmk__str_none)) {
2107             parse_action_specific(child, peer->host, device, "on",
2108                                   op, st_phase_on, props);
2109         }
2110     }
2111 }
2112 
2113 /*!
2114  * \internal
2115  * \brief Parse a peer's XML query reply and add it to operation's results
2116  *
2117  * \param[in,out] op        Operation that query and reply relate to
2118  * \param[in]     host      Name of peer that sent this reply
2119  * \param[in]     ndevices  Number of devices expected in reply
2120  * \param[in]     xml       XML node containing device list
2121  *
2122  * \return Newly allocated result structure with parsed reply
2123  */
2124 static peer_device_info_t *
2125 add_result(remote_fencing_op_t *op, const char *host, int ndevices,
     /* [previous][next][first][last][top][bottom][index][help] */
2126            const xmlNode *xml)
2127 {
2128     peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t));
2129     xmlNode *child;
2130 
2131     // cppcheck seems not to understand the abort logic in CRM_CHECK
2132     // cppcheck-suppress memleak
2133     CRM_CHECK(peer != NULL, return NULL);
2134     peer->host = strdup(host);
2135     peer->devices = pcmk__strkey_table(free, free);
2136 
2137     /* Each child element describes one capable device available to the peer */
2138     for (child = pcmk__xml_first_child(xml); child != NULL;
2139          child = pcmk__xml_next(child)) {
2140         const char *device = ID(child);
2141 
2142         if (device) {
2143             add_device_properties(child, op, peer, device);
2144         }
2145     }
2146 
2147     peer->ndevices = g_hash_table_size(peer->devices);
2148     CRM_CHECK(ndevices == peer->ndevices,
2149               crm_err("Query claimed to have %d device%s but %d found",
2150                       ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2151 
2152     op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2153     return peer;
2154 }
2155 
2156 /*!
2157  * \internal
2158  * \brief Handle a peer's reply to our fencing query
2159  *
2160  * Parse a query result from XML and store it in the remote operation
2161  * table, and when enough replies have been received, issue a fencing request.
2162  *
2163  * \param[in] msg  XML reply received
2164  *
2165  * \return pcmk_ok on success, -errno on error
2166  *
2167  * \note See initiate_remote_stonith_op() for how the XML query was initially
2168  *       formed, and stonith_query() for how the peer formed its XML reply.
2169  */
2170 int
2171 process_remote_stonith_query(xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
2172 {
2173     int ndevices = 0;
2174     gboolean host_is_target = FALSE;
2175     gboolean have_all_replies = FALSE;
2176     const char *id = NULL;
2177     const char *host = NULL;
2178     remote_fencing_op_t *op = NULL;
2179     peer_device_info_t *peer = NULL;
2180     uint32_t replies_expected;
2181     xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2182 
2183     CRM_CHECK(dev != NULL, return -EPROTO);
2184 
2185     id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2186     CRM_CHECK(id != NULL, return -EPROTO);
2187 
2188     dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
2189     CRM_CHECK(dev != NULL, return -EPROTO);
2190     crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
2191 
2192     op = g_hash_table_lookup(stonith_remote_op_list, id);
2193     if (op == NULL) {
2194         crm_debug("Received query reply for unknown or expired operation %s",
2195                   id);
2196         return -EOPNOTSUPP;
2197     }
2198 
2199     replies_expected = fencing_active_peers();
2200     if (op->replies_expected < replies_expected) {
2201         replies_expected = op->replies_expected;
2202     }
2203     if ((++op->replies >= replies_expected) && (op->state == st_query)) {
2204         have_all_replies = TRUE;
2205     }
2206     host = crm_element_value(msg, F_ORIG);
2207     host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
2208 
2209     crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
2210              op->replies, replies_expected, host,
2211              op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
2212     if (ndevices > 0) {
2213         peer = add_result(op, host, ndevices, dev);
2214     }
2215 
2216     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2217 
2218     if (pcmk_is_set(op->call_options, st_opt_topology)) {
2219         /* If we start the fencing before all the topology results are in,
2220          * it is possible fencing levels will be skipped because of the missing
2221          * query results. */
2222         if (op->state == st_query && all_topology_devices_found(op)) {
2223             /* All the query results are in for the topology, start the fencing ops. */
2224             crm_trace("All topology devices found");
2225             request_peer_fencing(op, peer);
2226 
2227         } else if (have_all_replies) {
2228             crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
2229                      replies_expected, op->replies);
2230             request_peer_fencing(op, NULL);
2231         }
2232 
2233     } else if (op->state == st_query) {
2234         int nverified = count_peer_devices(op, peer, TRUE,
2235                                            fenced_support_flag(op->action));
2236 
2237         /* We have a result for a non-topology fencing op that looks promising,
2238          * go ahead and start fencing before query timeout */
2239         if ((peer != NULL) && !host_is_target && nverified) {
2240             /* we have a verified device living on a peer that is not the target */
2241             crm_trace("Found %d verified device%s",
2242                       nverified, pcmk__plural_s(nverified));
2243             request_peer_fencing(op, peer);
2244 
2245         } else if (have_all_replies) {
2246             crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
2247                      replies_expected, op->replies);
2248             request_peer_fencing(op, NULL);
2249 
2250         } else {
2251             crm_trace("Waiting for more peer results before launching fencing operation");
2252         }
2253 
2254     } else if ((peer != NULL) && (op->state == st_done)) {
2255         crm_info("Discarding query result from %s (%d device%s): "
2256                  "Operation is %s", peer->host,
2257                  peer->ndevices, pcmk__plural_s(peer->ndevices),
2258                  stonith_op_state_str(op->state));
2259     }
2260 
2261     return pcmk_ok;
2262 }
2263 
2264 /*!
2265  * \internal
2266  * \brief Handle a peer's reply to a fencing request
2267  *
2268  * Parse a fencing reply from XML, and either finalize the operation
2269  * or attempt another device as appropriate.
2270  *
2271  * \param[in] msg  XML reply received
2272  */
2273 void
2274 fenced_process_fencing_reply(xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
2275 {
2276     const char *id = NULL;
2277     const char *device = NULL;
2278     remote_fencing_op_t *op = NULL;
2279     xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2280     pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2281 
2282     CRM_CHECK(dev != NULL, return);
2283 
2284     id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2285     CRM_CHECK(id != NULL, return);
2286 
2287     dev = stonith__find_xe_with_result(msg);
2288     CRM_CHECK(dev != NULL, return);
2289 
2290     stonith__xe_get_result(dev, &result);
2291 
2292     device = crm_element_value(dev, F_STONITH_DEVICE);
2293 
2294     if (stonith_remote_op_list) {
2295         op = g_hash_table_lookup(stonith_remote_op_list, id);
2296     }
2297 
2298     if ((op == NULL) && pcmk__result_ok(&result)) {
2299         /* Record successful fencing operations */
2300         const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
2301 
2302         op = create_remote_stonith_op(client_id, dev, TRUE);
2303     }
2304 
2305     if (op == NULL) {
2306         /* Could be for an event that began before we started */
2307         /* TODO: Record the op for later querying */
2308         crm_info("Received peer result of unknown or expired operation %s", id);
2309         pcmk__reset_result(&result);
2310         return;
2311     }
2312 
2313     pcmk__reset_result(&op->result);
2314     op->result = result; // The operation takes ownership of the result
2315 
2316     if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2317         crm_err("Received outdated reply for device %s (instead of %s) to "
2318                 "fence (%s) %s. Operation already timed out at peer level.",
2319                 device, (const char *) op->devices->data, op->action, op->target);
2320         return;
2321     }
2322 
2323     if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
2324         if (pcmk__result_ok(&op->result)) {
2325             op->state = st_done;
2326         } else {
2327             op->state = st_failed;
2328         }
2329         finalize_op(op, msg, false);
2330         return;
2331 
2332     } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2333         /* If this isn't a remote level broadcast, and we are not the
2334          * originator of the operation, we should not be receiving this msg. */
2335         crm_err("Received non-broadcast fencing result for operation %.8s "
2336                 "we do not own (device %s targeting %s)",
2337                 op->id, device, op->target);
2338         return;
2339     }
2340 
2341     if (pcmk_is_set(op->call_options, st_opt_topology)) {
2342         const char *device = NULL;
2343         const char *reason = op->result.exit_reason;
2344 
2345         /* We own the op, and it is complete. broadcast the result to all nodes
2346          * and notify our local clients. */
2347         if (op->state == st_done) {
2348             finalize_op(op, msg, false);
2349             return;
2350         }
2351 
2352         device = crm_element_value(msg, F_STONITH_DEVICE);
2353 
2354         if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2355             /* A remapped "on" failed, but the node was already turned off
2356              * successfully, so ignore the error and continue.
2357              */
2358             crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2359                      "after successful 'off'",
2360                      device, pcmk_exec_status_str(op->result.execution_status),
2361                      (reason == NULL)? "" : ": ",
2362                      (reason == NULL)? "" : reason,
2363                      op->target);
2364             pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2365         } else {
2366             crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
2367                        "%s%s%s%s",
2368                        op->action, op->target,
2369                        ((device == NULL)? "" : " using "),
2370                        ((device == NULL)? "" : device),
2371                        op->client_name,
2372                        op->originator,
2373                        pcmk_exec_status_str(op->result.execution_status),
2374                        (reason == NULL)? "" : " (",
2375                        (reason == NULL)? "" : reason,
2376                        (reason == NULL)? "" : ")");
2377         }
2378 
2379         if (pcmk__result_ok(&op->result)) {
2380             /* An operation completed successfully. Try another device if
2381              * necessary, otherwise mark the operation as done. */
2382             advance_topology_device_in_level(op, device, msg);
2383             return;
2384         } else {
2385             /* This device failed, time to try another topology level. If no other
2386              * levels are available, mark this operation as failed and report results. */
2387             if (advance_topology_level(op, false) != pcmk_rc_ok) {
2388                 op->state = st_failed;
2389                 finalize_op(op, msg, false);
2390                 return;
2391             }
2392         }
2393 
2394     } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2395         op->state = st_done;
2396         finalize_op(op, msg, false);
2397         return;
2398 
2399     } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2400                && (op->devices == NULL)) {
2401         /* If the operation timed out don't bother retrying other peers. */
2402         op->state = st_failed;
2403         finalize_op(op, msg, false);
2404         return;
2405 
2406     } else {
2407         /* fall-through and attempt other fencing action using another peer */
2408     }
2409 
2410     /* Retry on failure */
2411     crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2412               op->target, op->originator, op->client_name,
2413               pcmk_exec_status_str(op->result.execution_status));
2414     request_peer_fencing(op, NULL);
2415 }
2416 
2417 gboolean
2418 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
2419 {
2420     GHashTableIter iter;
2421     time_t now = time(NULL);
2422     remote_fencing_op_t *rop = NULL;
2423 
2424     if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2425         action == NULL) {
2426         return FALSE;
2427     }
2428 
2429     g_hash_table_iter_init(&iter, stonith_remote_op_list);
2430     while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2431         if (strcmp(rop->target, target) != 0) {
2432             continue;
2433         } else if (rop->state != st_done) {
2434             continue;
2435         /* We don't have to worry about remapped reboots here
2436          * because if state is done, any remapping has been undone
2437          */
2438         } else if (strcmp(rop->action, action) != 0) {
2439             continue;
2440         } else if ((rop->completed + tolerance) < now) {
2441             continue;
2442         }
2443 
2444         crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2445                    target, action, tolerance, rop->delegate, rop->originator);
2446         return TRUE;
2447     }
2448     return FALSE;
2449 }

/* [previous][next][first][last][top][bottom][index][help] */