diff --git a/Makefile b/Makefile
index 258e5a111..ddf29f0cd 100644
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,7 @@
 #   USE_WURFL            : enable WURFL detection library from Scientiamobile
 #   USE_SYSTEMD          : enable sd_notify() support.
 #   USE_OBSOLETE_LINKER  : use when the linker fails to emit __start_init/__stop_init
+#   USE_THREAD_DUMP      : use the more advanced thread state dump system. Automatic.
 #
 # Options can be forced by specifying "USE_xxx=1" or can be disabled by using
 # "USE_xxx=" (empty string).
@@ -284,7 +285,7 @@ use_opts = USE_EPOLL USE_KQUEUE USE_MY_EPOLL USE_MY_SPLICE USE_NETFILTER      \
            USE_GETADDRINFO USE_OPENSSL USE_LUA USE_FUTEX USE_ACCEPT4          \
            USE_MY_ACCEPT4 USE_ZLIB USE_SLZ USE_CPU_AFFINITY USE_TFO USE_NS    \
            USE_DL USE_RT USE_DEVICEATLAS USE_51DEGREES USE_WURFL USE_SYSTEMD  \
-           USE_OBSOLETE_LINKER USE_PRCTL
+           USE_OBSOLETE_LINKER USE_PRCTL USE_THREAD_DUMP
 
 #### Target system options
 # Depending on the target platform, some options are set, as well as some
@@ -343,7 +344,7 @@ ifeq ($(TARGET),linux2628)
   set_target_defaults = $(call default_opts, \
     USE_POLL USE_TPROXY USE_LIBCRYPT USE_DL USE_RT USE_CRYPT_H USE_NETFILTER  \
     USE_CPU_AFFINITY USE_THREAD USE_EPOLL USE_FUTEX USE_LINUX_TPROXY          \
-    USE_ACCEPT4 USE_LINUX_SPLICE USE_PRCTL ASSUME_SPLICE_WORKS)
+    USE_ACCEPT4 USE_LINUX_SPLICE USE_PRCTL ASSUME_SPLICE_WORKS USE_THREAD_DUMP)
 endif
 
 # Solaris 8 and above
diff --git a/doc/management.txt b/doc/management.txt
index ecaf72941..06c42877b 100644
--- a/doc/management.txt
+++ b/doc/management.txt
@@ -2515,9 +2515,13 @@ show table <name> [ data.<type> <operator> <value> ] | [ key <key> ]
 show threads
   Dumps some internal states and structures for each thread, that may be useful
   to help developers understand a problem. The output tries to be readable by
-  showing one block per thread, with a bit more info for the current thread.
-  The output format is purposely not documented so that it can easily evolve
-  as new needs are identified, without having to maintain any backwards
+  showing one block per thread. When haproxy is built with USE_THREAD_DUMP=1,
+  an advanced dump mechanism involving thread signals is used so that each
+  thread can dump its own state in turn. Without this option, the thread
+  processing the command shows all its details but the other ones are less
+  detailed. A stat ('*') is displayed in front of the thread handling the
+  command. The output format is purposely not documented so that it can easily
+  evolve as new needs are identified, without having to maintain any backwards
   compatibility, and just like with "show activity", the values are only
   meaningful with the code at hand.
 
diff --git a/include/common/debug.h b/include/common/debug.h
index 3fb96c529..4f3baedb7 100644
--- a/include/common/debug.h
+++ b/include/common/debug.h
@@ -86,6 +86,7 @@ struct task;
 struct buffer;
 void ha_task_dump(struct buffer *buf, const struct task *task, const char *pfx);
 void ha_thread_dump(struct buffer *buf, int thr, int calling_tid);
+void ha_thread_dump_all_to_trash(void); /* dumps all threads' state to the trash */
 
 /* This one is useful to automatically apply poisonning on an area returned
  * by malloc(). Only "p_" is required to make it work, and to define a poison
diff --git a/src/debug.c b/src/debug.c
index 282985183..51ded9ed7 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -125,10 +125,7 @@ static int cli_io_handler_show_threads(struct appctx *appctx)
 		thr = 0;
 
 	chunk_reset(&trash);
-	while (thr < global.nbthread) {
-		ha_thread_dump(&trash, thr, tid);
-		thr++;
-	}
+	ha_thread_dump_all_to_trash();
 
 	if (ci_putchk(si_ic(si), &trash) == -1) {
 		/* failed, try again */
@@ -139,6 +136,128 @@ static int cli_io_handler_show_threads(struct appctx *appctx)
 	return 1;
 }
 
+#ifndef USE_THREAD_DUMP
+
+/* Basic flavour of the all-threads dump: the calling thread writes every
+ * thread's state into the trash by itself, without inspecting the others.
+ */
+void ha_thread_dump_all_to_trash()
+{
+	unsigned int cur = 0;
+
+	while (cur < global.nbthread)
+		ha_thread_dump(&trash, cur++, tid);
+}
+
+#else /* below USE_THREAD_DUMP is set */
+
+/* The signal to trigger a debug dump on a thread is SIGPWR */
+#define DEBUGSIG SIGPWR
+
+/* mask of threads (one tid_bit each) still having to dump, used to respect ordering; 0 when idle */
+static volatile unsigned long threads_to_dump;
+
+/* ID of the thread requesting the dump, handed to ha_thread_dump() as calling_tid */
+static unsigned int thread_dump_tid;
+
+/* points to the buffer where the dump functions should write. It must
+ * have already been initialized by the requester. Nothing is done if
+ * it's NULL. The last thread to dump resets it to NULL.
+ */
+struct buffer *thread_dump_buffer = NULL;
+
+/* Advanced flavour of the all-threads dump: DEBUGSIG is sent to every thread,
+ * the caller last, and each one dumps itself into the trash from the handler.
+ */
+void ha_thread_dump_all_to_trash()
+{
+	__maybe_unused unsigned int peer;
+	unsigned long expected;
+
+	/* take ownership of the dump: retry until the mask is seen idle (0) */
+	for (expected = 0; !HA_ATOMIC_CAS(&threads_to_dump, &expected, all_threads_mask); expected = 0)
+		ha_thread_relax();
+
+	thread_dump_buffer = &trash;
+	thread_dump_tid = tid;
+
+#ifdef USE_THREAD
+	/* notify every thread but the calling one */
+	for (peer = 0; peer < global.nbthread; peer++)
+		if (peer != tid)
+			pthread_kill(threads[peer], DEBUGSIG);
+#endif
+	/* our own signal comes last */
+	raise(DEBUGSIG);
+}
+
+/* handles DEBUGSIG to dump the state of the thread it's working on. <sig>, <si> and <arg> are unused. */
+void debug_handler(int sig, siginfo_t *si, void *arg)
+{
+	/* The dump proceeds in thread bit order through these phases:
+	 *   0- return at once if our bit is not set (spurious DEBUGSIG)
+	 *   1- wait for our turn, i.e. when all lower bits are gone.
+	 *   2- perform the action if a buffer was provided
+	 *   3- remove our bit to let the next one go, unless we're the
+	 *      last one and have to put them all back as a leaving signal
+	 *   4- wait for our bit to re-appear, then clear it and quit.
+	 */
+	if (!(threads_to_dump & tid_bit))
+		return;
+
+	/* wait for all previous threads to finish first */
+	while (threads_to_dump & (tid_bit - 1))
+		ha_thread_relax();
+
+	if (thread_dump_buffer)
+		ha_thread_dump(thread_dump_buffer, tid, thread_dump_tid);
+
+	if ((threads_to_dump & all_threads_mask) == tid_bit) {
+		/* last one: nobody reads the buffer anymore, detach it first */
+		thread_dump_buffer = NULL;
+		HA_ATOMIC_STORE(&threads_to_dump, all_threads_mask);
+	}
+	else
+		HA_ATOMIC_AND(&threads_to_dump, ~tid_bit);
+
+	/* now wait for all others to finish dumping. The last one sets all
+	 * bits again, ours included, to broadcast the leaving condition. This
+	 * way threads_to_dump never reads zero, hence no new dump can start
+	 * and reuse the buffer, before every thread has stopped waiting.
+	 */
+	while (!(threads_to_dump & tid_bit))
+		ha_thread_relax();
+	HA_ATOMIC_AND(&threads_to_dump, ~tid_bit);
+}
+
+static int init_debug_per_thread()
+{
+	sigset_t dbg_sigs;
+
+	/* build a set holding only DEBUGSIG and ask ha_sigmask() to unblock it */
+	sigemptyset(&dbg_sigs);
+	sigaddset(&dbg_sigs, DEBUGSIG);
+	ha_sigmask(SIG_UNBLOCK, &dbg_sigs, NULL);
+	return 1;
+}
+
+static int init_debug()
+{
+	struct sigaction act;
+
+	sigemptyset(&act.sa_mask);        /* no extra signal blocked while the handler runs */
+	act.sa_flags = SA_SIGINFO;        /* debug_handler() uses the 3-argument form */
+	act.sa_handler = NULL;
+	act.sa_sigaction = debug_handler;
+	sigaction(DEBUGSIG, &act, NULL);  /* the result is not checked */
+	return 0;
+}
+
+REGISTER_POST_CHECK(init_debug);                 /* init_debug() installs the DEBUGSIG handler */
+REGISTER_PER_THREAD_INIT(init_debug_per_thread); /* init_debug_per_thread() unblocks DEBUGSIG */
+
+#endif /* USE_THREAD_DUMP */
+
 /* register cli keywords */
 static struct cli_kw_list cli_kws = {{ },{
 	{ { "show", "threads", NULL },    "show threads   : show some threads debugging information",   NULL, cli_io_handler_show_threads, NULL },