
QEMU Exclusive Work

TB flush

void tb_flush(CPUState *cpu)
{
    if (tcg_enabled()) {
        unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count);

        if (cpu_in_exclusive_context(cpu)) {
            do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
        } else {
            async_safe_run_on_cpu(cpu, do_tb_flush,
                                  RUN_ON_CPU_HOST_INT(tb_flush_count));
        }
    }
}
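The tb_flush_count snapshot taken here is what keeps several concurrent flush requests from flushing more than once: do_tb_flush() only acts if the count still matches. An abridged sketch of that check (locking and statistics omitted, not the full function):

static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
{
    /* Another vCPU may already have flushed on our behalf; in that case
     * tb_ctx.tb_flush_count has moved on and this request is stale. */
    if (tb_ctx.tb_flush_count != tb_flush_count.host_int) {
        return;
    }
    /* ... invalidate all TBs, reset the code cache, then bump
     * tb_ctx.tb_flush_count so later stale requests are skipped ... */
}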
tb_flush()
-> async_safe_run_on_cpu()
   -> queue_work_on_cpu()
      -> qemu_cpu_kick()
         -> mttcg_kick_vcpu_thread()
            -> cpu_exit()
               -> qatomic_set(&cpu->icount_decr_ptr->u16.high, -1) // checked at the start of every TB

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (cpus_accel->kick_vcpu_thread) {
        cpus_accel->kick_vcpu_thread(cpu);
    } else { /* default */
        cpus_kick_thread(cpu);
    }
}

void mttcg_kick_vcpu_thread(CPUState *cpu)
{
    cpu_exit(cpu);
}

void cpu_exit(CPUState *cpu)
{
    qatomic_set(&cpu->exit_request, 1);
    /* Ensure cpu_exec will see the exit request after TCG has exited.  */
    smp_wmb();
    qatomic_set(&cpu->icount_decr_ptr->u16.high, -1);    
}
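Storing -1 into u16.high makes the whole 32-bit icount_decr value negative, and that is the flag checked inside each TB: the prologue emitted for every translated block performs roughly the test below before running guest instructions (illustrative sketch, not the actual generated code; exit_tb_to_cpu_exec() is a hypothetical stand-in for the generated exit branch):

/* Sketch of the per-TB prologue check -- not QEMU source. */
if ((int32_t)qatomic_read(&cpu->icount_decr_ptr->u32) < 0) {
    /* cpu_exit() set u16.high to -1, so the combined value is negative:
     * leave the translated code and return to cpu_exec(), where
     * exit_request and queued (safe) work are processed. */
    exit_tb_to_cpu_exec();   /* hypothetical stand-in */
}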

execution trace

cpu_exec()
-> cpu_exec_enter(cpu);
-> sigsetjmp()
-> while (!cpu_handle_exception())
|  -> while (!cpu_handle_interrupt())
|  |  -> tb = tb_find()
|  |  |  -> tb = tb_lookup()
|  |  |  -> if (tb == NULL)
|  |  |  |  -> tb = tb_gen_code()
|  |  |  |  |  -> tb = tcg_tb_alloc()
|  |  |  |  |  -> if (!tb) // no more space to alloc TB
|  |  |  |  |  |  -> tb_flush(cpu)
|  |  |  |  |  |  -> cpu->exception_index = EXCP_INTERRUPT
|  |  |  |  |  |  -> cpu_loop_exit(cpu) // siglongjmp()
|  |  |  |  -> set into jmp cache
|  |  |  -> tb_add_jump()
|  |  -> cpu_loop_exec_tb()
-> cpu_exec_exit()

execution graph

/* CPU exec */

( cpu_exec_start )  +-----> ( sigsetjmp ) <------------------------+
      1 |         2 |           3 | 8                            7 |
   ( cpu_exec ) ----+   9         |                                |
        | 10 <-------------( handle_excp )                         |
(  cpu_exec_end  )              4 |        5                       |
                             ( tb_flush ) --> ( cpu->excp_index )  |
                                                     6 |           |
                                              ( cpu_loop_exit  ) --+
/* mttcg thread */

( tcg_cpus_exec ) // start + exec + end
     loop 
( wait_io_event ) -------> ( start_exclusive )
                                    |
                                    v
                              ( wi->func )
                                    |
                                    v
                            ( end_exclusive )

start or end exclusive

/* Wait for pending exclusive operations to complete.  The CPU list lock
   must be held.  */
static inline void exclusive_idle(void)
{
    while (pending_cpus) {
        qemu_cond_wait(&exclusive_resume, &qemu_cpu_list_lock);
    } 
}

start exclusive

void start_exclusive(void)
{
    CPUState *other_cpu;
    int running_cpus;

    qemu_mutex_lock(&qemu_cpu_list_lock);
    exclusive_idle();

    /* Make all other cpus stop executing.  */
    qatomic_set(&pending_cpus, 1);

    /* Write pending_cpus before reading other_cpu->running.  */
    smp_mb();
    running_cpus = 0;
    CPU_FOREACH(other_cpu) {
        if (qatomic_read(&other_cpu->running)) {
            other_cpu->has_waiter = true;
            running_cpus++;
            qemu_cpu_kick(other_cpu);
        }
    }

    qatomic_set(&pending_cpus, running_cpus + 1);
    while (pending_cpus > 1) {
        qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock);
    }

    /* Can release mutex, no one will enter another exclusive
     * section until end_exclusive resets pending_cpus to 0.
     */
    qemu_mutex_unlock(&qemu_cpu_list_lock);

    current_cpu->in_exclusive_context = true;
}

end exclusive

/* Finish an exclusive operation.  */
void end_exclusive(void)
{
    current_cpu->in_exclusive_context = false;

    qemu_mutex_lock(&qemu_cpu_list_lock);
    qatomic_set(&pending_cpus, 0);
    qemu_cond_broadcast(&exclusive_resume);
    qemu_mutex_unlock(&qemu_cpu_list_lock);
}
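This pair is what brackets the safe work item shown in the mttcg thread graph above: once all counted vCPUs have left cpu_exec(), the queued work runs alone. An abridged sketch of how an exclusive work item is dispatched (based on process_queued_cpu_work(); work-list locking and cleanup omitted, next_queued_work() is a hypothetical helper):

struct qemu_work_item *wi;

while ((wi = next_queued_work(cpu)) != NULL) {   /* next_queued_work(): hypothetical helper */
    if (wi->exclusive) {
        /* safe work (e.g. do_tb_flush): every other vCPU must be stopped first */
        start_exclusive();
        wi->func(cpu, wi->data);
        end_exclusive();
    } else {
        wi->func(cpu, wi->data);
    }
}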

cpu-exec

cpu exec start

void cpu_exec_start(CPUState *cpu)
{
    qatomic_set(&cpu->running, true);
    
    /* Write cpu->running before reading pending_cpus.  */
    smp_mb();

    if (unlikely(qatomic_read(&pending_cpus))) {
        QEMU_LOCK_GUARD(&qemu_cpu_list_lock);
        if (!cpu->has_waiter) {
            /* Not counted in pending_cpus, let the exclusive item
             * run.  Since we have the lock, just set cpu->running to true
             * while holding it; no need to check pending_cpus again.
             */
            qatomic_set(&cpu->running, false);
            exclusive_idle();
            /* Now pending_cpus is zero.  */
            qatomic_set(&cpu->running, true);
        } else {
            /* Counted in pending_cpus, go ahead and release the
             * waiter at cpu_exec_end.
             */
        }
    }
}

cpu exec end

void cpu_exec_end(CPUState *cpu)
{
    qatomic_set(&cpu->running, false);

    /* Write cpu->running before reading pending_cpus.  */
    smp_mb();

    if (unlikely(qatomic_read(&pending_cpus))) {
        QEMU_LOCK_GUARD(&qemu_cpu_list_lock);
        if (cpu->has_waiter) {
            cpu->has_waiter = false;
            qatomic_set(&pending_cpus, pending_cpus - 1);
            if (pending_cpus == 1) {
                qemu_cond_signal(&exclusive_cond);
            }
        }
    }
}
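For reference, the "start + exec + end" box in the mttcg graph is just this wrapper around cpu_exec() (sketch; the exact function name differs across QEMU versions):

static int tcg_cpus_exec(CPUState *cpu)
{
    int ret;

    cpu_exec_start(cpu);   /* publish cpu->running so start_exclusive() can count us */
    ret = cpu_exec(cpu);   /* run translated code until an exit condition */
    cpu_exec_end(cpu);     /* clear cpu->running and release a waiting exclusive section */
    return ret;
}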

locks, conditions and certain variables

qemu cpu list lock

qemu_cpu_list_lock protects the CPU list, pending_cpus and cpu->has_waiter; exclusive_cond and exclusive_resume are both waited on with this mutex held

pending cpus

pending_cpus is 0 when no exclusive work is pending, becomes running_cpus + 1 while start_exclusive() waits for the counted vCPUs to leave cpu_exec(), stays 1 while the exclusive section runs, and is reset to 0 by end_exclusive()

usage of pthread cond

thread 1:
    pthread_mutex_lock(&mutex);
    while (!condition)
        pthread_cond_wait(&cond, &mutex);
    /* do something that requires holding the mutex and condition is true */
    pthread_mutex_unlock(&mutex);

thread 2:
    pthread_mutex_lock(&mutex);
    /* do something that might make condition true */
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mutex);
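
The same pattern as a self-contained, runnable program (illustrative only; file and symbol names are made up):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool condition = false;

static void *thread1(void *arg)            /* the waiter */
{
    pthread_mutex_lock(&mutex);
    while (!condition) {                   /* loop: wakeups can be spurious */
        pthread_cond_wait(&cond, &mutex);  /* atomically unlock, sleep, relock */
    }
    printf("condition is true\n");
    pthread_mutex_unlock(&mutex);
    return NULL;
}

static void *thread2(void *arg)            /* the signaller */
{
    pthread_mutex_lock(&mutex);
    condition = true;                      /* make the condition true under the lock */
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mutex);
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, thread1, NULL);
    pthread_create(&t2, NULL, thread2, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}

This mirrors exclusive_idle() above: qemu_cond_wait(&exclusive_resume, &qemu_cpu_list_lock) is the thread-1 side, and end_exclusive() setting pending_cpus to 0 before broadcasting is the thread-2 side.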

case study

the simplest one

not counted vCPU1

vCPU1 wants to enter exclusive context

when vCPU1 enters start_exclusive(), it executes exclusive_idle() to wait until vCPU0 has finished its exclusive work
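
Assumed interleaving for this case (a sketch pieced together from the code above, not an actual trace):

vCPU0                                   vCPU1
-----                                   -----
start_exclusive()
  pending_cpus = 1
  ... exclusive work ...
                                        start_exclusive()
                                          lock qemu_cpu_list_lock
                                          exclusive_idle()
                                            pending_cpus != 0
                                            -> qemu_cond_wait(&exclusive_resume, ...)
end_exclusive()
  pending_cpus = 0
  broadcast(&exclusive_resume)
                                          exclusive_idle() returns
                                          pending_cpus = 1   // vCPU1 now owns the exclusive section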