use a resource pool of preallocated register buffers on the Stack

Devyn Cairns 2024-06-12 03:40:15 -07:00
parent 352095a3b8
commit aa328d608e
5 changed files with 94 additions and 77 deletions
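
The idea, sketched below with simplified stand-in types (`BufPool` and `Register` are illustrative, not the types added in this commit): instead of allocating a fresh register buffer for every IR block evaluation, the `Stack` keeps a small pool of previously used buffers and hands them back out, so repeated evaluations reuse the same heap allocations. Only the acquire/use/release shape mirrors the diffs that follow.

// Minimal, self-contained sketch of the pooling pattern this commit introduces.
// `Register` and `BufPool` are illustrative stand-ins for `PipelineData` and the
// new `RegisterBufCache`.

#[derive(Clone, Default)]
enum Register {
    #[default]
    Empty,
    Value(i64),
}

#[derive(Default)]
struct BufPool {
    bufs: Vec<Vec<Register>>,
}

impl BufPool {
    /// Hand out a buffer of `size` empty registers, reusing a released allocation if one exists.
    fn acquire(&mut self, size: usize) -> Vec<Register> {
        let mut buf = self.bufs.pop().unwrap_or_default();
        debug_assert!(buf.is_empty());
        buf.resize(size, Register::Empty);
        buf
    }

    /// Take a buffer back; clear its contents but keep its capacity for the next caller.
    fn release(&mut self, mut buf: Vec<Register>) {
        buf.clear();
        self.bufs.push(buf);
    }
}

fn main() {
    let mut pool = BufPool::default();
    for _ in 0..3 {
        // Each simulated block evaluation borrows registers instead of allocating new ones.
        let mut regs = pool.acquire(16);
        regs[0] = Register::Value(42);
        pool.release(regs); // the allocation is retained for the next iteration
    }
}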

View File

@@ -1,4 +1,5 @@
 use nu_engine::{command_prelude::*, compile};
+use nu_protocol::engine::Closure;

 #[derive(Clone)]
 pub struct ViewIr;
@@ -10,33 +11,26 @@ impl Command for ViewIr {
     fn signature(&self) -> Signature {
         Signature::new(self.name()).required(
-            "block",
-            SyntaxShape::Block,
-            "the block to see compiled code for",
+            "closure",
+            SyntaxShape::Closure(None),
+            "the closure to see compiled code for",
         )
     }

     fn usage(&self) -> &str {
-        "View the compiled IR code for a block"
+        "View the compiled IR code for a block of code"
     }

     fn run(
         &self,
         engine_state: &EngineState,
-        _stack: &mut Stack,
+        stack: &mut Stack,
         call: &Call,
         _input: PipelineData,
     ) -> Result<PipelineData, ShellError> {
-        let expr = call
-            .positional_nth(0)
-            .ok_or_else(|| ShellError::AccessEmptyContent { span: call.head })?;
+        let closure: Closure = call.req(engine_state, stack, 0)?;

-        let block_id = expr.as_block().ok_or_else(|| ShellError::TypeMismatch {
-            err_message: "expected block".into(),
-            span: expr.span,
-        })?;
-        let block = engine_state.get_block(block_id);
+        let block = engine_state.get_block(closure.block_id);
         let ir_block = compile(&StateWorkingSet::new(engine_state), &block)?;
         let formatted = format!("{}", ir_block.display(engine_state));

View File

@@ -18,24 +18,20 @@ pub fn eval_ir_block<D: DebugContext>(
     let block_span = block.span;

-    // Allocate required space for registers. We prefer to allocate on the stack, but will
-    // allocate on the heap if it's over the compiled maximum size
-    //
-    // Keep in mind that there is some code generated for each variant; at least at the moment
-    // it doesn't seem like LLVM is able to optimize this away
-    //
-    // This is organized like a tree to try to make sure we do the fewest number of branches
-    let result = if ir_block.register_count <= 8 {
-        if ir_block.register_count <= 4 {
-            eval_ir_block_static::<D, 4>(engine_state, stack, &block_span, ir_block, input)
-        } else {
-            eval_ir_block_static::<D, 8>(engine_state, stack, &block_span, ir_block, input)
-        }
-    } else if ir_block.register_count <= 16 {
-        eval_ir_block_static::<D, 16>(engine_state, stack, &block_span, ir_block, input)
-    } else {
-        eval_ir_block_dynamic::<D>(engine_state, stack, &block_span, ir_block, input)
-    };
+    let mut registers = stack.register_buf_cache.acquire(ir_block.register_count);
+
+    let result = eval_ir_block_impl::<D>(
+        &mut EvalContext {
+            engine_state,
+            stack,
+            registers: &mut registers[..],
+        },
+        &block_span,
+        ir_block,
+        input,
+    );
+
+    stack.register_buf_cache.release(registers);

     D::leave_block(engine_state, block);
@@ -52,51 +48,6 @@ pub fn eval_ir_block<D: DebugContext>(
     }
 }

-/// Eval an IR block with stack-allocated registers, the size of which must be known statically.
-fn eval_ir_block_static<D: DebugContext, const N: usize>(
-    engine_state: &EngineState,
-    stack: &mut Stack,
-    block_span: &Option<Span>,
-    ir_block: &IrBlock,
-    input: PipelineData,
-) -> Result<PipelineData, ShellError> {
-    log::trace!(
-        "entering block with {} registers on stack ({} requested)",
-        N,
-        ir_block.register_count
-    );
-    const EMPTY: PipelineData = PipelineData::Empty;
-    let mut array = [EMPTY; N];
-    let mut ctx = EvalContext {
-        engine_state,
-        stack,
-        registers: &mut array[..],
-    };
-    eval_ir_block_impl::<D>(&mut ctx, block_span, ir_block, input)
-}
-
-/// Eval an IR block with heap-allocated registers.
-fn eval_ir_block_dynamic<D: DebugContext>(
-    engine_state: &EngineState,
-    stack: &mut Stack,
-    block_span: &Option<Span>,
-    ir_block: &IrBlock,
-    input: PipelineData,
-) -> Result<PipelineData, ShellError> {
-    log::trace!(
-        "entering block with {} registers on heap",
-        ir_block.register_count
-    );
-    let mut vec = Vec::with_capacity(ir_block.register_count);
-    vec.extend(std::iter::repeat_with(|| PipelineData::Empty).take(ir_block.register_count));
-    let mut ctx = EvalContext {
-        engine_state,
-        stack,
-        registers: &mut vec[..],
-    };
-    eval_ir_block_impl::<D>(&mut ctx, block_span, ir_block, input)
-}
-
 /// All of the pointers necessary for evaluation
 struct EvalContext<'a> {
     engine_state: &'a EngineState,
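
For context on the functions removed above: `eval_ir_block_static::<D, N>` was monomorphized once per register-count bucket (4, 8, 16), so each bucket produced its own copy of the entry path, and anything larger than 16 registers paid for a fresh heap allocation on every call. A rough, self-contained illustration of that shape follows; the names and bodies are simplified stand-ins, not the removed code.

// Each concrete N below becomes a separate monomorphized function in the binary.
fn eval_with_static_registers<const N: usize>(register_count: usize) -> usize {
    // A fixed-size array on the call stack; only `register_count` slots would be used.
    let registers = [0u64; N];
    registers.len().min(register_count)
}

fn dispatch(register_count: usize) -> usize {
    // The removed code arranged the checks as a small tree to keep the number of branches low.
    if register_count <= 8 {
        if register_count <= 4 {
            eval_with_static_registers::<4>(register_count)
        } else {
            eval_with_static_registers::<8>(register_count)
        }
    } else if register_count <= 16 {
        eval_with_static_registers::<16>(register_count)
    } else {
        // Fallback: heap-allocate exactly the requested number of registers, every time.
        vec![0u64; register_count].len()
    }
}

fn main() {
    assert_eq!(dispatch(6), 6);
    assert_eq!(dispatch(32), 32);
}

The replacement in the hunk above is a single non-generic call that borrows its registers from stack.register_buf_cache, so there is one code path regardless of size and the allocation is amortized across evaluations.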

View File

@@ -5,6 +5,7 @@ mod command;
 mod engine_state;
 mod overlay;
 mod pattern_match;
+mod register_buf_cache;
 mod stack;
 mod stack_out_dest;
 mod state_delta;
@@ -20,6 +21,7 @@ pub use command::*;
 pub use engine_state::*;
 pub use overlay::*;
 pub use pattern_match::*;
+pub use register_buf_cache::*;
 pub use stack::*;
 pub use stack_out_dest::*;
 pub use state_delta::*;

View File

@@ -0,0 +1,62 @@
+use std::fmt;
+
+use crate::PipelineData;
+
+/// Retains buffers for reuse in IR evaluation, avoiding heap allocation.
+///
+/// This is implemented in such a way that [`Clone`] is still possible, by making the fact that the
+/// buffers can't be preserved on clone completely transparent. The cached buffers are always empty.
+pub struct RegisterBufCache {
+    bufs: Vec<Vec<PipelineData>>,
+}
+
+// SAFETY: because `bufs` only ever contains empty `Vec`s, it doesn't actually contain any of the
+// data.
+unsafe impl Send for RegisterBufCache {}
+unsafe impl Sync for RegisterBufCache {}
+
+impl RegisterBufCache {
+    /// Create a new cache with no register buffers.
+    pub const fn new() -> Self {
+        RegisterBufCache { bufs: vec![] }
+    }
+
+    /// Acquire a new register buffer from the cache. The buffer will be extended to `size` with
+    /// [`Empty`](PipelineData::Empty) elements.
+    pub fn acquire(&mut self, size: usize) -> Vec<PipelineData> {
+        let mut buf = if let Some(buf) = self.bufs.pop() {
+            debug_assert!(buf.is_empty());
+            buf
+        } else {
+            Vec::new()
+        };
+        buf.reserve(size);
+        buf.extend(std::iter::repeat_with(|| PipelineData::Empty).take(size));
+        buf
+    }
+
+    /// Release a used register buffer to the cache. The buffer will be cleared.
+    pub fn release(&mut self, mut buf: Vec<PipelineData>) {
+        // SAFETY: this `clear` is necessary for the `unsafe impl`s to be safe
+        buf.clear();
+        self.bufs.push(buf);
+    }
+}
+
+impl fmt::Debug for RegisterBufCache {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let bufs = self.bufs.len();
+        let bytes: usize = self
+            .bufs
+            .iter()
+            .map(|b| b.capacity() * std::mem::size_of::<PipelineData>())
+            .sum();
+        write!(f, "RegisterBufCache({bufs} bufs, {bytes} bytes)")
+    }
+}
+
+impl Clone for RegisterBufCache {
+    fn clone(&self) -> Self {
+        RegisterBufCache::new()
+    }
+}
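
A hypothetical unit test (not part of this commit) for the invariants the new module documents: acquired buffers come back at the requested length filled with `Empty`, released buffers keep their capacity for reuse, and a cloned cache transparently starts out empty.

#[cfg(test)]
mod tests {
    use super::RegisterBufCache;
    use crate::PipelineData;

    #[test]
    fn acquire_release_round_trip() {
        let mut cache = RegisterBufCache::new();

        // Acquired buffers arrive at the requested length, filled with Empty.
        let buf = cache.acquire(16);
        assert_eq!(buf.len(), 16);
        assert!(buf.iter().all(|reg| matches!(reg, PipelineData::Empty)));

        // Releasing clears the buffer but keeps its allocation, so a later (smaller)
        // acquire can reuse the same capacity without touching the heap again.
        cache.release(buf);
        let buf = cache.acquire(8);
        assert!(buf.capacity() >= 16);

        // Cloning is transparent: the clone simply starts with no cached buffers.
        let cloned = cache.clone();
        assert_eq!(format!("{cloned:?}"), "RegisterBufCache(0 bufs, 0 bytes)");
    }
}

Because release always clears the buffer, the cached Vecs never hold live PipelineData, which is what the unsafe impl Send/Sync above relies on.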

View File

@@ -11,6 +11,8 @@ use std::{
     sync::Arc,
 };

+use super::RegisterBufCache;
+
 /// Environment variables per overlay
 pub type EnvVars = HashMap<String, HashMap<String, Value>>;
@@ -41,6 +43,8 @@ pub struct Stack {
     pub env_hidden: HashMap<String, HashSet<String>>,
     /// List of active overlays
     pub active_overlays: Vec<String>,
+    /// Cached register buffers for IR evaluation
+    pub register_buf_cache: RegisterBufCache,
     pub recursion_count: u64,
     pub parent_stack: Option<Arc<Stack>>,
     /// Variables that have been deleted (this is used to hide values from parent stack lookups)
@@ -68,6 +72,7 @@ impl Stack {
             env_vars: Vec::new(),
             env_hidden: HashMap::new(),
             active_overlays: vec![DEFAULT_OVERLAY_NAME.to_string()],
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: 0,
             parent_stack: None,
             parent_deletions: vec![],
@@ -85,6 +90,7 @@ impl Stack {
             env_vars: parent.env_vars.clone(),
             env_hidden: parent.env_hidden.clone(),
             active_overlays: parent.active_overlays.clone(),
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: parent.recursion_count,
             vars: vec![],
             parent_deletions: vec![],
@@ -254,6 +260,7 @@ impl Stack {
             env_vars,
             env_hidden: self.env_hidden.clone(),
             active_overlays: self.active_overlays.clone(),
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: self.recursion_count,
             parent_stack: None,
             parent_deletions: vec![],
@@ -284,6 +291,7 @@ impl Stack {
             env_vars,
             env_hidden: self.env_hidden.clone(),
             active_overlays: self.active_overlays.clone(),
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: self.recursion_count,
             parent_stack: None,
             parent_deletions: vec![],
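
A note on the Stack changes above: every constructor path initializes register_buf_cache with RegisterBufCache::new(), and the Clone impl in the new module likewise returns an empty cache, so child, captured, or cloned stacks never inherit another stack's buffers. The pool is purely a per-Stack performance cache, never shared state that has to be kept consistent across stacks.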