use a resource pool of preallocated register buffers on the Stack

Devyn Cairns 2024-06-12 03:40:15 -07:00
parent 352095a3b8
commit aa328d608e
5 changed files with 94 additions and 77 deletions
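
The idea, sketched below with simplified stand-in types (`BufPool` and `Register` are illustrative, not the types added in this commit): instead of allocating a fresh register buffer for every IR block evaluation, the `Stack` keeps a small pool of previously used buffers and hands them back out, so repeated evaluations reuse the same heap allocations. Only the acquire/use/release shape mirrors the diffs that follow.

// Minimal, self-contained sketch of the pooling pattern this commit introduces.
// `Register` and `BufPool` are illustrative stand-ins for `PipelineData` and the
// new `RegisterBufCache`.

#[derive(Clone, Default)]
enum Register {
    #[default]
    Empty,
    Value(i64),
}

#[derive(Default)]
struct BufPool {
    bufs: Vec<Vec<Register>>,
}

impl BufPool {
    /// Hand out a buffer of `size` empty registers, reusing a released allocation if one exists.
    fn acquire(&mut self, size: usize) -> Vec<Register> {
        let mut buf = self.bufs.pop().unwrap_or_default();
        debug_assert!(buf.is_empty());
        buf.resize(size, Register::Empty);
        buf
    }

    /// Take a buffer back; clear its contents but keep its capacity for the next caller.
    fn release(&mut self, mut buf: Vec<Register>) {
        buf.clear();
        self.bufs.push(buf);
    }
}

fn main() {
    let mut pool = BufPool::default();
    for _ in 0..3 {
        // Each simulated block evaluation borrows registers instead of allocating new ones.
        let mut regs = pool.acquire(16);
        regs[0] = Register::Value(42);
        pool.release(regs); // the allocation is retained for the next iteration
    }
}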

View File

@@ -1,4 +1,5 @@
 use nu_engine::{command_prelude::*, compile};
+use nu_protocol::engine::Closure;

 #[derive(Clone)]
 pub struct ViewIr;
@@ -10,33 +11,26 @@ impl Command for ViewIr {
     fn signature(&self) -> Signature {
         Signature::new(self.name()).required(
-            "block",
-            SyntaxShape::Block,
-            "the block to see compiled code for",
+            "closure",
+            SyntaxShape::Closure(None),
+            "the closure to see compiled code for",
         )
     }

     fn usage(&self) -> &str {
-        "View the compiled IR code for a block"
+        "View the compiled IR code for a block of code"
     }

     fn run(
         &self,
         engine_state: &EngineState,
-        _stack: &mut Stack,
+        stack: &mut Stack,
         call: &Call,
         _input: PipelineData,
     ) -> Result<PipelineData, ShellError> {
-        let expr = call
-            .positional_nth(0)
-            .ok_or_else(|| ShellError::AccessEmptyContent { span: call.head })?;
+        let closure: Closure = call.req(engine_state, stack, 0)?;

-        let block_id = expr.as_block().ok_or_else(|| ShellError::TypeMismatch {
-            err_message: "expected block".into(),
-            span: expr.span,
-        })?;
-        let block = engine_state.get_block(block_id);
+        let block = engine_state.get_block(closure.block_id);
         let ir_block = compile(&StateWorkingSet::new(engine_state), &block)?;
         let formatted = format!("{}", ir_block.display(engine_state));

View File

@@ -18,24 +18,20 @@ pub fn eval_ir_block<D: DebugContext>(
     let block_span = block.span;

-    // Allocate required space for registers. We prefer to allocate on the stack, but will
-    // allocate on the heap if it's over the compiled maximum size
-    //
-    // Keep in mind that there is some code generated for each variant; at least at the moment
-    // it doesn't seem like LLVM is able to optimize this away
-    //
-    // This is organized like a tree to try to make sure we do the fewest number of branches
-    let result = if ir_block.register_count <= 8 {
-        if ir_block.register_count <= 4 {
-            eval_ir_block_static::<D, 4>(engine_state, stack, &block_span, ir_block, input)
-        } else {
-            eval_ir_block_static::<D, 8>(engine_state, stack, &block_span, ir_block, input)
-        }
-    } else if ir_block.register_count <= 16 {
-        eval_ir_block_static::<D, 16>(engine_state, stack, &block_span, ir_block, input)
-    } else {
-        eval_ir_block_dynamic::<D>(engine_state, stack, &block_span, ir_block, input)
-    };
+    let mut registers = stack.register_buf_cache.acquire(ir_block.register_count);
+
+    let result = eval_ir_block_impl::<D>(
+        &mut EvalContext {
+            engine_state,
+            stack,
+            registers: &mut registers[..],
+        },
+        &block_span,
+        ir_block,
+        input,
+    );
+
+    stack.register_buf_cache.release(registers);

     D::leave_block(engine_state, block);
@@ -52,51 +48,6 @@ pub fn eval_ir_block<D: DebugContext>(
     }
 }

-/// Eval an IR block with stack-allocated registers, the size of which must be known statically.
-fn eval_ir_block_static<D: DebugContext, const N: usize>(
-    engine_state: &EngineState,
-    stack: &mut Stack,
-    block_span: &Option<Span>,
-    ir_block: &IrBlock,
-    input: PipelineData,
-) -> Result<PipelineData, ShellError> {
-    log::trace!(
-        "entering block with {} registers on stack ({} requested)",
-        N,
-        ir_block.register_count
-    );
-    const EMPTY: PipelineData = PipelineData::Empty;
-    let mut array = [EMPTY; N];
-    let mut ctx = EvalContext {
-        engine_state,
-        stack,
-        registers: &mut array[..],
-    };
-    eval_ir_block_impl::<D>(&mut ctx, block_span, ir_block, input)
-}
-
-/// Eval an IR block with heap-allocated registers.
-fn eval_ir_block_dynamic<D: DebugContext>(
-    engine_state: &EngineState,
-    stack: &mut Stack,
-    block_span: &Option<Span>,
-    ir_block: &IrBlock,
-    input: PipelineData,
-) -> Result<PipelineData, ShellError> {
-    log::trace!(
-        "entering block with {} registers on heap",
-        ir_block.register_count
-    );
-    let mut vec = Vec::with_capacity(ir_block.register_count);
-    vec.extend(std::iter::repeat_with(|| PipelineData::Empty).take(ir_block.register_count));
-    let mut ctx = EvalContext {
-        engine_state,
-        stack,
-        registers: &mut vec[..],
-    };
-    eval_ir_block_impl::<D>(&mut ctx, block_span, ir_block, input)
-}
-
 /// All of the pointers necessary for evaluation
 struct EvalContext<'a> {
     engine_state: &'a EngineState,
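
For context on the functions removed above: `eval_ir_block_static::<D, N>` was monomorphized once per register-count bucket (4, 8, 16), so each bucket produced its own copy of the entry path, and anything larger than 16 registers paid for a fresh heap allocation on every call. A rough, self-contained illustration of that shape follows; the names and bodies are simplified stand-ins, not the removed code.

// Each concrete N below becomes a separate monomorphized function in the binary.
fn eval_with_static_registers<const N: usize>(register_count: usize) -> usize {
    // A fixed-size array on the call stack; only `register_count` slots would be used.
    let registers = [0u64; N];
    registers.len().min(register_count)
}

fn dispatch(register_count: usize) -> usize {
    // The removed code arranged the checks as a small tree to keep the number of branches low.
    if register_count <= 8 {
        if register_count <= 4 {
            eval_with_static_registers::<4>(register_count)
        } else {
            eval_with_static_registers::<8>(register_count)
        }
    } else if register_count <= 16 {
        eval_with_static_registers::<16>(register_count)
    } else {
        // Fallback: heap-allocate exactly the requested number of registers, every time.
        vec![0u64; register_count].len()
    }
}

fn main() {
    assert_eq!(dispatch(6), 6);
    assert_eq!(dispatch(32), 32);
}

The replacement in the hunk above is a single non-generic call that borrows its registers from stack.register_buf_cache, so there is one code path regardless of size and the allocation is amortized across evaluations.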

View File

@@ -5,6 +5,7 @@ mod command;
 mod engine_state;
 mod overlay;
 mod pattern_match;
+mod register_buf_cache;
 mod stack;
 mod stack_out_dest;
 mod state_delta;
@@ -20,6 +21,7 @@ pub use command::*;
 pub use engine_state::*;
 pub use overlay::*;
 pub use pattern_match::*;
+pub use register_buf_cache::*;
 pub use stack::*;
 pub use stack_out_dest::*;
 pub use state_delta::*;

View File

@@ -0,0 +1,62 @@
+use std::fmt;
+
+use crate::PipelineData;
+
+/// Retains buffers for reuse in IR evaluation, avoiding heap allocation.
+///
+/// This is implemented in such a way that [`Clone`] is still possible, by making the fact that the
+/// buffers can't be preserved on clone completely transparent. The cached buffers are always empty.
+pub struct RegisterBufCache {
+    bufs: Vec<Vec<PipelineData>>,
+}
+
+// SAFETY: because `bufs` only ever contains empty `Vec`s, it doesn't actually contain any of the
+// data.
+unsafe impl Send for RegisterBufCache {}
+unsafe impl Sync for RegisterBufCache {}
+
+impl RegisterBufCache {
+    /// Create a new cache with no register buffers.
+    pub const fn new() -> Self {
+        RegisterBufCache { bufs: vec![] }
+    }
+
+    /// Acquire a new register buffer from the cache. The buffer will be extended to `size` with
+    /// [`Empty`](PipelineData::Empty) elements.
+    pub fn acquire(&mut self, size: usize) -> Vec<PipelineData> {
+        let mut buf = if let Some(buf) = self.bufs.pop() {
+            debug_assert!(buf.is_empty());
+            buf
+        } else {
+            Vec::new()
+        };
+        buf.reserve(size);
+        buf.extend(std::iter::repeat_with(|| PipelineData::Empty).take(size));
+        buf
+    }
+
+    /// Release a used register buffer to the cache. The buffer will be cleared.
+    pub fn release(&mut self, mut buf: Vec<PipelineData>) {
+        // SAFETY: this `clear` is necessary for the `unsafe impl`s to be safe
+        buf.clear();
+        self.bufs.push(buf);
+    }
+}
+
+impl fmt::Debug for RegisterBufCache {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let bufs = self.bufs.len();
+        let bytes: usize = self
+            .bufs
+            .iter()
+            .map(|b| b.capacity() * std::mem::size_of::<PipelineData>())
+            .sum();
+        write!(f, "RegisterBufCache({bufs} bufs, {bytes} bytes)")
+    }
+}
+
+impl Clone for RegisterBufCache {
+    fn clone(&self) -> Self {
+        RegisterBufCache::new()
+    }
+}
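
A hypothetical unit test (not part of this commit) for the invariants the new module documents: acquired buffers come back at the requested length filled with `Empty`, released buffers keep their capacity for reuse, and a cloned cache transparently starts out empty.

#[cfg(test)]
mod tests {
    use super::RegisterBufCache;
    use crate::PipelineData;

    #[test]
    fn acquire_release_round_trip() {
        let mut cache = RegisterBufCache::new();

        // Acquired buffers arrive at the requested length, filled with Empty.
        let buf = cache.acquire(16);
        assert_eq!(buf.len(), 16);
        assert!(buf.iter().all(|reg| matches!(reg, PipelineData::Empty)));

        // Releasing clears the buffer but keeps its allocation, so a later (smaller)
        // acquire can reuse the same capacity without touching the heap again.
        cache.release(buf);
        let buf = cache.acquire(8);
        assert!(buf.capacity() >= 16);

        // Cloning is transparent: the clone simply starts with no cached buffers.
        let cloned = cache.clone();
        assert_eq!(format!("{cloned:?}"), "RegisterBufCache(0 bufs, 0 bytes)");
    }
}

Because release always clears the buffer, the cached Vecs never hold live PipelineData, which is what the unsafe impl Send/Sync above relies on.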

View File

@@ -11,6 +11,8 @@ use std::{
     sync::Arc,
 };

+use super::RegisterBufCache;
+
 /// Environment variables per overlay
 pub type EnvVars = HashMap<String, HashMap<String, Value>>;
@@ -41,6 +43,8 @@ pub struct Stack {
     pub env_hidden: HashMap<String, HashSet<String>>,
     /// List of active overlays
     pub active_overlays: Vec<String>,
+    /// Cached register buffers for IR evaluation
+    pub register_buf_cache: RegisterBufCache,
     pub recursion_count: u64,
     pub parent_stack: Option<Arc<Stack>>,
     /// Variables that have been deleted (this is used to hide values from parent stack lookups)
@@ -68,6 +72,7 @@ impl Stack {
             env_vars: Vec::new(),
             env_hidden: HashMap::new(),
             active_overlays: vec![DEFAULT_OVERLAY_NAME.to_string()],
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: 0,
             parent_stack: None,
             parent_deletions: vec![],
@@ -85,6 +90,7 @@ impl Stack {
             env_vars: parent.env_vars.clone(),
             env_hidden: parent.env_hidden.clone(),
             active_overlays: parent.active_overlays.clone(),
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: parent.recursion_count,
             vars: vec![],
             parent_deletions: vec![],
@@ -254,6 +260,7 @@ impl Stack {
             env_vars,
             env_hidden: self.env_hidden.clone(),
             active_overlays: self.active_overlays.clone(),
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: self.recursion_count,
             parent_stack: None,
             parent_deletions: vec![],
@@ -284,6 +291,7 @@ impl Stack {
             env_vars,
             env_hidden: self.env_hidden.clone(),
             active_overlays: self.active_overlays.clone(),
+            register_buf_cache: RegisterBufCache::new(),
             recursion_count: self.recursion_count,
             parent_stack: None,
             parent_deletions: vec![],
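
A note on the Stack changes above: every constructor path initializes register_buf_cache with RegisterBufCache::new(), and the Clone impl in the new module likewise returns an empty cache, so child, captured, or cloned stacks never inherit another stack's buffers. The pool is purely a per-Stack performance cache, never shared state that has to be kept consistent across stacks.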