import type { ABI, ABIFunction, ABIEvent, StateMutability } from "./abi.js"; import type { OpCode } from "./opcodes.js"; import type { ProxyResolver } from "./proxies.js"; import { hexToBytes, bytesToHex } from "./utils.js"; import { opcodes, pushWidth, isPush, isLog, isHalt, isCompare } from "./opcodes.js"; import { slotResolvers, SequenceWalletProxyResolver, FixedProxyResolver } from "./proxies.js"; function valueToOffset(value: Uint8Array): number { // FIXME: Should be a cleaner way to do this... return parseInt(bytesToHex(value), 16); } // BytecodeIter takes EVM bytecode and handles iterating over it with correct // step widths, while tracking N buffer of previous offsets for indexed access. // This is useful for checking against sequences of variable width // instructions. export class BytecodeIter { bytecode: Uint8Array; nextStep: number; // Instruction count nextPos: number; // Byte-wise instruction position (takes variable width into account) // TODO: Could improve the buffer by making it sparse tracking of only // variable-width (PUSH) instruction indices, this would allow relatively // efficient seeking to arbitrary positions after a full iter. Then again, // roughly 1/4 of instructions are PUSH, so maybe it doesn't help enough? posBuffer: number[]; // Buffer of positions posBufferSize: number; constructor(bytecode: string, config?: { bufferSize?: number }) { this.nextStep = 0; this.nextPos = 0; if (config === undefined) config = {}; this.posBufferSize = Math.max(config.bufferSize || 1, 1); this.posBuffer = []; this.bytecode = hexToBytes(bytecode); } hasMore(): boolean { return (this.bytecode.length > this.nextPos); } next(): OpCode { if (this.bytecode.length <= this.nextPos) return opcodes.STOP; const instruction = this.bytecode[this.nextPos]; const width = pushWidth(instruction); // TODO: Optimization: Could use a circular buffer if (this.posBuffer.length >= this.posBufferSize) this.posBuffer.shift(); this.posBuffer.push(this.nextPos); this.nextStep += 1; this.nextPos += 1 + width; return instruction; } // step is the current instruction position that we've iterated over. If // iteration has not begun, then it's -1. step(): number { return this.nextStep - 1; } // pos is the byte offset of the current instruction we've iterated over. // If iteration has not begun then it's -1. pos(): number { if (this.posBuffer.length === 0) return -1; return this.posBuffer[this.posBuffer.length - 1]; } // asPos returns an absolute position for a given position that could be relative. // Returns -1 if out of bounds. asPos(posOrRelativeStep: number): number { let pos = posOrRelativeStep; if (pos < 0) { pos = this.posBuffer[this.posBuffer.length + pos]; if (pos === undefined) { return -1; } } return pos; } // at returns instruction at an absolute byte position or relative negative // buffered step offset. Buffered step offsets must be negative and start // at -1 (current step). at(posOrRelativeStep: number): OpCode { const pos = this.asPos(posOrRelativeStep); return this.bytecode[pos]; } // value of last next-returned OpCode (should be a PUSHN intruction) value(): Uint8Array { return this.valueAt(-1); } // valueAt returns the variable width value for PUSH-like instructions (or // empty value otherwise), at pos pos can be a relative negative count for // relative buffered offset. valueAt(posOrRelativeStep: number): Uint8Array { const pos = this.asPos(posOrRelativeStep); const instruction = this.bytecode[pos]; const width = pushWidth(instruction); return this.bytecode.slice(pos + 1, pos + 1 + width); } } // Opcodes that tell us something interesting about the function they're in const interestingOpCodes : Set = new Set([ opcodes.STOP, // No return value opcodes.RETURN, // Has return value? opcodes.CALLDATALOAD, // Has arguments opcodes.CALLDATASIZE, // FIXME: Is it superfluous to have these two? opcodes.CALLDATACOPY, opcodes.DELEGATECALL, // We use this to detect proxies opcodes.SLOAD, // Not pure opcodes.SSTORE, // Not view opcodes.REVERT, opcodes.CREATE, // Factory opcodes.CREATE2, // Factory // TODO: Add LOGs to track event emitters? ]); export class Function { byteOffset: number; // JUMPDEST byte offset opTags: Set; // Track whether function uses interesting opcodes start: number; // JUMPDEST instruction offset jumps: Array; // JUMPDEST instruction offsets this function can jump to end?: number; // Last instruction offset before the next JUMPDEST constructor(byteOffset: number = 0, start: number = 0) { this.byteOffset = byteOffset; this.start = start; this.opTags = new Set(); this.jumps = []; } } export class Program { dests: { [key: number]: Function }; // instruction offset -> Function selectors: { [key: string]: number }; // function hash -> instruction offset notPayable: { [key: number]: number }; // instruction offset -> bytes offset fallback?: number; // instruction offset for fallback function eventCandidates: Set; // PUSH32 found before a LOG instruction proxySlots: Array; // PUSH32 found that match known proxy slots proxies: Array; isFactory: boolean; // CREATE or CREATE2 detected sstoreCount: number; // Number of SSTORE instructions detected, used to determine if this may be a destination contract (vs a proxy) init?: Program; // Program embedded as init code constructor(init?: Program) { this.dests = {}; this.selectors = {}; this.notPayable = {}; this.eventCandidates = new Set(); this.proxySlots = []; this.proxies = []; this.isFactory = false; this.sstoreCount = 0; this.init = init; } } export function abiFromBytecode(bytecodeOrProgram: string|Program): ABI { const p = typeof bytecodeOrProgram === "string" ? disasm(bytecodeOrProgram) : bytecodeOrProgram; const abi: ABI = []; for (const [selector, offset] of Object.entries(p.selectors)) { // TODO: Optimization: If we only look at selectors in the jump table region, we shouldn't need to check JUMPDEST validity. if (!(offset in p.dests)) { // Selector does not point to a valid jumpdest. This should not happen. continue; } // Collapse tags for function call graph const fn = p.dests[offset]; const tags = subtreeTags(fn, p.dests); const funcABI = { type: "function", selector: selector, payable: !p.notPayable[offset], } as ABIFunction; // Note that these are not very reliable because our tag detection // fails to follow dynamic jumps. let mutability : StateMutability = "nonpayable"; if (funcABI.payable) { mutability = "payable"; } else if (!tags.has(opcodes.SSTORE)) { mutability = "view"; } // TODO: Can we make a claim about purity? Probably not reliably without handling dynamic jumps? // if (mutability === "view" && !tags.has(opcodes.SLOAD)) { // mutability = "pure"; // } funcABI.stateMutability = mutability; // Unfortunately we don't have better details about the type sizes, so we just return a dynamically-sized /shrug if (tags.has(opcodes.RETURN) || mutability === "view") { // FIXME: We assume outputs based on mutability, that's a hack. funcABI.outputs = [{type: "bytes", name: ""}]; } if (tags.has(opcodes.CALLDATALOAD) || tags.has(opcodes.CALLDATASIZE) || tags.has(opcodes.CALLDATACOPY)) { funcABI.inputs = [{type: "bytes", name: ""}]; } abi.push(funcABI); } for (const h of p.eventCandidates) { abi.push({ type: "event", hash: h, } as ABIEvent); } return abi; } const _EmptyArray = new Uint8Array(); export function disasm(bytecode: string, config?: {onlyJumpTable: boolean}): Program { const { onlyJumpTable } = config || {}; let p : Program = new Program(); const selectorDests = new Set(); let invertedSelector = ""; let lastPush32: Uint8Array = _EmptyArray; // Track last push32 to find log topics let checkJumpTable: boolean = true; let resumeJumpTable = new Set(); let maxJumpDest = 0; let runtimeOffset = 0; // Non-zero if init deploy code is included let boundaryPos = -1; let currentFunction: Function = new Function(); p.dests[0] = currentFunction; const code = new BytecodeIter(bytecode, { bufferSize: 5 }); while (code.hasMore()) { const inst = code.next(); const pos = code.pos(); const step = code.step(); // Track last PUSH32 to find LOG topics // This is probably not bullet proof but seems like a good starting point if (inst === opcodes.PUSH32) { const v = code.value(); const resolver = slotResolvers[bytesToHex(v)]; if (resolver !== undefined && !(p.proxies.length > 0 && p.proxies[0].name === resolver.name)) { // While we're looking at PUSH32, let's find proxy slots, but avoid continuous dupes based off of slots // (Dupes aren't a huge deal, just cleaning up a bit, can be more comprehensive in the future.) p.proxies.push(resolver); } else { lastPush32 = v; } continue } else if (isLog(inst) && lastPush32.length > 0) { p.eventCandidates.add(bytesToHex(lastPush32)); continue } // Possible minimal proxy pattern? EIP-1167 if (inst === opcodes.DELEGATECALL && code.at(-2) === opcodes.GAS) { if (isPush(code.at(-3))) { // Hardcoded delegate address // TODO: We can probably do more here to determine which kind? Do we care? const val = code.valueAt(-3); const addr = bytesToHex( val.slice(val.length-20), // Might be padded with zeros 20, ); // Check if proxy is already found. Rare to have more than 1, so we'll just O(N) here if (!p.proxies.find(p => p instanceof FixedProxyResolver && p.resolvedAddress === addr)) { p.proxies.push(new FixedProxyResolver("HardcodedDelegateProxy", addr)); } } else if ( code.at(-3) === opcodes.SLOAD && code.at(-4) === opcodes.ADDRESS ) { // SequenceWallet-style proxy (keyed on address) p.proxies.push(new SequenceWalletProxyResolver()); } } // Find JUMPDEST labels if (inst === opcodes.JUMPDEST) { // End of the function, or disjoint function? if (isHalt(code.at(-2)) || code.at(-2) === opcodes.JUMP) { if (currentFunction) currentFunction.end = pos - 1 - runtimeOffset; currentFunction = new Function(step, pos); // We don't stop looking for jump tables until we find at least one selector if (checkJumpTable && Object.keys(p.selectors).length > 0) { checkJumpTable = false; } if (resumeJumpTable.delete(pos)) { // Continuation of a previous jump table? // Selector branch trees start by pushing CALLDATALOAD or it was pushed before. checkJumpTable = code.at(pos + 1) === opcodes.DUP1 || code.at(pos + 1) === opcodes.CALLDATALOAD; } else if (!checkJumpTable && resumeJumpTable.size === 0 && onlyJumpTable) { // Exit early if we're only looking in the jump table break; } } // Otherwise it's just a simple branch, we continue // Index jump destinations so we can check against them later p.dests[pos - runtimeOffset] = currentFunction; // Check whether a JUMPDEST has non-payable guards // // We look for a sequence of instructions that look like: // JUMPDEST CALLVALUE DUP1 ISZERO // // We can do direct positive indexing because we know that there // are no variable-width instructions in our sequence. if ( code.at(pos + 1) === opcodes.CALLVALUE && code.at(pos + 2) === opcodes.DUP1 && code.at(pos + 3) === opcodes.ISZERO ) { p.notPayable[pos] = step; // TODO: Optimization: Could seek ahead 3 pos/count safely } // TODO: Check whether function has a simple return flow? // if (code.at(pos - 1) === opcodes.RETURN) { ... } continue; } else if ( (isHalt(code.at(-2)) || code.at(-2) === opcodes.JUMP) && runtimeOffset < pos && maxJumpDest < pos ) { // Did we just find the end of the program? boundaryPos = pos; break; } // Annotate current function if (currentFunction.opTags !== undefined) { // Detect simple JUMP/JUMPI helper subroutines if ((inst === opcodes.JUMP || inst === opcodes.JUMPI) && isPush(code.at(-2))) { const jumpOffset = valueToOffset(code.valueAt(-2)); currentFunction.jumps.push(jumpOffset - runtimeOffset); maxJumpDest = Math.max(maxJumpDest, jumpOffset); } // Tag current function with interesting opcodes (not including above) if (interestingOpCodes.has(inst)) { currentFunction.opTags.add(inst); // CREATE/CREATE2 are part of interestingOpCodes so we can leverage this short circuit if (inst === opcodes.CREATE || inst === opcodes.CREATE2) { p.isFactory = true; } else if (inst === opcodes.SSTORE) { p.sstoreCount += 1; } } } // Did we just copy code that might be the runtime code? // PUSH2 PUSH1 0x00 CODECOPY if (code.at(-1) === opcodes.CODECOPY && ( // Add 0x00 to stack code.at(-2) === opcodes.PUSH1 || code.at(-2) === opcodes.PUSH0 || code.at(-2) === opcodes.RETURNDATASIZE ) && isPush(code.at(-3)) ) { const offsetDest: number = valueToOffset(code.valueAt(-3)); resumeJumpTable.add(offsetDest); runtimeOffset = offsetDest; continue; } if (pos === runtimeOffset && currentFunction.opTags.has(opcodes.RETURN) && !currentFunction.opTags.has(opcodes.CALLDATALOAD) ) { // Reset state, embed program as init p = new Program(p); currentFunction = new Function(); p.dests[0] = currentFunction; checkJumpTable = true; } if (!checkJumpTable) continue; // Skip searching for function selectors at this point // We're in a jump table section now. Let's find some selectors. if (inst === opcodes.JUMP && isPush(code.at(-2))) { const offsetDest: number = valueToOffset(code.valueAt(-2)); if (invertedSelector !== "") { p.selectors[invertedSelector] = offsetDest; selectorDests.add(offsetDest); invertedSelector = ""; } else { // The table is continued elsewhere? Or could be a default target resumeJumpTable.add(offsetDest); } } // Beyond this, we're only looking with instruction sequences that end with // ... PUSHN JUMPI if (!( code.at(-1) === opcodes.JUMPI && isPush(code.at(-2)) )) continue; const offsetDest: number = valueToOffset(code.valueAt(-2)); currentFunction.jumps.push(offsetDest); // Find callable function selectors: // // https://github.com/ethereum/solidity/blob/242096695fd3e08cc3ca3f0a7d2e06d09b5277bf/libsolidity/codegen/ContractCompiler.cpp#L333 // // We're looking for a sequence of opcodes that looks like: // // DUP1 PUSH4 0x2E64CEC1 EQ PUSH1 0x37 JUMPI // DUP1 PUSH4 EQ PUSHN JUMPI // 80 63 ^ 14 60-7f ^ 57 // Selector Dest // // We can reliably skip checking for DUP1 if we're only searching // within `inJumpTable` range. // // Note that sizes of selectors and destinations can vary. Selector // PUSH can get optimized with zero-prefixes, all the way down to an // ISZERO routine (see next condition block). if ( code.at(-3) === opcodes.EQ && isPush(code.at(-4)) ) { // Found a function selector sequence, save it to check against JUMPDEST table later let value = code.valueAt(-4); // 0-prefixed comparisons get optimized to a smaller width than PUSH4 const selector: string = bytesToHex(value, 4); p.selectors[selector] = offsetDest; selectorDests.add(offsetDest); continue; } // Sometimes the positions get swapped with DUP2: // PUSHN DUP2 EQ PUSHN JUMPI if ( code.at(-3) === opcodes.EQ && code.at(-4) === opcodes.DUP2 && isPush(code.at(-5)) ) { // Found a function selector sequence, save it to check against JUMPDEST table later let value = code.valueAt(-5) // 0-prefixed comparisons get optimized to a smaller width than PUSH4 const selector: string = bytesToHex(value, 4); p.selectors[selector] = offsetDest; selectorDests.add(offsetDest); continue; } // Sometimes the final selector is negated using SUB // PUSHN SUB PUSHN JUMPI ... // ^ We are here (at -1) if ( code.at(-3) === opcodes.SUB && isPush(code.at(-4)) ) { // Found an inverted JUMPI, probably a final "else" condition invertedSelector = bytesToHex(code.valueAt(-4), 4); p.fallback = offsetDest; // Fallback } // In some cases, the sequence can get optimized such as for 0x00000000: // DUP1 ISZERO PUSHN JUMPI // FIXME: Need a better heuristic to descriminate the preceding value. This is hacky. :( if ( code.at(-3) === opcodes.ISZERO && code.at(-4) === opcodes.DUP1 && ( code.at(-5) === opcodes.CALLDATASIZE || code.at(-5) === opcodes.CALLDATALOAD || code.at(-5) === opcodes.JUMPDEST || code.at(-5) === opcodes.SHR ) ) { const selector = "0x00000000"; p.selectors[selector] = offsetDest; selectorDests.add(offsetDest); continue; } // Jumptable trees use GT/LT comparisons to branch jumps. // DUP1 PUSHN GT/LT PUSHN JUMPI if ( code.at(-3) !== opcodes.EQ && isCompare(code.at(-3)) && code.at(-5) === opcodes.DUP1 ) { resumeJumpTable.add(offsetDest); continue; } } if ((boundaryPos > 0 && p.proxies.length === 0)) { // Slots could be stored outside of the program boundary and copied in // and we haven't found any proxy slots yet so let's check just in case... // This is unstructured data, so it could be anything. We can't parse it reliably. // We can skip the CBOR encoding based on length defined in the final byte // https://playground.sourcify.dev/ // TODO: Pull CBOR out and add to result let endBoundary : number|undefined = undefined; const finalByte = code.bytecode.slice(-1)[0]; if (!isHalt(finalByte)) { const cborLength = valueToOffset(code.bytecode.slice(-2)); endBoundary = -(cborLength + 4); // +4 for the length bytes } const auxData = bytesToHex(code.bytecode.slice(boundaryPos, endBoundary)); if (auxData.length >= 2 + 64) { // 0x is empty, plus at least enough for a slot // Look for known slots in extra data segment that could be CODECOPY'd for (const [slot, resolver] of Object.entries(slotResolvers)) { if (auxData.lastIndexOf(slot.slice(2)) === -1) continue; p.proxies.push(resolver); } } } if (p.init && p.proxies.length === 0 && p.init.proxies.length > 0) { // If we detected a proxy in init but not in the deploy code, we're going to inherit it for now. // There is a scenario where we have trouble detecting a flattened immutable EIP1967 otherwise. // Hopefully we can improve this later. // See: https://github.com/shazow/whatsabi/issues/171 p.proxies = p.init.proxies; } return p; } function subtreeTags(entryFunc: Function, dests: { [key: number]: Function }): Set { let tags = new Set([]); const stack = new Array(entryFunc); const seen = new Set(); while (stack.length > 0) { const fn = stack.pop(); if (!fn) continue; if (seen.has(fn.start)) continue; seen.add(fn.start); tags = new Set([...tags, ...fn.opTags]); stack.push(...fn.jumps.map(offset => dests[offset])) } return tags; }