C# のコードに x86/x86-64 命令を直接組み込む

C# で書かれた将棋の思考ルーチンの高速化のため,(Visual C++ 用の) 組み込み関数 _mm_prefetch 的なものを使うべく,ネイティブコードで書かれた DLL と C# で書かれたメインの思考ルーチンを組み合わせてみた,というお話.ふむふむ.

ざっと眺めて C# のみで書けそうだったので,気分転換も兼ねて書いてみました.個人的には単一の(メタ)言語で完結するプロジェクトが好きです.配布するファイルの数が減るのはインストール・アンインストール作業やバージョン管理が楽になります.Visual Studio で複数言語を混在させると,Express Edition の人にビルドしてもらうとき困ったりするというのもがあります.ビルドシステムは単純な方がいいですよ.ほんと.とまあこの辺りが書いてみようと思った主な理由でしょうか.
さて,以下コードが整理されていないので読みにくいですが,基本的には,

  1. VirtualAlloc で領域を確保し,そこに使いたい関数を書き込む
  2. VirtualProtect で保護属性を PAGE_EXECUTE に変更する *1
  3. Marshal.GetDelegateForFunctionPointer に関数の先頭アドレスを渡して .NET デリゲートに変換する

という流れです.
以下のコードは,ak11 さんの記事 と同じく,Prefetch128, Prefetch256, cpuid の 3 つを作成し,C# コードから呼び出しています.呼び出される関数の内容は事前に作ったものですが,実行環境によって x86 用と x86-64 用の関数を使い分けています.なお,Itanium 等その他の CPU には対応しておりません.
余談ですが,原理上は生成される関数自体をプログラムで制御してしまうことも可能です.今回はそこまでやっていませんが,もしその手の動的コード生成の世界に挑戦するのであれば,Xbyak が参考になるかと思います.

// Windows applications may or may not be originally written in Objective-C,
// C, C++, or JavaScript as executed by the JScript engine, and not only code
// written in C, C++, and Objective-C but also code written in other languages
// can compile and directly link against the Documented APIs.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Security;
using System.Security.Permissions;

namespace Win32
{
  internal enum ProcessorArchitecture : ushort
  {
    PROCESSOR_ARCHITECTURE_AMD64 = 9,
    PROCESSOR_ARCHITECTURE_IA64 = 6,
    PROCESSOR_ARCHITECTURE_INTEL = 0,
    PROCESSOR_ARCHITECTURE_UNKNOWN = 0xffff,
  }
  internal enum ProcessorType : uint
  {
    PROCESSOR_INTEL_386 = 386,
    PROCESSOR_INTEL_486 = 486,
    PROCESSOR_INTEL_PENTIUM = 586,
    PROCESSOR_INTEL_IA64 = 2200,
    PROCESSOR_AMD_X8664 = 8664,
  }
  [Flags]
  internal enum VirtualAllocType : uint
  {
    MEM_COMMIT = 0x1000,
    MEM_RESERVE = 0x2000,
    MEM_RESET = 0x80000,
    MEM_LARGE_PAGES = 0x20000000,
    MEM_PHYSICAL = 0x400000,
    MEM_TOP_DOWN = 0x100000,
    MEM_WRITE_WATCH = 0x200000,
  }
  [Flags]
  internal enum VirtualFreeType : uint
  {
    MEM_DECOMMIT = 0x4000,
    MEM_RELEASE = 0x8000,
  }
  [Flags]
  internal enum MemoryProtectionType : uint
  {
    PAGE_NOACCESS = 0x01,
    PAGE_READONLY = 0x02,
    PAGE_READWRITE = 0x04,
    PAGE_WRITECOPY = 0x08,
    PAGE_EXECUTE = 0x10,
    PAGE_EXECUTE_READ = 0x20,
    PAGE_EXECUTE_READWRITE = 0x40,
    PAGE_EXECUTE_WRITECOPY = 0x80,
    PAGE_GUARD = 0x100,
    PAGE_NOCACHE = 0x200,
    PAGE_WRITECOMBINE = 0x400
  }
  [StructLayout(LayoutKind.Sequential, Pack = 2)]
  internal struct SystemInfo
  {
    public ProcessorArchitecture ProcessorArchitecture;
    ushort Reserved;
    public uint PageSize;
    public IntPtr MinimumApplicationAddress;
    public IntPtr MaximumApplicationAddress;
    public IntPtr ActiveProcessorMask;
    public uint NumberOfProcessors;
    public ProcessorType ProcessorType;
    public uint AllocationGranularity;
    public ushort ProcessorLevel;
    public ushort ProcessorRevision;
  }
  [SecurityPermission(SecurityAction.LinkDemand, UnmanagedCode = true)]
  internal class VirtualAllocRegion : SafeHandle
  {
    private VirtualAllocRegion()
      : base(IntPtr.Zero, true)
    {
    }
    public override bool IsInvalid
    {
      get { return handle == IntPtr.Zero; }
    }
    protected override bool ReleaseHandle()
    {
      return NativeMethods.VirtualFree(handle, (UIntPtr)0, VirtualFreeType.MEM_RELEASE);
    }
  }
  internal static class NativeMethods
  {
    [SuppressUnmanagedCodeSecurityAttribute]
    [DllImport("Kernel32.dll", CharSet = CharSet.Unicode, ExactSpelling = true)]
    extern static public IntPtr GetCurrentProcess();
    [SuppressUnmanagedCodeSecurityAttribute]
    [DllImport("Kernel32.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    extern static public bool FlushInstructionCache(IntPtr processHandle, IntPtr address, uint regionSize);
    [SuppressUnmanagedCodeSecurityAttribute]
    [DllImport("Kernel32.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = true)]
    extern static public void GetSystemInfo(out SystemInfo info);
    [SuppressUnmanagedCodeSecurityAttribute]
    [DllImport("Kernel32.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = true)]
    extern static public VirtualAllocRegion VirtualAlloc(
        IntPtr address, UIntPtr size, VirtualAllocType allocType, MemoryProtectionType protectionType);
    [DllImport("Kernel32.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    extern static public bool VirtualFree(IntPtr address, UIntPtr size, VirtualFreeType allocType);
    [DllImport("Kernel32.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    extern static public bool VirtualProtect(
        IntPtr address, UIntPtr size, MemoryProtectionType protectionType,
        out MemoryProtectionType oldProtectionType);
  }

  [StructLayout(LayoutKind.Sequential, Pack = 4)]
  public struct CPUInfo
  {
    public uint eax;
    public uint ebx;
    public uint ecx;
    public uint edx;
  }

  public static unsafe class Prefetcher
  {
    [UnmanagedFunctionPointer(CallingConvention.StdCall)]
    [SuppressUnmanagedCodeSecurityAttribute]
    private delegate void PrefetcherDelegate(void* address);
    [UnmanagedFunctionPointer(CallingConvention.StdCall)]
    [SuppressUnmanagedCodeSecurityAttribute]
    private delegate void CPUIDDelegate([In, Out] ref CPUInfo info);

    private static void DummyPrefetcher(void* address) { }
    private static void DummyCPUID(ref CPUInfo info) { }
    private static PrefetcherDelegate prefetch128_ = DummyPrefetcher;
    private static PrefetcherDelegate prefetch256_ = DummyPrefetcher;
    private static CPUIDDelegate cpuid_ = DummyCPUID;

    private readonly static VirtualAllocRegion region_ = null;

    private static TDelegate CreateDelegate<TDelegate>(IntPtr base_address, int offset)
      where TDelegate : class
    {
      var address = IntPtr.Add(base_address, offset);
      return Marshal.GetDelegateForFunctionPointer(address, typeof(TDelegate)) as TDelegate;
    }

    static Prefetcher()
    {
      // Use GetSystemInfo API to determine the processor architecture.
      var system_info = default(SystemInfo);
      NativeMethods.GetSystemInfo(out system_info);
      var supported_architectures = new []
      {
        ProcessorArchitecture.PROCESSOR_ARCHITECTURE_INTEL,
        ProcessorArchitecture.PROCESSOR_ARCHITECTURE_AMD64,
      };
      if (!supported_architectures.Contains(system_info.ProcessorArchitecture))
      {
        // Unsupported architecture.
        return;
      }

      var x86 = new
      {
        CPUID = new byte[]
        {
          // void __declspec(noinline) __stdcall CPUID(CPUInfo* info);
          0x53,                    //  push        ebx
          0x57,                    //  push        edi
          0x8B, 0x7C, 0x24, 0x0C,  //  mov         edi,dword ptr [esp+0Ch]
          0x8B, 0x07,              //  mov         eax,dword ptr [edi]
          0x8B, 0x4F, 0x08,        //  mov         ecx,dword ptr [edi+8]
          0x0F, 0xA2,              //  cpuid
          0x89, 0x07,              //  mov         dword ptr [edi],eax
          0x89, 0x5F, 0x04,        //  mov         dword ptr [edi+4],ebx
          0x89, 0x4F, 0x08,        //  mov         dword ptr [edi+8],ecx
          0x89, 0x57, 0x0C,        //  mov         dword ptr [edi+0Ch],edx
          0x5F,                    //  pop         edi
          0x5B,                    //  pop         ebx
          0xC2, 0x04, 0x00,        //  ret         4
        },
        Prefetch128 = new byte[]
        {
          // void __declspec(noinline) __stdcall Prefetch128(void* ptr);
          0x8B, 0x4C, 0x24, 0x04,  // mov ecx, dword ptr [esp+4]
          0x0F, 0x18, 0x19,        // prefetcht2  [ecx]
          0x0F, 0x18, 0x59, 0x40,  // prefetcht2  [ecx+40h]
          0xC2, 0x04, 0x00,        // ret 4
        },
        Prefetch256 = new byte[]
        {
          // void __declspec(noinline) __stdcall Prefetch256(void* ptr);
          0x8B, 0x4C, 0x24, 0x04,                    // mov ecx, dword ptr [esp+4]
          0x0F, 0x18, 0x19,                          // prefetcht2  [ecx]
          0x0F, 0x18, 0x59, 0x40,                    // prefetcht2  [ecx+40h]
          0x0F, 0x18, 0x99, 0x80, 0x00, 0x00, 0x00,  // prefetcht2  [ecx+80h]
          0x0F, 0x18, 0x99, 0xC0, 0x00, 0x00, 0x00,  // prefetcht2  [ecx+0C0h]
          0xC2, 0x04, 0x00,                          // ret 4
        },
      };

      var x64 = new
      {
        CPUID = new byte[]
        {
          // void __declspec(noinline) CPUID(CPUInfo* info);
          0x4C, 0x8B, 0xCB,        // mov         r9,rbx
          0x4C, 0x8B, 0xC1,        // mov         r8,rcx
          0x41, 0x8B, 0x00,        // mov         eax,dword ptr [r8]
          0x41, 0x8B, 0x48, 0x08,  // mov         ecx,dword ptr [r8+8]
          0x0F, 0xA2,              // cpuid
          0x41, 0x89, 0x00,        // mov         dword ptr [r8],eax
          0x41, 0x89, 0x58, 0x04,  // mov         dword ptr [r8+4],ebx
          0x41, 0x89, 0x48, 0x08,  // mov         dword ptr [r8+8],ecx
          0x41, 0x89, 0x50, 0x0C,  // mov         dword ptr [r8+0Ch],edx
          0x4C, 0x89, 0xCB,        // mov         rbx,r9
          0xC3,                    // ret
        },
        Prefetch128 = new byte[]
        {
          // void __declspec(noinline) Prefetch128(void* ptr);
          0x0F, 0x18, 0x19,        // prefetcht2  [rcx]
          0x0F, 0x18, 0x59, 0x40,  // prefetcht2  [rcx+40h]
          0xC3,                    // ret
        },
        Prefetch256 = new byte[]
        {
          // void __declspec(noinline) Prefetch256(void* ptr);
          0x0F, 0x18, 0x19,                          // prefetcht2  [rcx]
          0x0F, 0x18, 0x59, 0x40,                    // prefetcht2  [rcx+40h]
          0x0F, 0x18, 0x99, 0x80, 0x00, 0x00, 0x00,  // prefetcht2  [rcx+80h]
          0x0F, 0x18, 0x99, 0xC0, 0x00, 0x00, 0x00,  // prefetcht2  [rcx+0C0h]
          0xC3,                                      // ret
        },
      };

      var target = Environment.Is64BitProcess ? x64 : x86;

      // Align 8-byte boundary with the specified padding data.
      var align8 = (Func<byte[], byte, byte[]>)(
        (array, paddingData) => array.Concat(Enumerable.Repeat(paddingData, int.MaxValue))
                                     .Take((array.Length + 7) & ~7).ToArray());

      const byte int3 = 0xcc;
      var cpuid = align8(target.CPUID, int3);
      var prefetch128 = align8(target.Prefetch128, int3);
      var prefetch256 = align8(target.Prefetch256, int3);

      var data = prefetch128.Concat(prefetch256)
                            .Concat(cpuid)
                            .ToArray();
      var offset = new
      {
        Prefetch128 = 0,
        Prefetch256 = prefetch128.Length,
        CPUID = prefetch128.Length + prefetch256.Length,
      };

      try
      {
        region_ = NativeMethods.VirtualAlloc(
            IntPtr.Zero, (UIntPtr)data.Length, VirtualAllocType.MEM_COMMIT,
            MemoryProtectionType.PAGE_READWRITE);
        if (region_.IsInvalid)
        {
          return;
        }

        var addr = region_.DangerousGetHandle();
        Marshal.Copy(data, 0, addr, data.Length);
        var oldType = default(MemoryProtectionType);
        var succeeded = NativeMethods.VirtualProtect(
            addr, (UIntPtr)data.Length, MemoryProtectionType.PAGE_EXECUTE, out oldType);
        if (!succeeded)
        {
          GlobalDispose();
          return;
        }

        // GetCurrentProcess returns a pseudo handle.
        // You need not to free a pseudo handle by ClodeHandle.
        var pseudoHandle = NativeMethods.GetCurrentProcess();
        succeeded = NativeMethods.FlushInstructionCache(pseudoHandle, addr, (uint)data.Length);
        if (!succeeded)
        {
          GlobalDispose();
          return;
        }

        prefetch128_ = CreateDelegate<PrefetcherDelegate>(addr, offset.Prefetch128);
        prefetch256_ = CreateDelegate<PrefetcherDelegate>(addr, offset.Prefetch256);
        cpuid_ = CreateDelegate<CPUIDDelegate>(addr, offset.CPUID);
      }
      catch
      {
        GlobalDispose();
        throw;
      }
    }
    public static CPUInfo CPUID(uint type)
    {
      return CPUID(type, 0);
    }
    public static CPUInfo CPUID(uint type, uint sub_type)
    {
      var info = default(CPUInfo);
      info.eax = type;
      info.ecx = sub_type;
      cpuid_(ref info);
      return info;
    }
    public static void Prefetch128(void* address)
    {
      prefetch128_(address);
    }
    public static void Prefetch256(void* address)
    {
      prefetch256_(address);
    }
    // This method is not thread-safe.
    public static void GlobalDispose()
    {
      cpuid_ = DummyCPUID;
      prefetch128_ = DummyPrefetcher;
      prefetch256_ = DummyPrefetcher;
      if (region_ != null && !region_.IsInvalid) { region_.Dispose(); }
    }
  }
}

static class Program
{
  static unsafe void Main(string[] args)
  {
    var info = Win32.Prefetcher.CPUID(0);
    if (info.eax < 1)
    {
      return;
    }

    info = Win32.Prefetcher.CPUID(1);
    var HasMMX = (info.edx & (1 << 23)) != 0;
    var HasSSE = (info.edx & (1 << 25)) != 0;
    var HasSSE2 = (info.edx & (1 << 26)) != 0;
    var HasSSE3 = (info.ecx & (1 << 0)) != 0;

    var buffer = new byte[1024];
    fixed (byte* ptr = buffer)
    {
      Win32.Prefetcher.Prefetch128(ptr);
      Win32.Prefetcher.Prefetch256(ptr);
    }
    Win32.Prefetcher.GlobalDispose();
  }
}

*1:[http://msdn.microsoft.com/en-us/library/aa366599.aspx:title=HeapCreate] + HEAP_CREATE_ENABLE_EXECUTE でも良かったのですが,最終的に書き込み可能属性を落としたかったので今回は VirtualAlloc + VirtualProtect を使いました