mod/modfast.hpp

View this file on GitHub
Last update: 2025-09-01 16:03:58+09:00
Include: #include "mod/modfast.hpp"

Depends on

Verified with

test/1_mytest/modfast.test.cpp

Code

#include "mod/primitive_root.hpp"
#include "nt/lpf_table.hpp"
#include "ds/hashmap.hpp"

// Table-driven modular power, discrete logarithm and inverse modulo the
// compile-time constant p. All logarithms are taken with respect to `root`.
// NOTE(review): the builders look like they assume p is a prime larger than K
// (e.g. pow_r_32(p - 1 - S) in build_log) - confirm before using a small p.
// LOG and INV each hold 2 * K + 1 u32 entries (about 16 MB apiece), so an
// instance is presumably meant to live in static storage, not on the stack.
template <int p>
struct ModFast {
  static_assert(p < (1 << 30));

  // Base of the logarithm tables (3 for p = 998244353, otherwise searched).
  u32 root;
  // POW[0][i] = root^i and POW[1][i] = root^(i << 15), filled for i <= 1 << 15.
  array<u32, 65537> POW[2];
  // FRAC[x >> 10] = (a, b): fraction used to map x to the small value x*b - a*p.
  array<pair<u16, u16>, 1 + (1 << 20)> FRAC;

  static constexpr int K = 1 << 21;
  // LOG[K + t] / INV[K + t] hold the log / inverse of t for t in [-K, K], t != 0.
  array<u32, 2 * K + 1> LOG;
  array<u32, 2 * K + 1> INV;

  // Picks the primitive root and fills every table.
  ModFast() {
    root = (p == 998244353 ? 3 : primitive_root(p));
    build_pow();
    build_inv();
    build_log();
    build_frac();
  }

  // Fills INV for 1..K by the recurrence inv(i) = q * inv(i*q - p) with
  // q = ceil(p / i), then mirrors it to the negative side.
  void build_inv() {
    INV[K + 1] = 1;
    for (u32 i = 2; i <= K; ++i) {
      u64 q = (p + i - 1) / i;
      INV[K + i] = INV[K + i * q - p] * u64(q) % p;
    }
    FOR(i, 1, K + 1) INV[K - i] = p - INV[K + i];
  }

  // a^exp mod p for 0 <= a < p and 0 <= exp < 2^30, with 0^0 defined as 1.
  u32 pow(u32 a, ll exp) {
    assert(0 <= a && a < p && 0 <= exp && exp < (1 << 30));
    if (a == 0) return (exp == 0 ? 1 : 0);
    return pow_r_32(log_r(a) * exp % (p - 1));
  }

  // root^exp mod p for an exponent already reduced to [0, p - 1].
  u32 pow_r_32(u32 exp) {
    assert(0 <= exp && exp <= p - 1);
    return u64(POW[0][exp & 32767]) * POW[1][exp >> 15] % p;
  }
  // root^exp mod p for any (possibly negative) exponent.
  u32 pow_r(ll exp) {
    exp %= p - 1;
    if (exp < 0) exp += p - 1;
    return u64(POW[0][exp & 32767]) * POW[1][exp >> 15] % p;
  }

  // Discrete logarithm of x to base root; the result is NOT reduced and lies
  // in [0, 2p-2).
  u32 log_r(u32 x) {
    assert(1 <= x && x < p);
    auto [a, b] = FRAC[x >> 10];
    // a and b are promoted to int, so a * p used to overflow a signed int
    // (undefined behaviour). Casting p keeps the whole expression in u32,
    // where it wraps modulo 2^32; a "negative" t then indexes below K.
    u32 t = x * b - a * u32(p);
    return LOG[K + t] + (p - 1) - LOG[K + b];
  }

  // Modular inverse of x (1 <= x < p), computed as inv(t) * b with t = x*b - a*p.
  u32 inverse(u32 x) {
    assert(1 <= x && x < p);
    auto [a, b] = FRAC[x >> 10];
    // Same unsigned computation as in log_r (avoids signed overflow).
    u32 t = x * b - a * u32(p);
    return INV[K + t] * u64(b) % p;
  }

  // Copies LOG entries for arguments 0..n (n <= K) into a vector of T.
  // The entry for argument 0 is never written by the builders.
  template <typename T>
  vc<T> get_log_table(int n) {
    assert(n <= K);
    return {LOG.begin() + K, LOG.begin() + K + n + 1};
  }

 private:
  // Baby-step / giant-step power tables with a step of 2^15.
  void build_pow() {
    POW[0][0] = POW[1][0] = 1;
    FOR(i, (1 << 15)) POW[0][i + 1] = POW[0][i] * u64(root) % p;
    FOR(i, (1 << 15)) POW[1][i + 1] = POW[1][i] * u64(POW[0][1 << 15]) % p;
  }

  // 0.085 sec.
  // Fills LOG for 1..K: composites from their least prime factor, primes
  // below 100 by BSGS, primes with i*i > p from p = i*j + k, and the remaining
  // primes by randomised reduction to already-known smaller values.
  void build_log() {
    const int LIM = 1 << 21;
    auto lpf = lpf_table(LIM);

    const int S = 1 << 17;
    HashMap<u32> MP(S);
    u32 pw = 1;
    // Baby steps: root^k -> k for k < S.
    for (int k = 0; k < S; ++k, pw = u64(root) * pw % p) {
      MP[pw] = k;
    }
    // Giant step multiplier root^(-S).
    u32 q = pow_r_32(p - 1 - S);
    auto BSGS = [&](u32 s) -> u32 {
      u32 ans = 0;
      while (1) {
        u32 v = MP.get(s, -1);
        if (v != u32(-1)) {
          return ans + v;
        }
        ans += S, s = u64(s) * q % p;
      }
      return 0;
    };

    LOG[K + 1] = 0;
    FOR(i, 2, 1 + (1 << 21)) {
      if (lpf[i] < i) {
        // Composite: log is additive over the factorisation.
        LOG[K + i] = (LOG[K + lpf[i]] + LOG[K + i / lpf[i]]) % (p - 1);
        continue;
      }
      if (i < 100) {
        LOG[K + i] = BSGS(i);
        continue;
      }
      if (i * i > p) {
        auto [j, k] = divmod<int>(p, i);
        // i = (-k)/j
        LOG[K + i] =
            (LOG[K + k] + (p - 1) / 2 + (p - 1) - LOG[K + j]) % (p - 1);
        continue;
      }
      // Try random k until i * root^k factors completely over values < i.
      while (1) {
        u32 k = RNG(0, p - 1);
        u64 ans = p - 1 - k;
        u32 x = u64(i) * pow_r_32(k) % p;
        auto div = [&](u32 q) -> void { x /= q, ans += LOG[K + q]; };
        for (u32 q : {2, 3, 5, 7, 11, 13, 17, 19}) {
          while (x % q == 0) div(q);
        }
        if (x >= LIM) continue;
        while (i < x && x < LIM && lpf[x] < i) div(lpf[x]);
        if (1 < x && x < i) div(x);
        if (x == 1) {
          LOG[K + i] = ans % (p - 1);
          break;
        }
      }
    }
    // log(-i) = log(i) + (p - 1) / 2.
    FOR(i, 1, 1 + (1 << 21)) {
      LOG[K - i] = (LOG[K + i] + (p - 1) / 2) % (p - 1);
    }
  }

  // Fills FRAC by walking pairs of neighbouring fractions (a/b, c/d), splitting
  // at the mediant while b + d < 2048; each leaf covers the buckets between
  // a*p/(1024*b) and c*p/(1024*d).
  void build_frac() {
    vc<tuple<u16, u16, u16, u16>> que;
    que.eb(0, 1, 1, 1);
    while (len(que)) {
      auto [a, b, c, d] = POP(que);
      if (b + d < 2048) {
        que.eb(a + c, b + d, c, d), que.eb(a, b, a + c, b + d);
        continue;
      }
      u32 s = (u64(a) * p) / (1024 * b);
      u32 t = (u64(c) * p) / (1024 * d);
      FRAC[s] = {a, b}, FRAC[t] = {c, d};
      a = min(a, c), b = min(b, d);
      FOR(i, s + 1, t) FRAC[i] = {a, b};
    }
  }
};

#line 2 "mod/primitive_root.hpp"

#line 2 "nt/factor.hpp"

#line 2 "random/base.hpp"

// 64-bit xorshift-style generator. The state is seeded once, on first call,
// from the high-resolution clock multiplied by a fixed odd constant.
u64 RNG_64() {
  static u64 state =
      u64(chrono::duration_cast<chrono::nanoseconds>(
              chrono::high_resolution_clock::now().time_since_epoch())
              .count()) *
      10150724397891781847ULL;
  state ^= state << 7;
  state ^= state >> 9;
  return state;
}

// Random value in [0, lim), obtained by reducing RNG_64() modulo lim.
u64 RNG(u64 lim) {
  const u64 raw = RNG_64();
  return raw % lim;
}

// Random value in the half-open range [l, r).
ll RNG(ll l, ll r) {
  const u64 offset = RNG_64() % (r - l);
  return l + offset;
}
#line 2 "mod/mongomery_modint.hpp"

// Modular integer for an odd modulus that is set at run time.
// Stores rx instead of x.
template <int id, typename U1, typename U2>
struct Mongomery_modint {
  using mint = Mongomery_modint;
  inline static U1 m, r, n2;
  static constexpr int W = numeric_limits<U1>::digits;

  // Installs the modulus shared by all values of this instantiation.
  // The modulus must be odd and at most 2^(W-2).
  static void set_mod(U1 mod) {
    assert(mod & 1 && mod <= U1(1) << (W - 2));
    m = mod;
    n2 = -U2(m) % m;
    r = m;
    // Five refinement steps, then negate so that r * m == -1 (checked below).
    for (int step = 0; step < 5; ++step) r *= 2 - m * r;
    r = -r;
    assert(r * m == U1(-1));
  }

  // Reduction step: returns (b + k * m) >> W with k = low_W_bits(b) * r.
  static U1 reduce(U2 b) {
    const U2 k = U2(U1(b) * r);
    return (b + k * m) >> W;
  }

  U1 x;
  Mongomery_modint() : x(0) {}
  Mongomery_modint(U1 v) : x(reduce(U2(v) * n2)) {}

  // Converts back to the ordinary representative in [0, m).
  U1 val() const {
    const U1 y = reduce(x);
    if (y >= m) return y - m;
    return y;
  }
  mint &operator+=(mint y) {
    x += y.x;
    if (x >= m) x -= m;
    return *this;
  }
  mint &operator-=(mint y) {
    if (x >= y.x) {
      x -= y.x;
    } else {
      x -= y.x - m;
    }
    return *this;
  }
  mint &operator*=(mint y) {
    x = reduce(U2(x) * y.x);
    return *this;
  }
  mint operator+(mint y) const {
    mint res(*this);
    res += y;
    return res;
  }
  mint operator-(mint y) const {
    mint res(*this);
    res -= y;
    return res;
  }
  mint operator*(mint y) const {
    mint res(*this);
    res *= y;
    return res;
  }
  // Stored values may be m or larger, so both sides are normalised first.
  bool operator==(mint y) const {
    const U1 lhs = (x >= m ? x - m : x);
    const U1 rhs = (y.x >= m ? y.x - m : y.x);
    return lhs == rhs;
  }
  bool operator!=(mint y) const { return not operator==(y); }
  // Binary exponentiation; n must be non-negative.
  mint pow(ll n) const {
    assert(n >= 0);
    mint result = 1, base = *this;
    while (n) {
      if (n & 1) result *= base;
      base *= base;
      n >>= 1;
    }
    return result;
  }
};

template <int id>
using Mongomery_modint_32 = Mongomery_modint<id, u32, u64>;
template <int id>
using Mongomery_modint_64 = Mongomery_modint<id, u64, u128>;
#line 3 "nt/primetest.hpp"

bool primetest(const u64 x) {
  assert(x < u64(1) << 62);
  if (x == 2 or x == 3 or x == 5 or x == 7) return true;
  if (x % 2 == 0 or x % 3 == 0 or x % 5 == 0 or x % 7 == 0) return false;
  if (x < 121) return x > 1;
  const u64 d = (x - 1) >> lowbit(x - 1);

  using mint = Mongomery_modint_64<202311020>;

  mint::set_mod(x);
  const mint one(u64(1)), minus_one(x - 1);
  auto ok = [&](u64 a) -> bool {
    auto y = mint(a).pow(d);
    u64 t = d;
    while (y != one && y != minus_one && t != x - 1) y *= y, t <<= 1;
    if (y != minus_one && t % 2 == 0) return false;
    return true;
  };
  if (x < (u64(1) << 32)) {
    for (u64 a: {2, 7, 61})
      if (!ok(a)) return false;
  } else {
    for (u64 a: {2, 325, 9375, 28178, 450775, 9780504, 1795265022}) {
      if (!ok(a)) return false;
    }
  }
  return true;
}
#line 5 "nt/factor.hpp"

// Searches for a non-trivial divisor of n by iterating v -> v*v + c modulo n
// (the modulus must already be installed in `mint`). Differences are
// multiplied together in batches so that one gcd covers many steps.
// The returned value is never 1; it may still equal n.
template <typename mint>
ll rho(ll n, ll c) {
  assert(n > 1);
  const mint shift(c);
  auto advance = [&](mint v) { return v * v + shift; };
  mint anchor = 1, cur = 2, saved = 1, prod = 1;
  ll g = 1;
  const ll batch = 1LL << (__lg(n) / 5);
  ll r = 1;
  while (g == 1) {
    anchor = cur;
    for (ll i = 0; i < r; ++i) cur = advance(cur);
    ll k = 0;
    while (k < r && g == 1) {
      saved = cur;  // start of this batch, used by the fallback below
      const ll cnt = min(batch, r - k);
      for (ll i = 0; i < cnt; ++i) {
        cur = advance(cur);
        prod *= anchor - cur;
      }
      g = gcd(prod.val(), n);
      k += batch;
    }
    r <<= 1;
  }
  // The batched product collapsed to a multiple of n: replay the last batch
  // one step at a time.
  if (g == n) {
    do {
      saved = advance(saved);
      g = gcd((anchor - saved).val(), n);
    } while (g == 1);
  }
  return g;
}

// Returns one prime factor of n (n > 1). Up to 100 rounds of rho() with a
// random constant are tried; a composite divisor replaces n for the next round.
ll find_prime_factor(ll n) {
  assert(n > 1);
  if (primetest(n)) return n;
  for (int attempt = 0; attempt < 100; ++attempt) {
    ll divisor = 0;
    if (n < (1 << 30)) {
      using mint32 = Mongomery_modint_32<20231025>;
      mint32::set_mod(n);
      divisor = rho<mint32>(n, RNG(0, n));
    } else {
      using mint64 = Mongomery_modint_64<20231025>;
      mint64::set_mod(n);
      divisor = rho<mint64>(n, RNG(0, n));
    }
    if (primetest(divisor)) return divisor;
    n = divisor;
  }
  assert(0);
  return -1;
}

// Prime factorisation of n as (prime, exponent) pairs; the result is sorted.
vc<pair<ll, int>> factor(ll n) {
  assert(n >= 1);
  vc<pair<ll, int>> pf;
  // Divides q out of n completely and records the multiplicity.
  // q is expected to divide n when this is called.
  auto peel = [&](ll q) {
    ll e = 0;
    do {
      n /= q;
      ++e;
    } while (n % q == 0);
    pf.eb(q, e);
  };
  // Trial division by every d < 100 while d * d <= n.
  for (ll d = 2; d < 100 && d * d <= n; ++d) {
    if (n % d == 0) peel(d);
  }
  // Whatever is left is split with find_prime_factor.
  while (n > 1) peel(find_prime_factor(n));
  sort(all(pf));
  return pf;
}

// Factorises n by repeatedly reading lpf[n] and dividing that value out.
// lpf must be indexable up to n.
vc<pair<ll, int>> factor_by_lpf(ll n, vc<int>& lpf) {
  vc<pair<ll, int>> out;
  while (n > 1) {
    const int prime = lpf[n];
    int mult = 0;
    for (; n % prime == 0; n /= prime) ++mult;
    out.eb(prime, mult);
  }
  return out;
}
#line 2 "mod/mod_pow.hpp"

#line 2 "mod/barrett.hpp"

// https://github.com/atcoder/ac-library/blob/master/atcoder/internal_math.hpp
// Division / remainder by a fixed 32-bit modulus using the precomputed
// multiplier im = floor((2^64 - 1) / m) + 1.
struct Barrett {
  u32 m;
  u64 im;
  explicit Barrett(u32 m = 1) : m(m), im(u64(-1) / m + 1) {}
  u32 umod() const { return m; }
  // z mod m.
  u32 modulo(u64 z) {
    if (m == 1) return 0;
    const u64 q_est = (u64)(((unsigned __int128)(z)*im) >> 64);
    const u64 back = q_est * m;
    // The estimate can be one too large; compensate by adding m.
    if (z < back) return z - back + m;
    return z - back;
  }
  // floor(z / m).
  u64 floor(u64 z) {
    if (m == 1) return z;
    const u64 q_est = (u64)(((unsigned __int128)(z)*im) >> 64);
    const u64 back = q_est * m;
    if (z < back) return q_est - 1;
    return q_est;
  }
  // (floor(z / m), z mod m).
  pair<u64, u32> divmod(u64 z) {
    if (m == 1) return {z, 0};
    const u64 q_est = (u64)(((unsigned __int128)(z)*im) >> 64);
    const u64 back = q_est * m;
    return z < back ? pair<u64, u32>(q_est - 1, z - back + m)
                    : pair<u64, u32>(q_est, z - back);
  }
  // a * b mod m.
  u32 mul(u32 a, u32 b) {
    const u64 product = u64(a) * b;
    return modulo(product);
  }
};

// 64-bit counterpart of Barrett: the multiplier is kept as two 64-bit halves
// (mh, ml) and the high part of the 128x128 product is assembled by hand.
struct Barrett_64 {
  u128 mod, mh, ml;

  explicit Barrett_64(u64 mod = 1) : mod(mod) {
    u128 mult = u128(-1) / mod;
    // Round up when mod divides 2^128 exactly.
    if (mult * mod + mod == u128(0)) ++mult;
    mh = mult >> 64;
    ml = mult & u64(-1);
  }

  u64 umod() const { return mod; }

  // x mod `mod`.
  u64 modulo(u128 x) {
    const u128 lo = x & u64(-1);
    const u128 hi = x >> 64;
    const u128 carry = (lo * ml) >> 64;
    const u128 middle = lo * mh + hi * ml + carry;
    const u128 quot = hi * mh + (middle >> 64);
    x -= quot * mod;
    if (x < mod) return x;
    return x - mod;
  }

  // a * b mod `mod`.
  u64 mul(u64 a, u64 b) {
    const u128 product = u128(a) * b;
    return modulo(product);
  }
};
#line 5 "mod/mod_pow.hpp"

// a^n mod `mod` for n >= 0; a may be negative. Odd moduli below 2^30 go
// through Mongomery_modint_32, everything else through Barrett.
u32 mod_pow(int a, ll n, int mod) {
  assert(n >= 0);
  if (mod == 1) return 0;
  a %= mod;
  if (a < 0) a += mod;
  const bool small_odd = (mod & 1) && (mod < (1 << 30));
  if (small_odd) {
    using mint = Mongomery_modint_32<202311021>;
    mint::set_mod(mod);
    return mint(a).pow(n).val();
  }
  Barrett bt(mod);
  int acc = 1;
  for (; n; n >>= 1) {
    if (n & 1) acc = bt.mul(acc, a);
    a = bt.mul(a, a);
  }
  return acc;
}

// a^n mod `mod` for n >= 0; a may be negative. Odd moduli below 2^62 go
// through Mongomery_modint_64, everything else through Barrett_64.
u64 mod_pow_64(ll a, ll n, u64 mod) {
  assert(n >= 0);
  if (mod == 1) return 0;
  // Reduce a into [0, mod) explicitly. Writing `a %= mod` would convert a
  // negative a to u64 first (adding 2^64) and reduce the wrong number.
  u64 base;
  if (a >= 0) {
    base = u64(a) % mod;
  } else {
    // |a| mod `mod`, written so that it cannot overflow even for the most
    // negative ll value.
    const u64 mag = (u64(-(a + 1)) % mod + 1) % mod;
    base = (mag == 0 ? 0 : mod - mag);
  }
  if ((mod & 1) && (mod < (u64(1) << 62))) {
    using mint = Mongomery_modint_64<202311021>;
    mint::set_mod(mod);
    return mint(base).pow(n).val();
  }
  Barrett_64 bt(mod);
  u64 r = 1;
  while (n) {
    if (n & 1) r = bt.mul(r, base);
    base = bt.mul(base, base), n >>= 1;
  }
  return r;
}
#line 6 "mod/primitive_root.hpp"

// int

// Draws random candidates from [1, p) until one has g^((p-1)/q) != 1 (mod p)
// for every prime factor q of p - 1, and returns it.
int primitive_root(int p) {
  auto prime_factors = factor(p - 1);
  auto generates = [&](int g) -> bool {
    for (auto& pe: prime_factors) {
      if (mod_pow(g, (p - 1) / pe.first, p) == 1) return false;
    }
    return true;
  };
  for (;;) {
    int cand = RNG(1, p);
    if (generates(cand)) return cand;
  }
  return -1;
}

// 64-bit variant of primitive_root: random candidates from [1, p) are checked
// with mod_pow_64 against every prime factor of p - 1.
ll primitive_root_64(ll p) {
  auto prime_factors = factor(p - 1);
  auto generates = [&](ll g) -> bool {
    for (auto& pe: prime_factors) {
      if (mod_pow_64(g, (p - 1) / pe.first, p) == 1) return false;
    }
    return true;
  };
  for (;;) {
    ll cand = RNG(1, p);
    if (generates(cand)) return cand;
  }
  return -1;
}

// https://codeforces.com/contest/1190/problem/F

// Starts from primitive_root_64(p) and, for each further power of p up to
// p^e, adds modulus / p to g whenever g^(totient / p) is 1 modulo the
// current power. Requires p >= 3.
ll primitive_root_prime_power_64(ll p, ll e) {
  assert(p >= 3);
  ll g = primitive_root_64(p);
  ll modulus = p;
  ll totient = p - 1;
  for (ll k = 1; k < e; ++k) {
    modulus *= p;
    totient *= p;
    const bool collapses = (mod_pow_64(g, totient / p, modulus) == 1);
    if (collapses) g += modulus / p;
  }
  return g;
}
#line 2 "nt/primetable.hpp"

// Returns every prime <= LIM in increasing order.
// Odd numbers are sieved in blocks of S entries (index j stands for 2j + 1).
// The table is cached in function-local statics (one set per T) and rebuilt
// only when a limit larger than any earlier one is requested.
template <typename T = int>
vc<T> primetable(int LIM) {
  ++LIM;  // from here on LIM is an exclusive upper bound
  const int S = 32768;
  static int done = 2;
  static vc<T> primes = {2}, sieve(S + 1);

  if (done < LIM) {
    done = LIM;

    primes = {2}, sieve.assign(S + 1, 0);
    const int R = LIM / 2;
    primes.reserve(int(LIM / log(LIM) * 1.1));
    // (prime, next half-index to strike) for every odd prime <= S.
    vc<pair<int, int>> cp;
    for (int i = 3; i <= S; i += 2) {
      if (!sieve[i]) {
        cp.eb(i, i * i / 2);
        for (int j = i * i; j <= S; j += 2 * i) sieve[j] = 1;
      }
    }
    for (int L = 1; L <= R; L += S) {
      array<bool, S> block{};
      for (auto& [p, idx]: cp)
        for (int i = idx; i < S + L; idx = (i += p)) block[i - L] = 1;
      FOR(i, min(S, R - L)) if (!block[i]) primes.eb((L + i) * 2 + 1);
    }
  }
  // Keep only primes below the exclusive bound. Searching for LIM + 1 here
  // let the prime equal to the bound itself through whenever the cached
  // table already contained it (e.g. a first call with limit 1 returned {2}).
  int k = LB(primes, LIM);
  return {primes.begin(), primes.begin() + k};
}
#line 3 "nt/lpf_table.hpp"

// Least-prime-factor table over [0, LIM]; entries 0 and 1 hold -1.
vc<int> lpf_table(ll LIM) {
  auto primes = primetable(LIM);
  vc<int> table(LIM + 1, -1);
  // Visit primes from largest to smallest so the smallest one writes last.
  for (auto it = primes.rbegin(); it != primes.rend(); ++it) {
    const auto p = *it;
    for (ll multiple = p; multiple <= LIM; multiple += p) table[multiple] = p;
  }
  return table;
}
#line 2 "ds/hashmap.hpp"

// u64 -> Val

template <typename Val>
struct HashMap {
  // n は入れたいものの個数で ok

  HashMap(u32 n = 0) { build(n); }
  void build(u32 n) {
    u32 k = 8;
    while (k < n * 2) k *= 2;
    cap = k / 2, mask = k - 1;
    key.resize(k), val.resize(k), used.assign(k, 0);
  }

  // size を保ったまま. size=0 にするときは build すること.

  void clear() {
    used.assign(len(used), 0);
    cap = (mask + 1) / 2;
  }
  int size() { return len(used) / 2 - cap; }

  int index(const u64& k) {
    int i = 0;
    for (i = hash(k); used[i] && key[i] != k; i = (i + 1) & mask) {}
    return i;
  }

  Val& operator[](const u64& k) {
    if (cap == 0) extend();
    int i = index(k);
    if (!used[i]) { used[i] = 1, key[i] = k, val[i] = Val{}, --cap; }
    return val[i];
  }

  Val get(const u64& k, Val default_value) {
    int i = index(k);
    return (used[i] ? val[i] : default_value);
  }

  bool count(const u64& k) {
    int i = index(k);
    return used[i] && key[i] == k;
  }

  // f(key, val)

  template <typename F>
  void enumerate_all(F f) {
    FOR(i, len(used)) if (used[i]) f(key[i], val[i]);
  }

private:
  u32 cap, mask;
  vc<u64> key;
  vc<Val> val;
  vc<bool> used;

  u64 hash(u64 x) {
    static const u64 FIXED_RANDOM = std::chrono::steady_clock::now().time_since_epoch().count();
    x += FIXED_RANDOM;
    x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9;
    x = (x ^ (x >> 27)) * 0x94d049bb133111eb;
    return (x ^ (x >> 31)) & mask;
  }

  void extend() {
    vc<pair<u64, Val>> dat;
    dat.reserve(len(used) / 2 - cap);
    FOR(i, len(used)) {
      if (used[i]) dat.eb(key[i], val[i]);
    }
    build(2 * len(dat));
    for (auto& [a, b]: dat) (*this)[a] = b;
  }
};
#line 4 "mod/modfast.hpp"

// Table-driven modular power, discrete logarithm and inverse modulo the
// compile-time constant p. All logarithms are taken with respect to `root`.
// NOTE(review): the builders look like they assume p is a prime larger than K
// (e.g. pow_r_32(p - 1 - S) in build_log) - confirm before using a small p.
// LOG and INV each hold 2 * K + 1 u32 entries (about 16 MB apiece), so an
// instance is presumably meant to live in static storage, not on the stack.
template <int p>
struct ModFast {
  static_assert(p < (1 << 30));

  // Base of the logarithm tables (3 for p = 998244353, otherwise searched).
  u32 root;
  // POW[0][i] = root^i and POW[1][i] = root^(i << 15), filled for i <= 1 << 15.
  array<u32, 65537> POW[2];
  // FRAC[x >> 10] = (a, b): fraction used to map x to the small value x*b - a*p.
  array<pair<u16, u16>, 1 + (1 << 20)> FRAC;

  static constexpr int K = 1 << 21;
  // LOG[K + t] / INV[K + t] hold the log / inverse of t for t in [-K, K], t != 0.
  array<u32, 2 * K + 1> LOG;
  array<u32, 2 * K + 1> INV;

  // Picks the primitive root and fills every table.
  ModFast() {
    root = (p == 998244353 ? 3 : primitive_root(p));
    build_pow();
    build_inv();
    build_log();
    build_frac();
  }

  // Fills INV for 1..K by the recurrence inv(i) = q * inv(i*q - p) with
  // q = ceil(p / i), then mirrors it to the negative side.
  void build_inv() {
    INV[K + 1] = 1;
    for (u32 i = 2; i <= K; ++i) {
      u64 q = (p + i - 1) / i;
      INV[K + i] = INV[K + i * q - p] * u64(q) % p;
    }
    FOR(i, 1, K + 1) INV[K - i] = p - INV[K + i];
  }

  // a^exp mod p for 0 <= a < p and 0 <= exp < 2^30, with 0^0 defined as 1.
  u32 pow(u32 a, ll exp) {
    assert(0 <= a && a < p && 0 <= exp && exp < (1 << 30));
    if (a == 0) return (exp == 0 ? 1 : 0);
    return pow_r_32(log_r(a) * exp % (p - 1));
  }

  // root^exp mod p for an exponent already reduced to [0, p - 1].
  u32 pow_r_32(u32 exp) {
    assert(0 <= exp && exp <= p - 1);
    return u64(POW[0][exp & 32767]) * POW[1][exp >> 15] % p;
  }
  // root^exp mod p for any (possibly negative) exponent.
  u32 pow_r(ll exp) {
    exp %= p - 1;
    if (exp < 0) exp += p - 1;
    return u64(POW[0][exp & 32767]) * POW[1][exp >> 15] % p;
  }

  // Discrete logarithm of x to base root; the result is NOT reduced and lies
  // in [0, 2p-2).
  u32 log_r(u32 x) {
    assert(1 <= x && x < p);
    auto [a, b] = FRAC[x >> 10];
    // a and b are promoted to int, so a * p used to overflow a signed int
    // (undefined behaviour). Casting p keeps the whole expression in u32,
    // where it wraps modulo 2^32; a "negative" t then indexes below K.
    u32 t = x * b - a * u32(p);
    return LOG[K + t] + (p - 1) - LOG[K + b];
  }

  // Modular inverse of x (1 <= x < p), computed as inv(t) * b with t = x*b - a*p.
  u32 inverse(u32 x) {
    assert(1 <= x && x < p);
    auto [a, b] = FRAC[x >> 10];
    // Same unsigned computation as in log_r (avoids signed overflow).
    u32 t = x * b - a * u32(p);
    return INV[K + t] * u64(b) % p;
  }

  // Copies LOG entries for arguments 0..n (n <= K) into a vector of T.
  // The entry for argument 0 is never written by the builders.
  template <typename T>
  vc<T> get_log_table(int n) {
    assert(n <= K);
    return {LOG.begin() + K, LOG.begin() + K + n + 1};
  }

 private:
  // Baby-step / giant-step power tables with a step of 2^15.
  void build_pow() {
    POW[0][0] = POW[1][0] = 1;
    FOR(i, (1 << 15)) POW[0][i + 1] = POW[0][i] * u64(root) % p;
    FOR(i, (1 << 15)) POW[1][i + 1] = POW[1][i] * u64(POW[0][1 << 15]) % p;
  }

  // 0.085 sec.
  // Fills LOG for 1..K: composites from their least prime factor, primes
  // below 100 by BSGS, primes with i*i > p from p = i*j + k, and the remaining
  // primes by randomised reduction to already-known smaller values.
  void build_log() {
    const int LIM = 1 << 21;
    auto lpf = lpf_table(LIM);

    const int S = 1 << 17;
    HashMap<u32> MP(S);
    u32 pw = 1;
    // Baby steps: root^k -> k for k < S.
    for (int k = 0; k < S; ++k, pw = u64(root) * pw % p) {
      MP[pw] = k;
    }
    // Giant step multiplier root^(-S).
    u32 q = pow_r_32(p - 1 - S);
    auto BSGS = [&](u32 s) -> u32 {
      u32 ans = 0;
      while (1) {
        u32 v = MP.get(s, -1);
        if (v != u32(-1)) {
          return ans + v;
        }
        ans += S, s = u64(s) * q % p;
      }
      return 0;
    };

    LOG[K + 1] = 0;
    FOR(i, 2, 1 + (1 << 21)) {
      if (lpf[i] < i) {
        // Composite: log is additive over the factorisation.
        LOG[K + i] = (LOG[K + lpf[i]] + LOG[K + i / lpf[i]]) % (p - 1);
        continue;
      }
      if (i < 100) {
        LOG[K + i] = BSGS(i);
        continue;
      }
      if (i * i > p) {
        auto [j, k] = divmod<int>(p, i);
        // i = (-k)/j
        LOG[K + i] =
            (LOG[K + k] + (p - 1) / 2 + (p - 1) - LOG[K + j]) % (p - 1);
        continue;
      }
      // Try random k until i * root^k factors completely over values < i.
      while (1) {
        u32 k = RNG(0, p - 1);
        u64 ans = p - 1 - k;
        u32 x = u64(i) * pow_r_32(k) % p;
        auto div = [&](u32 q) -> void { x /= q, ans += LOG[K + q]; };
        for (u32 q : {2, 3, 5, 7, 11, 13, 17, 19}) {
          while (x % q == 0) div(q);
        }
        if (x >= LIM) continue;
        while (i < x && x < LIM && lpf[x] < i) div(lpf[x]);
        if (1 < x && x < i) div(x);
        if (x == 1) {
          LOG[K + i] = ans % (p - 1);
          break;
        }
      }
    }
    // log(-i) = log(i) + (p - 1) / 2.
    FOR(i, 1, 1 + (1 << 21)) {
      LOG[K - i] = (LOG[K + i] + (p - 1) / 2) % (p - 1);
    }
  }

  // Fills FRAC by walking pairs of neighbouring fractions (a/b, c/d), splitting
  // at the mediant while b + d < 2048; each leaf covers the buckets between
  // a*p/(1024*b) and c*p/(1024*d).
  void build_frac() {
    vc<tuple<u16, u16, u16, u16>> que;
    que.eb(0, 1, 1, 1);
    while (len(que)) {
      auto [a, b, c, d] = POP(que);
      if (b + d < 2048) {
        que.eb(a + c, b + d, c, d), que.eb(a, b, a + c, b + d);
        continue;
      }
      u32 s = (u64(a) * p) / (1024 * b);
      u32 t = (u64(c) * p) / (1024 * d);
      FRAC[s] = {a, b}, FRAC[t] = {c, d};
      a = min(a, c), b = min(b, d);
      FOR(i, s + 1, t) FRAC[i] = {a, b};
    }
  }
};