/**
 * Prime Counting - https://primecounting.com
 * Copyright (c) 2025 Nathan McKenzie
 * GitHub: https://github.com/NathanMcKenzie
 * 
 * This file is part of the Prime Counting website.
 * 
 * Permission is hereby granted to use, copy, modify, and/or distribute this code
 * for any purpose with or without fee, provided that the above copyright notice
 * and this permission notice appear in all copies or substantial portions of the code.
 * 

 * Version: 1.0.0
 * Created: July 2025
 */

 /*
    The goal of this file is to show a single-threaded, mildly optimized C++ implementation of a prime counting function that
    can compute pi(10^14) in around 2 or 3 seconds in under 64,000 bytes of RAM.
    To help confirm the memory claims in particular, the program uses Windows functions to keep track of
        the memory the algorithm uses.
 */

#include "stdio.h"
#include "stdlib.h"
#include "math.h"
#include "time.h"

#include <windows.h>
#include <psapi.h>

// Small positive bias added to floating-point results before they are truncated to integers.
const double EPSILON = 0.00000000001;

// Wheel geometry. 30030 = 2*3*5*7*11*13, and 5760 residues per period are divisible by none of those six primes.
const int wheelCyclePeriod = 30030;
const int wheelFirstPrime = 17;
const int wheelCycleEntries = 5760;
const int wheelDiscaredPrimes = 6;

// Lookup tables; both are populated by MakeWheel().
unsigned short wheelTranslation[wheelCyclePeriod];  // indexed by a residue in [0, wheelCyclePeriod)
unsigned char wheelOffsets[wheelCycleEntries];      // indexed by a wheel slot in [0, wheelCycleEntries)
/*
    Fills the two global wheel tables. Must be called once before any other wheel function is used.

    A "wheel number" here is 1, or an odd number not divisible by 3, 5, 7, 11 or 13.
        wheelOffsets[e]     - distance from the e-th wheel number of a period (1, 17, 19, 23, ...) to the next one
        wheelTranslation[i] - how many wheel numbers of the first period are strictly less than i,
                              for 0 <= i < wheelCyclePeriod

    The original loop also stored wheelTranslation[wheelCyclePeriod], one element past the end of the
    array; that store is no longer performed (no reader ever indexes that slot).
*/
void MakeWheel() {
    // Only the first wheelDiscaredPrimes entries are consulted; 2 is skipped because only odd i are visited.
    const int primes[] = { 2,3,5,7,11,13,17,19,23,29,31,37 };
    int entry = 0;      // next wheelOffsets slot to fill
    int total = 1;      // wheel numbers seen so far (the number 1 is counted up front)
    int previous = 1;   // most recent wheel number

    wheelTranslation[0] = wheelTranslation[1] = 0;
    wheelTranslation[2] = 1;
    for (int i = 3; i < wheelCyclePeriod; i += 2) {
        bool inUse = true;
        for (int j = 1; j < wheelDiscaredPrimes && inUse; j++) if (!(i % primes[j])) inUse = false;

        wheelTranslation[i] = total;
        if (inUse) {
            // Gap from the previous wheel number to this one belongs to the previous number's slot.
            wheelOffsets[entry++] = (unsigned char)(i - previous);
            previous = i;
            total++;
        }
        // The even neighbour is never a wheel number; it just inherits the running count.
        if (i + 1 < wheelCyclePeriod) wheelTranslation[i + 1] = total;
    }
    // Last slot: step from wheelCyclePeriod - 1 across the period boundary to wheelCyclePeriod + 1.
    wheelOffsets[entry] = 2;
}
// Moves a wheel slot index (passed by reference) forward by one, wrapping to slot 0 after the last slot.
inline void IncrementWheelEntry(int& offset) {
    offset = (offset + 1 < wheelCycleEntries) ? offset + 1 : 0;
}
// Helper for InversePower: returns nonzero when base^y > x, using only overflow-free integer steps.
// Requires base >= 1 and x >= 1.
static inline int InversePowerOvershoots(long long base, long y, long long x) {
    long long acc = 1;
    for (long e = 0; e < y; e++) {
        if (acc > x / base) return 1;   // acc * base would already exceed x
        acc *= base;
    }
    return 0;
}

/*
    Returns floor(x^(1/y)), the largest integer r with r^y <= x.

    x, y    - expected to be >= 1; any other input yields 0.

    pow() supplies a first estimate. When that estimate is comfortably between two integers its
    truncation is trusted; when it sits within 0.001 of an integer the result is confirmed with
    exact integer arithmetic, because the rounding of 1.0/y and of pow() itself can put an exact
    root (a perfect cube, say) marginally below the true value and truncate to one less.
*/
inline long long InversePower(long long x, long y) {
    if (x < 1 || y < 1) return 0;
    if (y == 1) return x;

    const double estimate = pow((double)x, 1.0 / (double)y);
    long long root = (long long)estimate;
    const double fraction = estimate - (double)root;
    if (root >= 1 && fraction > 1e-3 && fraction < 1.0 - 1e-3) return root;

    // Near an integer boundary: settle the answer exactly.
    if (root < 1) root = 1;
    while (root > 1 && InversePowerOvershoots(root, y, x)) root--;
    while (!InversePowerOvershoots(root + 1, y, x)) root++;
    return root;
}
// Looks up wheelTranslation for n, reducing n modulo the wheel period only when it lies beyond the first period.
inline int NumToWheelEntry(long long n) {
    if (n >= wheelCyclePeriod) n %= wheelCyclePeriod;
    return wheelTranslation[n];
}
/*
    Counts the wheel numbers in the inclusive range [rangeStart, rangeEnd].

    The count is the number of whole wheel periods between the two (period-aligned) endpoints times
    wheelCycleEntries, corrected by the wheelTranslation values of each endpoint's residue.
    The fast path handles a range that lies entirely inside the first period.
*/
inline long long wheelCount(long long rangeStart, long long rangeEnd) {
    const long long endExclusive = rangeEnd + 1;
    if (endExclusive < wheelCyclePeriod) {
        return wheelTranslation[endExclusive] - wheelTranslation[rangeStart];
    }

    const int endResidue = (int)(endExclusive % wheelCyclePeriod);
    long long wholeCycleSpan = endExclusive - endResidue;   // a multiple of wheelCyclePeriod

    long long startIndex = rangeStart;
    if (rangeStart >= wheelCyclePeriod) {
        // Align the start to its period boundary as well and remove those periods from the span.
        startIndex = rangeStart % wheelCyclePeriod;
        wholeCycleSpan -= rangeStart - startIndex;
    }

    return wholeCycleSpan / wheelCyclePeriod * wheelCycleEntries
        + wheelTranslation[endResidue] - wheelTranslation[startIndex];
}
/*
    Base case (k == 2) of the D_k recursion.

    n   - upper bound for this level
    a   - current wheel number; the loop visits the wheel numbers after it, up to floor(sqrt(n))
    p   - weight carried down from the caller
    r   - divisor applied to p for the terms that involve a itself

    The result is a fixed part built from wheelCount(a, n / a), wheelCount(a, sqrt(n)) and the
    weights p / r, p / 2 and p / (r * (r + 1)), plus p times the sum of wheelCount(a', n / a') - 1
    over every later wheel number a' <= sqrt(n).

    The square root is kept in a long long; the original stored it in an int, which cannot hold
    the root once n exceeds roughly 4.6e18.
*/
long long D_2(long long n, long long a, long long p, int r ) {
    const long long nroot = (long long)(sqrt((double)n) + EPSILON);
    const long long t = (wheelCount(a, n / a) - 1) * (p / r) + (wheelCount(a, nroot) - 1) * (p / 2) + p / (r * (r + 1));

    int wheelEntry = NumToWheelEntry(a);
    long long u = 0;
    for (;;) {
        // Advance to the next wheel number before testing, so the incoming a is not counted here.
        a += wheelOffsets[wheelEntry];
        IncrementWheelEntry(wheelEntry);
        if (a > nroot) break;
        u += wheelCount(a, n / a) - 1;
    }

    return t + u * p;
}
/*
    Recursive step of the weighted k-factor count; k == 2 is delegated to D_2.

    n   - upper bound for this level
    a   - current wheel number
    p   - weight carried down the recursion
    r   - divisor applied to p when the same a is reused at the next level
    k   - number of factors still to place (callers pass k >= 2)

    The first recursive call reuses a (passing p / r and r + 1). The loop then steps through the
    later wheel numbers up to floor(n^(1/k)), restarting each with r = 2 and the undivided p.
    The loop bound depends only on n and k, so it is computed once rather than on every test.
*/
long long D_k(long long n, long long a, long long p, int r, int k ) {
    if (k == 2) return D_2(n, a, p, r);

    const long long largestFirstFactor = InversePower(n, k);

    long long t = D_k(n / a, a, p / r, r + 1, k - 1);

    int wheelEntry = NumToWheelEntry(a);
    a += wheelOffsets[wheelEntry];
    IncrementWheelEntry(wheelEntry);

    while (a <= largestFirstFactor) {
        t += D_k(n / a, a, p, 2, k - 1);
        a += wheelOffsets[wheelEntry];
        IncrementWheelEntry(wheelEntry);
    }
    return t;
}
/*
    Entry point for the k-factor count over wheel numbers starting at wheelFirstPrime.

    k == 1  - plain count of wheel numbers in [wheelFirstPrime, n]
    k == 2  - D_2 with weight 2 and r = 1
    k >= 3  - D_k with weight k! and r = 1 (the table below covers k up to 20)
*/
long long D(long long n, int k) {
    switch (k) {
    case 1:
        return wheelCount(wheelFirstPrime, n);
    case 2:
        return D_2(n, wheelFirstPrime, 2, 1);
    default:
        break;
    }

    // factorial[i] == i! for 1 <= i <= 20; slot 0 is a placeholder.
    const long long factorial[] = {
        0, 1, 2, 6, 24, 120, 720, 5040, 40320, 362880, 3628800, 39916800, 479001600,
        6227020800, 87178291200, 1307674368000, 20922789888000, 355687428096000,
        6402373705728000, 121645100408832000, 2432902008176640000
    };
    const long long startingWeight = factorial[k];
    return D_k(n, wheelFirstPrime, startingWeight, 1, k);
}

long long countPrimes(long long n) {
    int mu[] = { 0, 1, -1, -1, 0, -1, 1, -1, 0, 0, 1, -1, 0, -1, 1, 1, 0, -1, 0, -1, 0, 1, 1, -1, 0, 0, 1, 0, 0 };
    long long totals[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

    for (int j = 1; j < (int)(log((double)n + EPSILON) / log((double)wheelFirstPrime + EPSILON) + EPSILON) + 1; j++) {
        if (!mu[j])continue;
        for (int k = 1; k < (int)(log((double)n + EPSILON) / j / log((double)wheelFirstPrime + EPSILON) + EPSILON) + 1; k++) {
            totals[j * k] += D(InversePower(n, j), k) * pow(-1.0, k + 1) * mu[j];
        }
    }
    if (n < wheelFirstPrime) {
        static const int lookup[] = { 0,0,1,2,2,3,3,4,4,4,4,5,5,6 };
        return lookup[n];
    }
    double t = wheelDiscaredPrimes;
    for( int i = 1; i < 32; i++ )t += totals[i]/i;
    return (long long)(t + 0.5);
}

// Prints str after a fixed run of leading spaces, so it lines up with the right-most column of the results table.
void printSpaces(const char* str) {
    static const char indentedFormat[] = "                                                                    %s";
    printf(indentedFormat, str);
}

int main(){
    const int scaleNum = 10;
    int oldClock = (int)clock(), lastDif = 0;

    // Warm up Functions - ensure certain library functions are loaded prior to GetProcessMemoryInfo so that they don't count towards memory budget
    log(3.4); pow(3.3, 2); sqrt(4.4); clock(); printf(""); printf("%17I64d(10^%4.1f): ", 3.5, log((double)3.5) / log(10.0));

    PROCESS_MEMORY_COUNTERS pmc_before, pmc_after;

    GetProcessMemoryInfo(GetCurrentProcess(), &pmc_before, sizeof(pmc_before));
    MakeWheel();
    GetProcessMemoryInfo(GetCurrentProcess(), &pmc_after, sizeof(pmc_after));
    SIZE_T wheelMemory = pmc_after.WorkingSetSize - pmc_before.WorkingSetSize;

    // Warm up Functions 2 - ensure certain library functions are loaded prior to GetProcessMemoryInfo so that they don't count towards memory budget
    for (long long i = scaleNum; i <= 1000; i *= scaleNum) {
        long long total = countPrimes(i);
        printf(" %20lld", total);
    }
    system("cls");

    printSpaces("Time\n");
    printSpaces("Increase\n");
    printSpaces("");
    printf("for x%d\n", scaleNum);
    printf("         __ Input Number __   __ Output Number __ _ MSec _ _ Sec _  Input\n\n");

    oldClock = (int)clock(); lastDif = 0;
    
    GetProcessMemoryInfo(GetCurrentProcess(), &pmc_before, sizeof(pmc_before));
    for (long long i = scaleNum; i <= 100000000000000; i *= scaleNum) {
        printf("%17I64d(10^%4.1f): ", i, log((double)i) / log(10.0));
        long long total = countPrimes(i);
        int newClock = (int)clock();
        printf(" %20lld %8d : %4d: x%f\n", total, newClock - oldClock, (newClock - oldClock) / CLK_TCK,
            (lastDif) ? (double)(newClock - oldClock) / (double)lastDif : 0.0);
        lastDif = newClock - oldClock;
        oldClock = newClock;
    }
    GetProcessMemoryInfo(GetCurrentProcess(), &pmc_after, sizeof(pmc_after));

    printf("\nMemory for fixed size, input independent precomputed prime factorization wheels: %zu bytes\n",
        wheelMemory);
    printf("\nTotal memory footprint of algorithm during execution: %zu bytes\n",
        (pmc_after.WorkingSetSize - pmc_before.WorkingSetSize));
    printf("(Stack memory would be included in this number on 4k byte page boundaries, so 0 means 4k bytes was never passed)\n");

    return 0;
}