Many cryptographic systems depend upon the difficulty in factorization of a large composite number into it's primes. There exists no polynomial time algorithm for this problem. Is it possible to solve this problem using statistical learning?
For the purposes of illustration, consider this simple example: 7 x 23 = 161
The task is to factorize 161. It is evident that one of the primes will be less than sqrt(161) and the other greater. Could an algorithm be developed that would, for any number in the range 1 to sqrt(161), give the probability of the true prime factor being above it. In this example, if 5 is given, the algorithm will return a probability close to 1 (indicating the prime factor is higher than 5); if 13 is given, it will return a probability close to 0.
What follows is an attempt to train a simple Random Forest Classifier using a dataset created from the first 50 million primes (found here). The data is available here. The Makefile should rebuild the prime number list.
The necessary imports:
import random
import math
import gzip
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.metrics import roc_auc_score
Functions for loading the prime numbers and making a training set:
# Load the list of prime numbers from file (available at the address above)
def load_prime_numbers():
PrimeNumberList = gzip.open('primes.txt.gz', 'rb').read().splitlines()
PrimeNumberList = [long(p) for p in PrimeNumberList]
return PrimeNumberList
def build_primes_and_product(PrimeNumberList, SampleSize = None):
#builds a test set of size SampleSize from the PrimeNumberList
#Returns a list of [[prime_number_1, primt_number_2, product]...]
PrimesAndProduct = []
if SampleSize == None:
SampleSize = 100000
for i in range(SampleSize):
a = PrimeNumberList[random.randint(0, (SampleSize-1))]
b = PrimeNumberList[random.randint(0, (SampleSize-1))]
if a > b:
a, b = b, a
PrimesAndProduct.append([a, b, a*b])
return PrimesAndProduct
def build_test_data_set(PrimesAndProduct, TestSetSize = None):
#For each product and smaller of the primes,
#return (product, random number less/greater than smaller of the primes,
# 0 for smaller or 1 for greater)
if TestSetSize == None:
TestSetSize = 1000000
TrainingSet = []
for i in PrimesAndProduct:
prime1, prime2, product = i
SqrtOfProduct = long(product**0.5)
FlipCoin = random.randint(0,1)
if FlipCoin == 0:
ChosenNumber = long(random.randrange(1, prime1))
target = 0
else:
ChosenNumber = long(random.randrange(prime1-1, SqrtOfProduct))
target = 1
TrainingSet.append([product,
ChosenNumber,
target,
prime1])
return TrainingSet
def extract_features(Sample):
#make the training data
#take a row of data from PrimesAndProduct,
#return a list of features and target
FeatureList = []
for i in Sample:
product, chosen, target, prime1 = i
product_split = list(str(product).zfill(20))
product_split.extend(list(str(chosen).zfill(20)))
product_split = [int(h) for h in product_split]
product_split.extend([target, prime1, chosen])
FeatureList.append(product_split)
return FeatureList
Functions to fit various classifiers and return the fit model
def learn_cross_validation(features, target):
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
clf = RandomForestClassifier(n_estimators = 400, n_jobs=20)
scores = cross_validation.cross_val_score(clf, features, target, cv=5)
print scores
def learn_RF(features, target):
from sklearn.ensemble import RandomForestClassifier
clr = RandomForestClassifier(n_estimators = 400, n_jobs=20)
clr.fit(features, target)
return clr
def learn_LR(features, target):
from sklearn.linear_model import LogisticRegression
clr = LogisticRegression()
clr.fit_transform(features, target)
return clr
Make a test set with evenly spaced out numbers between 1 and sqrt(composite) along with target
def meaningful_test(product, prime1, TestSetSize):
if TestSetSize == None:
TestSetSize = 10000
TrainingSet = []
for i in range(1, TestSetSize):
SqrtOfProduct = long(product**0.5) * 1.2
ChosenNumber = long(SqrtOfProduct * i) / long(TestSetSize)
if ChosenNumber < prime1:
target = 0
else:
target = 1
TrainingSet.append([product,
ChosenNumber,
target,
prime1])
return TrainingSet
Build training set
p = load_prime_numbers()
pnp = build_primes_and_product(p, 100000)
r=build_test_data_set(pnp)
test = pd.DataFrame(extract_features(r))
Train a model
model = learn_RF(test.loc[:,0:39], test.loc[:,40])
#model = learn_LR(test.loc[:,0:39], test.loc[:,40])
ROC_AUC of the training set - has trained (very ?) well
h = model.predict(test.loc[:,0:39])
print roc_auc_score(test.loc[:,40], h)
pd.crosstab(h, test.loc[:,40])
ROC_AUC on a new test set
new_test_data = pd.DataFrame(extract_features(build_test_data_set((build_primes_and_product(p, 10000)))))
h = model.predict(new_test_data.loc[:,0:39])
print roc_auc_score(new_test_data.loc[:,40], h)
pd.crosstab(h, new_test_data.loc[:,40])
ROC_AUC on a meaningful test set. The graph displays the predicted transition in probability in relation to the prime factor
rand_prime1, rand_prime2, rand_product = pnp[random.randint(0, len(pnp))]
meaningful_test_data = pd.DataFrame(extract_features(meaningful_test(rand_product, rand_prime1, 10000)))
h = model.predict(meaningful_test_data.loc[:,0:39])
print rand_product, rand_prime1
print roc_auc_score(meaningful_test_data.loc[:,40], h)
plt.scatter(meaningful_test_data[42], h)
plt.scatter(rand_prime1, 0.5, c='r')
pd.crosstab(h, meaningful_test_data.loc[:,40])
Graphical representation:
No comments:
Post a Comment