# GenerationData.py
import numpy as np
import pandas as pd
import statistics
import csv
from distfit import distfit#Needed by detection() and generation2()
from scipy.special import expit

class Generator:
    def __init__(self, base):
        self.base=base

    #Logistic sigmoid (inverse logit) mapping R to (0,1); expit is the
    #numerically stable form of 1/(1+exp(-x))
    def logit_Normal(self, x):
        return expit(x)

    #Generation of dataset with grade and time for 15 questions and 5 complexity levels
    def generationDatasetComplexities(self):
        tbase=pd.DataFrame()
        #Starting grade/time distribution parameters
        sigma_grade=0.5
        mu_grade=1.5
        sigma_time=1.7
        mu_time=30
        #Number of complexity levels
        for level in range(5):
            #Number of questions per level
            for ncomp in range(15):

                if ncomp < 10:#Simulate mistakes in the first 10 questions of each level
                    cgrade2=self.logit_Normal(np.random.normal(-1, 0.2, 700))
                else:
                    cgrade2=self.logit_Normal(np.random.normal(mu_grade, sigma_grade, 700))

                #Per question: 300 regular grade samples plus the 700 above,
                #scaled from (0,1) to (0,10), and 1000 answer-time samples
                cgrade=self.logit_Normal(np.random.normal(mu_grade, sigma_grade, 300))
                cgrade=np.append(cgrade, cgrade2)
                cgrade=cgrade*10
                ctime=np.random.normal(mu_time, sigma_time, 1000)
                tbase[len(tbase.columns)]=cgrade
                tbase[len(tbase.columns)]=ctime
            #Next complexity level: lower mean grade, higher grade variance
            mu_grade-=0.2
            sigma_grade+=0.08
        tbase.to_csv("data.csv", sep=" ")
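        #Layout note: data.csv ends up with 1000 rows and 150 columns
        #(5 complexity levels x 15 questions x (grade, time)), space-separated
        #with a pandas index column; it can be read back with
        #pd.read_csv("data.csv", sep=" ", index_col=0).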
        
    #Generation of dataset with mean of grade and mean of time for 15 questions and 10 sub-competences
    def generationDatasetMeanSubCompetences(self):
        tbase=[]
        #Number of rows to generate
        for rows in range(1000):
            sigma_grade=1.7
            mu_grade=5
            sigma_time=1.7
            mu_time=30
            tlist=[]
            #Number of sub-competences
            for ncomp in range(10):
                vgrade=[]
                vtime=[]
                #15 questions per sub-competence (grade, time)
                for i in range(15):
                    vgrade.append(np.random.normal(mu_grade, sigma_grade))
                    vtime.append(np.random.normal(mu_time, sigma_time))
                nmu_grade=np.mean(vgrade)
                nmu_time=np.mean(vtime)
                tlist=tlist + [nmu_grade, nmu_time]
                #Drift the next sub-competence's parameters around the means
                #observed for this one
                mu_grade=np.random.normal(nmu_grade, 0.5)
                mu_time=np.random.normal(nmu_time, 0.5)
                sigma_grade=abs(mu_grade-nmu_grade)*0.4
                sigma_time=abs(mu_time-nmu_time)*0.4
            tbase.append(tlist)
        #Write the csv file
        with open("dataMean.csv", "w", newline="") as f:
            writer=csv.writer(f)
            writer.writerows(tbase)
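        #Layout note: dataMean.csv ends up with 1000 rows and 20 columns
        #(10 sub-competences x (mean grade, mean time)), comma-separated,
        #with no header row.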

    #Generation by convex combination (mixup-style): each synthetic row is
    #vlambda*rowA + (1-vlambda)*rowB for two rows sampled from the base
    def generation(self):
        vlambda = 0.5
        lbase=self.base
        for i in range(100):
            element1=vlambda*np.array(lbase.sample())
            element2=(1.0-vlambda)*np.array(lbase.sample())
            #Concatenate self.base and the interpolated row elementN
            elementN=pd.DataFrame(element1+element2, columns=self.base.columns)
            self.base=pd.concat([self.base, elementN], ignore_index=True)
        return self.base
        
        
    #Generation with white noise: repeatedly sample a row and append a copy
    #perturbed by Gaussian noise (mu=0, sigma=0.1)
    def generation3(self):
        mu, sigma = 0, 0.1
        for i in range(1000):
            element=self.base.sample()
            noise=np.random.normal(mu, sigma, [1, element.shape[1]])
            self.base=pd.concat([self.base, element+noise])
        return self.base

    #Fit candidate distributions to the data and print the ranked summary
    def detection(self, data):
        dfit=distfit()
        dfit.fit_transform(data)
        print(dfit.summary)
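        #Illustrative use (sample data assumed, not part of this script):
        #   Generator([]).detection(np.random.normal(5, 2, 1000))
        #prints distfit's summary table ranking candidate distributions.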
        

    #Generation with detection of distribution for each column: sample each
    #column from its fitted distribution, then keep only rows near the sampled
    #value so later columns are drawn conditionally on earlier ones
    def generation2(self):
        dfit=distfit()
        lbase=np.array(self.base)
        newData=[]
        for vindex in range(lbase.shape[1]):
            dfit.fit_transform(lbase[:,vindex])
            sigma=dfit.model['scale']
            nrand=dfit.generate(1)[0]
            newData.append(nrand)
            #Narrow the pool to rows within one sigma of the sampled value
            lbase=lbase[(lbase[:,vindex] < (nrand + sigma)) & (lbase[:,vindex] > (nrand - sigma))]
        print(newData)
        self.base.loc[len(self.base.index)]=newData
        print(self.base.corr())
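        #Note: each call appends a single synthetic row and refits a
        #distribution per column, so repeated calls are slow but the
        #acceptance window preserves some inter-column structure.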

    #Generation with normal distribution per column, conditioned on previous
    #columns via a narrow +/- sigma/100 acceptance window
    def generation0(self):
        lbase=self.base
        print(lbase.corr())
        for n in range(100):
            newData=[]
            lbase=self.base
            for vindex in range(21):
                mu=statistics.mean(lbase[vindex])
                sigma=statistics.stdev(lbase[vindex])
                nrand=np.random.normal(mu, sigma)
                #Keep only rows close to the sampled value before moving on
                #to the next column
                lbase=lbase[(lbase[vindex].values < (nrand + (sigma/100.0))) & (lbase[vindex].values > (nrand - (sigma/100.0)))]
                newData.append(nrand)
            self.base.loc[len(self.base.index)]=newData
        print(self.base.corr())
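        #Caveat: generation0 assumes 21 integer-labelled columns (0..20); the
        #tight sigma/100 window can leave lbase empty, in which case
        #statistics.mean raises an error on the next column.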


if __name__ == "__main__":
    g=Generator([])
    #g.detection(data)
    g.generationDatasetComplexities()
    #g.generationDatasetMeanSubCompetences()
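
# A minimal augmentation sketch (an assumption: data.csv was produced by
# generationDatasetComplexities above; variable names here are illustrative):
#
#   base = pd.read_csv("data.csv", sep=" ", index_col=0)
#   augmented = Generator(base).generation3()#Appends 1000 noisy rows
#   print(augmented.shape)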