GenerationData.py
  import numpy as np
  import pandas as pd
  import statistics
  from distfit import distfit#Used by detection() and generation2()
  import csv
  from scipy.special import expit
  
  class Generator:
      def __init__(self, base):
          self.base=base
  
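      #Logistic sigmoid: maps a real value to the (0, 1) interval (equivalent to scipy.special.expit)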
      def logit_Normal(self, x):
          return 1/(1+np.exp(-x))
  
      #Generate a dataset with grade and time for 15 questions at each of 5 complexity levels
      def generationDatasetComplexities(self):
          tbase=pd.DataFrame()
          #Initial parameters of the grade and time distributions for the easiest complexity level
          sigma_grade=0.5
          mu_grade=1.5
          sigma_time=1.7
          mu_time=30
          #5 complexity levels
          for rows in range(5):
              #15 questions per complexity level
              for ncomp in range(15):
                  #Simulate mistakes (low grades) in the first 10 questions of each complexity level
                  if ncomp < 10:
                      cgrade2=self.logit_Normal(np.random.normal(-1, 0.2, 700))
                  else:
                      cgrade2=self.logit_Normal(np.random.normal(mu_grade, sigma_grade, 700))
  
                  #300 more grade samples; together with the 700 above, each question has 1000 grade and 1000 time values
                  cgrade=self.logit_Normal(np.random.normal(mu_grade, sigma_grade, 300))
                  cgrade=np.append(cgrade, cgrade2)
                  cgrade=cgrade*10#Scale grades from (0, 1) to the 0-10 range
                  ctime=np.random.normal(mu_time, sigma_time, 1000)
  
                  #One grade column and one time column per question
                  tbase[len(tbase.columns)]=cgrade
                  tbase[len(tbase.columns)]=ctime
              #Next complexity level is harder: lower mean grade, larger spread
              mu_grade-=0.2
              sigma_grade+=0.08
          tbase.to_csv("data.csv", sep=" ")
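          #The resulting data.csv has 1000 rows and 150 columns (5 complexity levels x 15 questions x grade/time)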
          
      #Generate a dataset with the mean grade and mean time over 15 questions for each of 10 sub-competences
      def generationDatasetMeanSubCompetences(self):
          tbase=[]
          #Number of rows to generate
          for rows in range(1000):
              sigma_grade=1.7
              mu_grade=5
              sigma_time=1.7
              mu_time=30
              tlist=[]
              #Number of sub-competences
              for ncomp in range(10):
                  vgrade=[]
                  vtime=[]
                  #Number of questions (grade, time)
                  for i in range(15):
                      cgrade=np.random.normal(mu_grade, sigma_grade, 1)[0]
                      vgrade.append(cgrade)
                      ctime=np.random.normal(mu_time, sigma_time, 1)[0]
                      vtime.append(ctime)
                  nmu_grade=np.mean(vgrade)
                  nmu_time=np.mean(vtime)
                  result = [nmu_grade, nmu_time]
                  tlist=tlist + result
                  #Drift the means for the next sub-competence and tighten the spread around them
                  mu_grade=np.random.normal(nmu_grade, 0.5, 1)[0]
                  mu_time=np.random.normal(nmu_time, 0.5, 1)[0]
                  sigma_grade=(abs(mu_grade-nmu_grade))*0.4
                  sigma_time=(abs(mu_time-nmu_time))*0.4
                  #print(tlist)
              tbase.append(tlist)
          #print(tbase)
          #Write the csv file
          with open("dataMean.csv", "w", newline="") as f:
              writer=csv.writer(f)
              writer.writerows(tbase)
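          #dataMean.csv ends up with 1000 rows and 20 columns (10 sub-competences x mean grade/mean time)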
  
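      #Generation by interpolation: each synthetic row is a convex combination of two sampled rows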
      def generation(self):
          vlambda = 0.5
          lbase=self.base
          #print(lbase)
          for i in range(100):
              element1=lbase.sample()
              element1=vlambda*np.array(element1)
              element2=lbase.sample()
              element2=(1.0-vlambda)*np.array(element2)
              #print(element1)
              #print(element2)
              #print(element1[0]+element2[0])
              elementN=pd.DataFrame(element1+element2, columns=lbase.columns)
              #print(elementN)

              #Concatenate self.base and elementN
              self.base=pd.concat([self.base, elementN], ignore_index=True)
          return self.base
          
          
      #Generation with white noise: append sampled rows perturbed by Gaussian noise
      def generation3(self):
          mu, sigma = 0, 0.1
          for i in range(1000):
              element=self.base.sample()
              noise=np.random.normal(mu, sigma, [1, element.shape[1]])
              nbase=[self.base, element+noise]
              self.base=pd.concat(nbase)
          return self.base
  
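      #Fit candidate probability distributions to the data with distfit and print the ranked summary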
      def detection(self, data):
          dfit=distfit()
          dfit.fit_transform(data)
          print(dfit.summary)
          
  
      #Generation with detection of distribution for each column
      def generation2(self):
          dfit=distfit()
          lbase=np.array(self.base)
          newData=[]
          for vindex in range(lbase.shape[1]):
              #print("Column: ",lbase[:,vindex])
              #Fit a distribution to the column and draw one value from it
              dfit.fit_transform(lbase[:,vindex])
              sigma=dfit.model['scale']
              nrand=dfit.generate(1)[0]
              newData.append(nrand)
              #Keep only rows close to the drawn value, so later columns are conditioned on earlier ones
              lbase=lbase[(lbase[:,vindex] < (nrand + sigma)) & (lbase[:,vindex] > (nrand - sigma))]
          print(newData)
          self.base.loc[len(self.base.index)]=newData
          print(self.base.corr())
  
      #Generation with normal distribution
      def generation0(self):
          lbase=self.base
          print(lbase.corr())
          #print(lbase[lbase[20].values==0].corr())
          #print(lbase[lbase[20].values==0].iloc[1:100,:].corr())
          for n in range(100):
              newData=[]
              #Restart from the current base for each new synthetic row
              lbase=self.base
              #One value per column; the base is assumed to have 21 columns
              for vindex in range(21):
                  #mu=statistics.median(self.base[vindex])
                  mu=statistics.mean(lbase[vindex])
                  sigma=statistics.stdev(lbase[vindex])
                  nrand=np.random.normal(mu, sigma, 1)[0]
                  #print(mu, " ", sigma, nrand)
                  #print(self.base.head())
                  lbase=lbase[(lbase[vindex].values < (nrand + (sigma/100.0))) & (lbase[vindex].values > (nrand - (sigma/100.0)))]
                  newData.append(nrand)
                  #print(lbase)
              #print(newData)
              self.base.loc[len(self.base.index)]=newData
          print(self.base.corr())
  
  
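  #The dataset generators below do not use self.base, so an empty base is sufficient here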
  g=Generator([])
  #g.detection(data)
  g.generationDatasetComplexities()
  #g.generationDatasetMeanSubCompetences()
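  
  #Example usage of the augmentation methods (a sketch, not part of the original script;
  #it assumes the file written by generationDatasetComplexities() is reloaded as the base):
  #base=pd.read_csv("data.csv", sep=" ", index_col=0)
  #g2=Generator(base)
  #augmented=g2.generation3()#append 1000 noisy copies of sampled rows
  #Note: generation0() assumes a base with exactly 21 columns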