Spaces:
Runtime error
Runtime error
Terry Zhuo
committed on
Commit
·
b406d3a
1
Parent(s):
3f2364a
update
Browse files - azure_count_ip_data.py +47 -35
azure_count_ip_data.py
CHANGED
|
@@ -10,22 +10,26 @@ from typing import Dict, Set, Tuple, Optional
|
|
| 10 |
from log_reader import RemoteLogReader
|
| 11 |
|
| 12 |
# List of IP addresses we care about
|
| 13 |
-
|
| 14 |
-
"199.111.212.5",
|
| 15 |
-
"175.159.122.63",
|
| 16 |
-
"109.245.193.97",
|
| 17 |
-
"158.195.18.232",
|
| 18 |
-
"2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
|
| 19 |
-
"66.254.231.49",
|
| 20 |
-
"129.74.154.194",
|
| 21 |
-
"175.196.44.217",
|
| 22 |
-
"2601:600:8d00:9510:1d77:b610:9358:f443",
|
| 23 |
-
"74.90.222.68",
|
| 24 |
-
"
|
| 25 |
-
"
|
| 26 |
-
"
|
| 27 |
-
"
|
| 28 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
logging.basicConfig(level=logging.WARNING)
|
| 31 |
log = logging.getLogger(__name__)
|
|
@@ -108,16 +112,19 @@ def get_file_data(content: str) -> Tuple[Optional[str], bool]:
|
|
| 108 |
return None, False
|
| 109 |
|
| 110 |
def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
|
| 111 |
-
"""Count files per
|
| 112 |
# Convert start date string to datetime
|
| 113 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
| 114 |
-
|
| 115 |
|
| 116 |
try:
|
| 117 |
# Get current date for iteration
|
| 118 |
current_date = start_date
|
| 119 |
today = datetime.now()
|
| 120 |
|
|
|
|
|
|
|
|
|
|
| 121 |
while current_date <= today:
|
| 122 |
date_str = current_date.strftime("%Y_%m_%d")
|
| 123 |
|
|
@@ -132,8 +139,9 @@ def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02
|
|
| 132 |
# Convert messages to file content format
|
| 133 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
| 134 |
ip, vote_conditions_met = get_file_data(content)
|
| 135 |
-
if vote_conditions_met and ip:
|
| 136 |
-
|
|
|
|
| 137 |
|
| 138 |
except Exception as e:
|
| 139 |
log.error(f"Error processing logs for date {date_str}: {e}")
|
|
@@ -144,10 +152,10 @@ def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02
|
|
| 144 |
except Exception as e:
|
| 145 |
log.error(f"Error accessing logs: {e}")
|
| 146 |
|
| 147 |
-
return dict(
|
| 148 |
|
| 149 |
-
def
|
| 150 |
-
"""Download files and organize them by
|
| 151 |
|
| 152 |
Args:
|
| 153 |
reader: RemoteLogReader instance
|
|
@@ -158,6 +166,9 @@ def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_
|
|
| 158 |
data_dir = os.path.join(os.getcwd(), "data")
|
| 159 |
os.makedirs(data_dir, exist_ok=True)
|
| 160 |
|
|
|
|
|
|
|
|
|
|
| 161 |
# Convert start date string to datetime
|
| 162 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
| 163 |
|
|
@@ -186,11 +197,12 @@ def download_files_by_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_
|
|
| 186 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
| 187 |
ip = get_ip_from_jsonl(content)
|
| 188 |
|
| 189 |
-
if ip:
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
| 194 |
os.makedirs(valid_dir, exist_ok=True)
|
| 195 |
os.makedirs(invalid_dir, exist_ok=True)
|
| 196 |
|
|
@@ -226,21 +238,21 @@ def main():
|
|
| 226 |
reader = RemoteLogReader()
|
| 227 |
|
| 228 |
# Add argument parser for optional parameters
|
| 229 |
-
parser = argparse.ArgumentParser(description='Download and organize conversation files by
|
| 230 |
parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
|
| 231 |
parser.add_argument('--download', action='store_true', help='Enable file download')
|
| 232 |
args = parser.parse_args()
|
| 233 |
|
| 234 |
# Download files if enabled
|
| 235 |
if args.download:
|
| 236 |
-
print("\nDownloading files and organizing by
|
| 237 |
-
|
| 238 |
|
| 239 |
# Count and display statistics
|
| 240 |
-
|
| 241 |
-
print("\nFile counts per
|
| 242 |
-
for
|
| 243 |
-
print(f"
|
| 244 |
|
| 245 |
if __name__ == "__main__":
|
| 246 |
main()
|
|
|
|
| 10 |
from log_reader import RemoteLogReader
|
| 11 |
|
| 12 |
# List of IP addresses we care about
|
| 13 |
+
WHITELIST_IPS_DICT = {
|
| 14 |
+
"Chen Gong": ["199.111.212.5"],
|
| 15 |
+
"Juyong Jiang": ["175.159.122.63"],
|
| 16 |
+
"Kenneth Hamilton": ["109.245.193.97"],
|
| 17 |
+
"Marek Suppa": ["158.195.18.232"],
|
| 18 |
+
"Max Tian": ["2607:fea8:4f40:4b00:e5b9:9806:6b69:233b"],
|
| 19 |
+
"Mengzhao Jia": ["66.254.231.49"],
|
| 20 |
+
"Noah Ziems": ["129.74.154.194"],
|
| 21 |
+
"Sabina A": ["175.196.44.217"],
|
| 22 |
+
"Wenhao Yu": ["2601:600:8d00:9510:1d77:b610:9358:f443"],
|
| 23 |
+
"Vaisakhi Mishra": ["74.90.222.68"],
|
| 24 |
+
"Kumar Shridhar": ["129.132.145.250"],
|
| 25 |
+
"Viktor Gal": ["2a02:169:3e9:0:6ce8:e76f:faed:c830"],
|
| 26 |
+
"Guangyu Song": ["70.50.179.57"],
|
| 27 |
+
"Bhupesh Bishnoi": ["2a02:842a:24:5a01:8cd6:5b22:1189:6035"],
|
| 28 |
+
"Zheng Liu": ["2408:8418:6390:7603:40b:555f:774:a05d"]
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
# Flatten IP list for backward compatibility
|
| 32 |
+
WHITELIST_IPS = [ip for ips in WHITELIST_IPS_DICT.values() for ip in ips]
|
| 33 |
|
| 34 |
logging.basicConfig(level=logging.WARNING)
|
| 35 |
log = logging.getLogger(__name__)
|
|
|
|
| 112 |
return None, False
|
| 113 |
|
| 114 |
def count_files_per_ip(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18") -> Dict[str, int]:
|
| 115 |
+
"""Count files per name from the given start date"""
|
| 116 |
# Convert start date string to datetime
|
| 117 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
| 118 |
+
name_counts = defaultdict(int)
|
| 119 |
|
| 120 |
try:
|
| 121 |
# Get current date for iteration
|
| 122 |
current_date = start_date
|
| 123 |
today = datetime.now()
|
| 124 |
|
| 125 |
+
# Create reverse mapping of IP to name
|
| 126 |
+
ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
|
| 127 |
+
|
| 128 |
while current_date <= today:
|
| 129 |
date_str = current_date.strftime("%Y_%m_%d")
|
| 130 |
|
|
|
|
| 139 |
# Convert messages to file content format
|
| 140 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
| 141 |
ip, vote_conditions_met = get_file_data(content)
|
| 142 |
+
if vote_conditions_met and ip and ip in ip_to_name:
|
| 143 |
+
name = ip_to_name[ip]
|
| 144 |
+
name_counts[name] += 1
|
| 145 |
|
| 146 |
except Exception as e:
|
| 147 |
log.error(f"Error processing logs for date {date_str}: {e}")
|
|
|
|
| 152 |
except Exception as e:
|
| 153 |
log.error(f"Error accessing logs: {e}")
|
| 154 |
|
| 155 |
+
return dict(name_counts)
|
| 156 |
|
| 157 |
+
def download_files_by_name(reader: 'RemoteLogReader', start_date_str: str = "2025_02_18", check_sandbox: bool = True) -> None:
|
| 158 |
+
"""Download files and organize them by annotator name
|
| 159 |
|
| 160 |
Args:
|
| 161 |
reader: RemoteLogReader instance
|
|
|
|
| 166 |
data_dir = os.path.join(os.getcwd(), "data")
|
| 167 |
os.makedirs(data_dir, exist_ok=True)
|
| 168 |
|
| 169 |
+
# Create reverse mapping of IP to name
|
| 170 |
+
ip_to_name = {ip: name for name, ips in WHITELIST_IPS_DICT.items() for ip in ips}
|
| 171 |
+
|
| 172 |
# Convert start date string to datetime
|
| 173 |
start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
|
| 174 |
|
|
|
|
| 197 |
content = '\n'.join(json.dumps(msg) for msg in messages)
|
| 198 |
ip = get_ip_from_jsonl(content)
|
| 199 |
|
| 200 |
+
if ip and ip in ip_to_name:
|
| 201 |
+
name = ip_to_name[ip]
|
| 202 |
+
# Create directory structure for this name
|
| 203 |
+
name_dir = os.path.join(data_dir, name)
|
| 204 |
+
valid_dir = os.path.join(name_dir, "valid")
|
| 205 |
+
invalid_dir = os.path.join(name_dir, "invalid")
|
| 206 |
os.makedirs(valid_dir, exist_ok=True)
|
| 207 |
os.makedirs(invalid_dir, exist_ok=True)
|
| 208 |
|
|
|
|
| 238 |
reader = RemoteLogReader()
|
| 239 |
|
| 240 |
# Add argument parser for optional parameters
|
| 241 |
+
parser = argparse.ArgumentParser(description='Download and organize conversation files by annotator name')
|
| 242 |
parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
|
| 243 |
parser.add_argument('--download', action='store_true', help='Enable file download')
|
| 244 |
args = parser.parse_args()
|
| 245 |
|
| 246 |
# Download files if enabled
|
| 247 |
if args.download:
|
| 248 |
+
print("\nDownloading files and organizing by annotator name...")
|
| 249 |
+
download_files_by_name(reader, check_sandbox=args.sandbox_check)
|
| 250 |
|
| 251 |
# Count and display statistics
|
| 252 |
+
name_counts = count_files_per_ip(reader)
|
| 253 |
+
print("\nFile counts per annotator:")
|
| 254 |
+
for name, count in sorted(name_counts.items(), key=lambda x: x[1], reverse=True):
|
| 255 |
+
print(f"Name: {name:<20} Count: {count}")
|
| 256 |
|
| 257 |
if __name__ == "__main__":
|
| 258 |
main()
|