
Splitting one CSV into multiple files

Writer Andrew Henderson

I have a CSV file of about 5000 rows that I want to split into five files in Python.

I wrote code for it, but it is not working:

import codecs
import csv
NO_OF_LINES_PER_FILE = 1000
def again(count_file_header, count):
    f3 = open('write_' + count_file_header + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        co = 0
        for row in candidate_info_reader:
            co = co + 1
            count = count + 1
            if count <= count:
                pass
            elif count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

def read_write():
    f3 = open('write_' + NO_OF_LINES_PER_FILE + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        count = 0
        for row in candidate_info_reader:
            count = count + 1
            if count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

read_write()

The above code creates many files with empty content.

How do I split one file into five CSV files?

12 Answers

In Python

Use readlines() and writelines() to do that. Here is an example:

>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
...     if i % 1000 == 0:
...         open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
...         filename += 1

The output files will be named 1.csv, 2.csv, and so on.
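If the file is too large to load with readlines(), a streaming variant keeps only one chunk in memory at a time. A minimal sketch, assuming Python 3; the helper name split_streaming is mine, not part of the answer above:

from itertools import islice

def split_streaming(path, lines_per_file=1000):
    # islice pulls at most lines_per_file lines per pass, so the
    # whole file is never held in memory at once.
    with open(path, 'r') as src:
        filename = 1
        while True:
            chunk = list(islice(src, lines_per_file))
            if not chunk:
                break
            with open(str(filename) + '.csv', 'w') as dst:
                dst.writelines(chunk)
            filename += 1

split_streaming('import_1458922827.csv')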

From terminal

FYI, you can do this from the command line using split, as follows (by default the pieces are named xaa, xab, and so on):

$ split -l 1000 import_1458922827.csv

I suggest you don't reinvent the wheel; there is an existing solution. Source here

import os
def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
        output_path,
        output_name_template % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()  # Python 2; on Python 3, use next(reader)
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
                output_path,
                output_name_template % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)

Use it like:

split(open('/your/path/input.csv', 'r'))

A Python 3-friendly solution:

import csv
import os


def split_csv(source_filepath, dest_folder, split_file_prefix, records_per_file):
    """
    Split a source csv into multiple csvs of equal numbers of records,
    except the last file.

    Includes the initial header row in each split file.

    Split files follow a zero-index sequential naming convention like so:
        `{split_file_prefix}_0.csv`
    """
    if records_per_file <= 0:
        raise Exception('records_per_file must be > 0')

    with open(source_filepath, 'r') as source:
        reader = csv.reader(source)
        headers = next(reader)

        file_idx = 0
        records_exist = True

        while records_exist:
            i = 0
            target_filename = f'{split_file_prefix}_{file_idx}.csv'
            target_filepath = os.path.join(dest_folder, target_filename)

            with open(target_filepath, 'w') as target:
                writer = csv.writer(target)

                while i < records_per_file:
                    if i == 0:
                        writer.writerow(headers)

                    try:
                        writer.writerow(next(reader))
                        i += 1
                    except StopIteration:
                        records_exist = False
                        break

            if i == 0:
                # we only wrote the header, so delete that file
                os.remove(target_filepath)

            file_idx += 1
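A usage sketch for the function above; the file and folder names here are placeholders I chose, and the destination folder must already exist:

os.makedirs('out', exist_ok=True)  # the function does not create dest_folder itself
split_csv('data.csv', 'out', 'part', records_per_file=1000)  # writes out/part_0.csv, out/part_1.csv, ...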

I have modified the accepted answer a little bit to make it simpler.

Edited: Added the import statements and modified the print statement for printing the exception. @Alex F's code snippet was written for Python 2; for Python 3 you also need to use header_row = rows.__next__() instead of header_row = rows.next(). Thanks for pointing it out.

import os
import csv
def split_csv_into_chunks(file_location, out_dir, file_size=2):
    count = 0
    current_piece = 1

    # file_to_split_name.csv
    file_name = file_location.split("/")[-1].split(".")[0]
    split_file_name_template = file_name + "__%s.csv"
    splited_files_path = []

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    try:
        with open(file_location, "rb") as csv_file:
            rows = csv.reader(csv_file, delimiter=",")
            headers_row = rows.next()
            for row in rows:
                if count % file_size == 0:
                    current_out_path = os.path.join(out_dir,
                                                    split_file_name_template % str(current_piece))
                    current_out_writer = None
                    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=",")
                    current_out_writer.writerow(headers_row)
                    splited_files_path.append(current_out_path)
                    current_piece += 1

                current_out_writer.writerow(row)
                count += 1
        return True, splited_files_path
    except Exception as e:
        print("Exception occurred as {}".format(e))
        return False, splited_files_path
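A usage sketch with placeholder paths (note the snippet above is Python 2 as written; the call itself just returns a success flag and the list of files it created):

ok, paths = split_csv_into_chunks("data.csv", "chunks", file_size=1000)
if ok:
    print("Created: {}".format(paths))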

@Ryan, the Python 3 code worked for me. I used newline='' as below to avoid the blank-line issue (the csv module recommends opening files with newline='' so it can handle line endings itself):

with open(target_filepath, 'w', newline='') as target:

Another pandas solution (each 1000 rows), similar to Aziz Alto's solution:

suffix = 1
for i in range(len(df)):
    if i % 1000 == 0:
        df[i:i+1000].to_csv(f"processed/{filename}_{suffix}.csv", sep='|', index=False, index_label=False)
        suffix += 1

where df is the CSV loaded as a pandas.DataFrame, filename is the original filename, the pipe is the separator, and index=False with index_label=False skips the auto-incremented index column.
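If the CSV is too large to read into a single DataFrame, a similar effect is possible by streaming with read_csv's chunksize parameter; a minimal sketch with placeholder file names:

import pandas as pd

# chunksize=1000 makes read_csv yield one 1000-row DataFrame at a time
for suffix, chunk in enumerate(pd.read_csv("input.csv", chunksize=1000), start=1):
    chunk.to_csv(f"processed/input_{suffix}.csv", sep='|', index=False)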


A simple Python 3 solution with Pandas that doesn't cut off the last batch:

def to_csv_batch(src_csv, dst_dir, size=30000, index=False):
    import pandas as pd
    import math

    # Read source csv
    df = pd.read_csv(src_csv)

    # Initial values
    low = 0
    high = size

    # Loop through batches
    for i in range(math.ceil(len(df) / size)):
        fname = dst_dir + '/Batch_' + str(i + 1) + '.csv'
        df[low:high].to_csv(fname, index=index)

        # Update selection
        low = high
        if (high + size < len(df)):
            high = high + size
        else:
            high = len(df)

Usage example

to_csv_batch('Batch_All.csv', 'Batches')
if count <= count: pass

This condition is always true, so you pass every time.

Otherwise you can look at this post: Splitting a CSV file into equal parts?
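For reference, here is a minimal corrected version of the counting approach the question attempts; a sketch assuming Python 3, reusing the file names from the question:

import csv

NO_OF_LINES_PER_FILE = 1000

with open('import_1458922827.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    out, writer = None, None
    for count, row in enumerate(reader):
        if count % NO_OF_LINES_PER_FILE == 0:
            # start a new numbered output file every NO_OF_LINES_PER_FILE rows
            if out:
                out.close()
            out = open('write_{}.csv'.format(count // NO_OF_LINES_PER_FILE + 1), 'w', newline='')
            writer = csv.writer(out, delimiter=',', quoting=csv.QUOTE_ALL)
        writer.writerow(row)
    if out:
        out.close()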

I suggest you leverage the possibilities offered by pandas. Here are functions you could use to do that:

import logging
import math

import pandas as pd


def csv_count_rows(file):
    """
    Counts the number of rows in a file.
    :param file: path to the file.
    :return: number of lines in the designated file.
    """
    with open(file) as f:
        nb_lines = sum(1 for line in f)
    return nb_lines


def split_csv(file, sep=",", output_path=".", nrows=None, chunksize=None,
              low_memory=True, usecols=None):
    """
    Split a csv into several files.
    :param file: path to the original csv.
    :param sep: View pandas.read_csv doc.
    :param output_path: path in which to output the resulting parts of the splitting.
    :param nrows: Number of rows to split the original csv by, also view pandas.read_csv doc.
    :param chunksize: View pandas.read_csv doc.
    :param low_memory: View pandas.read_csv doc.
    :param usecols: View pandas.read_csv doc.
    """
    nb_of_rows = csv_count_rows(file)

    # Parsing file elements: path, name, extension, etc.
    # file_path = "/".join(file.split("/")[0:-1])
    file_name = file.split("/")[-1]
    # file_ext = file_name.split(".")[-1]
    file_name_trunk = file_name.split(".")[0]
    split_files_name_trunk = file_name_trunk + "_part_"

    # Number of chunks to partition the original file into
    nb_of_chunks = math.ceil(nb_of_rows / nrows)
    if nrows:
        log_debug_process_start = f"The file '{file_name}' contains {nb_of_rows} ROWS. " \
            f"\nIt will be split into {nb_of_chunks} chunks of a max number of rows: {nrows}." \
            f"\nThe resulting files will be output in '{output_path}' " \
            f"as '{split_files_name_trunk}0 to {nb_of_chunks - 1}'"
        logging.debug(log_debug_process_start)

    for i in range(nb_of_chunks):
        # Number of rows to skip is determined by (the number of the chunk being
        # processed) multiplied by (the nrows parameter).
        rows_to_skip = range(1, i * nrows) if i else None
        output_file = f"{output_path}/{split_files_name_trunk}{i}.csv"

        log_debug_chunk_processing = f"Processing chunk {i} of the file '{file_name}'"
        logging.debug(log_debug_chunk_processing)

        # Fetching the original csv file and handling it with skiprows and nrows
        # to process its data
        df_chunk = pd.read_csv(filepath_or_buffer=file, sep=sep, nrows=nrows,
                               skiprows=rows_to_skip, chunksize=chunksize,
                               low_memory=low_memory, usecols=usecols)
        df_chunk.to_csv(path_or_buf=output_file, sep=sep)

        log_info_file_output = f"Chunk {i} of file '{file_name}' created in '{output_file}'"
        logging.info(log_info_file_output)

And then in your main or Jupyter notebook you put:

# This is how you initiate logging in the most basic way.
logging.basicConfig(level=logging.DEBUG)
file = {#Path to your file}
split_csv(file, sep=";", output_path={#Path where you'd like to output it}, nrows=4000000, low_memory=False)

P.S. 1: I put nrows = 4000000 because it's a personal preference. You can change that number if you wish.

P.S. 2: I used the logging library to display messages. When you apply such a function to big files that live on a remote server, you really want to avoid 'simple printing' and incorporate logging capabilities. You can replace logging.info or logging.debug with print if you prefer.

P.S. 3: Of course, you need to replace the {# Blablabla} parts of the code with your own parameters.

A simpler script works for me:

import pandas as pd
path = "path to file" # path to file
df = pd.read_csv(path) # reading file
low = 0  # initial lower limit
high = 1000  # initial upper limit
part = 1
while low < len(df):  # 'low < len(df)' also catches the final partial chunk
    df_new = df[low:high]  # subsetting DataFrame based on index
    df_new.to_csv("Path to output file_" + str(part) + ".csv")  # numbered output file so chunks don't overwrite each other
    low = high  # changing lower limit
    high = high + 1000  # raising upper limit with an increment of 1000
    part += 1
import pandas as pd
df = pd.read_csv('input.csv')
file_len = len(df)
filename = 'output'
n = 1
for i in range(file_len):
    if i % 10 == 0:
        sf = df[i:i+10]
        sf.to_csv(f'{filename}_{n}.csv', index=False)
        n += 1

Building upon the top-voted answer, here is a Python solution that also includes the headers in each file.

file = open('file.csv', 'r')
header = file.readline()
csvfile = file.readlines()
filename = 1
batch_size = 1000
for i in range(len(csvfile)):
    if i % batch_size == 0:
        open(str(filename) + '.csv', 'w+').writelines(header)
        open(str(filename) + '.csv', 'a+').writelines(csvfile[i:i+batch_size])
        filename += 1

This will output the same file names as before: 1.csv, 2.csv, and so on.
